From 6309bcbc41cd824a38f63c10a12e3bafe7198c6a Mon Sep 17 00:00:00 2001
From: hooke007 <hooke007@qq.com>
Date: Fri, 26 May 2023 00:58:48 +0100
Subject: [PATCH] =?UTF-8?q?=E5=90=8C=E6=AD=A5=E4=B8=8E=E6=95=B4=E5=90=88?=
 =?UTF-8?q?=E4=B8=8A=E6=B8=B8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

核心：
同步 --slang --alang --subs-fallback --hwdec

脚本：
uosc 同步至 4.7.0+ ；撤回临时修复，使用上游方案代替

着色器：
nlmeans再次经历了重构，只采用原始仓库标准和hqx目录中的着色器并移除微调式（关联如下）的变体

- guided guided_fast
- guided_s guided_s_fast
- nlmeans nlmeans_sharpen_denoise nlmeans_sharpen_only
- nlmeans_temporal nlmeans_temporal_sharpen_denoise
---
 portable_config/mpv.conf                      |   15 +-
 portable_config/scripts/osc_plus.lua          |   39 +-
 portable_config/scripts/thumbfast.lua         |   11 +-
 .../scripts/uosc/elements/Elements.lua        |    2 +-
 .../scripts/uosc/elements/Menu.lua            |  192 +-
 .../scripts/uosc/elements/Timeline.lua        |    2 +-
 .../scripts/uosc/elements/TopBar.lua          |    8 +-
 portable_config/scripts/uosc/lib/std.lua      |   37 +-
 portable_config/scripts/uosc/lib/text.lua     |   15 +-
 portable_config/scripts/uosc/lib/utils.lua    |  101 +-
 portable_config/scripts/uosc/main.lua         |   40 +-
 portable_config/shaders/nlmeans.glsl          | 1821 +++++++---
 portable_config/shaders/nlmeans_2x.glsl       | 1247 -------
 portable_config/shaders/nlmeans_hqx.glsl      | 2947 ++++++++++++++---
 portable_config/shaders/nlmeans_lgc.glsl      | 1043 ------
 portable_config/shaders/nlmeans_lq.glsl       | 1086 ------
 portable_config/shaders/nlmeans_temporal.glsl | 1819 +++++++---
 portable_config/vs/SR_ESRGAN_DML.vpy          |    4 +-
 portable_config/vs/SR_ESRGAN_NV.vpy           |    4 +-
 19 files changed, 5616 insertions(+), 4817 deletions(-)
 delete mode 100644 portable_config/shaders/nlmeans_2x.glsl
 delete mode 100644 portable_config/shaders/nlmeans_lgc.glsl
 delete mode 100644 portable_config/shaders/nlmeans_lq.glsl

diff --git a/portable_config/mpv.conf b/portable_config/mpv.conf
index 85699d72..a7bd5eca 100644
--- a/portable_config/mpv.conf
+++ b/portable_config/mpv.conf
@@ -21,9 +21,9 @@
  d3d11-exclusive-fs = no             # [当 --gpu-api=d3d11 时] 全屏时独占，默认 no
  d3d11-flip         = yes            # （通常在 --d3d11-exclusive-fs=yes 和 --on-top 一起使用时）禁用它可避免MPV全屏时的冻屏问题，默认 yes
 
- hwdec              = no             # 指定应使用的硬件视频解码API，默认软解（no）。10系以上N卡如需硬解强烈建议使用 nvdec-copy
-                                     # 值 auto 等效 yes 即原生硬解。追求效率可使用，但不支持部分设置/滤镜/着色器
-                                     # 平衡选择推荐使用 auto-copy
+ hwdec              = no             # 指定应使用的硬件视频解码API，默认值 no 为软解。值 auto 等效 yes 即原生硬解，但不支持部分设置/滤镜。
+                                     # 它也可以是多个值组成的优先级列表，例如值 vulkan-copy,nvdec-copy,dxva2-copy 表示依次尝试这些解码模式
+                                     # 更多详情参见Wiki的FAQ页面下的“软硬解的选择”部分
  hwdec-codecs       = "h264,vc1,hevc,vp8,vp9,av1,prores"
                                      # 对限定范围内的编码尝试硬解，特殊值 all 即任意格式都尝试硬解，当前版本默认值 h264,vc1,hevc,vp8,vp9,av1,prores
  vd-lavc-dr         = auto           # <默认auto|yes|no> 是否直接解码到显存，个别低端英特尔处理器可能需要显式禁用此功能以大幅提速解码
@@ -139,7 +139,8 @@
                                     # 特殊值 stereo 强制多声道音源下混为双声道输出（避免可能的7.1/5.1→2.0声音丢失和音量过小）
  audio-pitch-correction = yes       # 变速播放时的音调修正，默认 yes
  alang                  =
-                                    # 音轨首选语言，但MPV优先加载外挂轨道，此项参数可能实际用处不大。默认为空，例值（优选中文） chs,sc,zh,chi,zho
+                                    # 音轨首选语言，但MPV优先加载外挂轨道，此项参数可能实际用处不大。
+                                    # 默认为空，特殊值可为 auto （尝试匹配系统语言），例值（优选中文） chs,sc,zh,chi,zho
  audio-file-auto        = no        # <默认no|exact|fuzzy|all> 自动加载同名外挂音轨（fuzzy为模糊名，exact为精确名）
 
 
@@ -252,8 +253,10 @@
  sub-file-paths                 =
                                               # 在指定的额外目录中寻找匹配的字幕。支持相对和绝对路径，默认为空
                                               # 例值（ sub;subtitles;字幕;C:/字幕库 ）即自动搜索当前文件路径下名为"sub","subtitles","字幕"和C盘的"字幕库"文件夹内
- slang                          =
-                                              # 字幕首选语言，但MPV优先加载外挂轨道，此项参数可能实际用处不大。默认为空，例值（优选中文） chs,sc,zh,chi,zho
+ slang                          = auto
+                                              # 字幕首选语言，但MPV优先加载外挂轨道，此项参数可能实际用处不大。
+                                              # 默认值为 auto （尝试匹配系统语言），例值（优选中文） chs,sc,zh,chi,zho
+ subs-fallback                  = no          # <yes|default|默认no> 现有字幕轨无法满足 --slang 的条件时是否回退选择其它字幕，值 default 表示仅选择带有“默认”标记的轨道
  blend-subtitles                = no          # <yes|video|默认no> 在插值和颜色管理之前，将字幕混合到视频帧上。值video类似于yes，但是以视频的原始分辨率绘制字幕，并与视频一起缩放
                                               # 启用此功能会将字幕限制在视频的可见部分（不能出现在视频下方的黑色空白处）
                                               # 还会让字幕受 --icc-profile --target-prim --target-trc --interpolation --gamma-factor --glsl-shaders 的影响
diff --git a/portable_config/scripts/osc_plus.lua b/portable_config/scripts/osc_plus.lua
index 9108658b..6a4e0bd7 100644
--- a/portable_config/scripts/osc_plus.lua
+++ b/portable_config/scripts/osc_plus.lua
@@ -1,6 +1,6 @@
 --[[
 SOURCE_ https://github.com/mpv-player/mpv/blob/master/player/lua/osc.lua
-COMMIT_ b7ffe0d16eec8153d9609382997baaf6a29e5e4f
+COMMIT_ 945d7c1eda47c97c4bfba884fb21f398a64b2289
 文档_ https://github.com/hooke007/MPV_lazy/discussions/18
 
 改进版本的OSC，不兼容其它OSC类脚本（实现全部功能需搭配 新缩略图引擎 thumbfast ）
@@ -1432,6 +1432,11 @@ layouts["box"] = function ()
         {x = posX - pos_offsetX, y = bigbtnrowY, an = 7, w = 70, h = 18}
     lo.style = osc_styles.smallButtonsL
 
+    lo = add_layout("tog_forced_only")
+    lo.geometry =
+        {x = posX - pos_offsetX + 70, y = bigbtnrowY - 1, an = 7, w = 25, h = 18}
+    lo.style = osc_styles.smallButtonsL
+
     lo = add_layout("tog_fs")
     lo.geometry =
         {x = posX+pos_offsetX - 25, y = bigbtnrowY, an = 4, w = 25, h = 25}
@@ -1941,6 +1946,12 @@ function bar_layout(direction)
     lo.geometry = geo
     lo.style = osc_styles.smallButtonsBar
 
+    -- Forced-subs-only button
+    geo = { x = geo.x - geo.w - padX, y = geo.y, an = geo.an, w = geo.w, h = geo.h }
+    lo = add_layout("tog_forced_only")
+    lo.geometry = geo
+    lo.style = osc_styles.smallButtonsBar
+
     -- Track selection buttons
     geo = { x = geo.x - tsW - padX, y = geo.y, an = geo.an, w = tsW, h = geo.h }
     lo = add_layout("cy_sub")
@@ -2327,6 +2338,32 @@ function osc_init()
     ne.eventresponder["shift+mbtn_left_down"] =
         function () show_message(get_tracklist("sub"), 2) end
 
+    -- tog_forced_only: "[F]" button; renders empty unless the sub codec is dvd_subtitle/hdmv_pgs_subtitle
+    -- NOTE: do not reassign `ne` here, the wheel bindings right after this block must stay on cy_sub
+    local tog_forced_only = new_element("tog_forced_only", "button")
+
+    tog_forced_only.content = function ()
+        local sub_codec = mp.get_property("current-tracks/sub/codec")
+        if (sub_codec ~= "dvd_subtitle" and sub_codec ~= "hdmv_pgs_subtitle") then
+            return ""
+        end
+        local base_a = tog_forced_only.layout.alpha
+        local alpha = base_a[1]
+        if not mp.get_property_bool("sub-forced-only-cur") then
+            alpha = 255
+        end
+        local ret = assdraw.ass_new()
+        ret:append("[")
+        ass_append_alpha(ret, {[1] = alpha, [2] = 1, [3] = base_a[3], [4] = base_a[4]}, 0)
+        ret:append("F")
+        ass_append_alpha(ret, base_a, 0)
+        ret:append("]")
+        return ret.text
+    end
+    tog_forced_only.eventresponder["mbtn_left_up"] = function ()
+        mp.set_property_bool("sub-forced-only", (not mp.get_property_bool("sub-forced-only-cur")))
+    end
+
     ne.eventresponder["wheel_up_press"] =
         function () set_track("sub", -1) end
     ne.eventresponder["wheel_down_press"] =
diff --git a/portable_config/scripts/thumbfast.lua b/portable_config/scripts/thumbfast.lua
index 5a5a36f4..b1e09549 100644
--- a/portable_config/scripts/thumbfast.lua
+++ b/portable_config/scripts/thumbfast.lua
@@ -1,6 +1,6 @@
 --[[
 SOURCE_ https://github.com/po5/thumbfast/blob/master/thumbfast.lua
-COMMIT_ 8aa6faf10adad899e05cc9b850cde904d37515be
+COMMIT_ 4241c7daa444d3859b51b65a39d30e922adb87e9
 
 适配多个OSC类脚本的新缩略图引擎
 
@@ -260,8 +260,8 @@ local auto_run = options.auto_run
 
 local function info(w, h)
     local short_video = mp.get_property_number("duration", 0) <= options.min_duration
-    local image = properties["current-tracks"] and properties["current-tracks"]["video"] and properties["current-tracks"]["video"]["image"]
-    local albumart = image and properties["current-tracks"]["video"]["albumart"]
+    local image = properties["current-tracks/video"] and properties["current-tracks/video"]["image"]
+    local albumart = image and properties["current-tracks/video"]["albumart"]
 
     disabled = (w or 0) == 0 or (h or 0) == 0 or
         has_vid == 0 or
@@ -692,8 +692,7 @@ local function update_tracklist(name, value)
     -- current-tracks shim
     for _, track in ipairs(value) do
         if track.type == "video" and track.selected then
-            properties["current-tracks/video/image"] = track.image
-            properties["current-tracks/video/albumart"] = track.albumart
+            properties["current-tracks/video"] = track
             return
         end
     end
@@ -748,7 +747,7 @@ local function shutdown()
     end
 end
 
-mp.observe_property("current-tracks", "native", function(name, value)
+mp.observe_property("current-tracks/video", "native", function(name, value)
     update_property(name, value)
 end)
 
diff --git a/portable_config/scripts/uosc/elements/Elements.lua b/portable_config/scripts/uosc/elements/Elements.lua
index 489819a8..fc1cc55f 100644
--- a/portable_config/scripts/uosc/elements/Elements.lua
+++ b/portable_config/scripts/uosc/elements/Elements.lua
@@ -22,7 +22,7 @@ function Elements:remove(idOrElement)
 	if element then
 		if not element.destroyed then element:destroy() end
 		element.enabled = false
-		self.itable = itable_remove(self.itable, self[id])
+		self.itable = itable_delete_value(self.itable, self[id])
 		self[id] = nil
 		request_render()
 	end
diff --git a/portable_config/scripts/uosc/elements/Menu.lua b/portable_config/scripts/uosc/elements/Menu.lua
index 99d736f3..0a1b1f34 100644
--- a/portable_config/scripts/uosc/elements/Menu.lua
+++ b/portable_config/scripts/uosc/elements/Menu.lua
@@ -3,13 +3,13 @@ local Element = require('elements/Element')
 -- Menu data structure accepted by `Menu:open(menu)`.
 ---@alias MenuData {type?: string; title?: string; hint?: string; keep_open?: boolean; separator?: boolean; items?: MenuDataItem[]; selected_index?: integer;}
 ---@alias MenuDataItem MenuDataValue|MenuData
----@alias MenuDataValue {title?: string; hint?: string; icon?: string; value: any; bold?: boolean; italic?: boolean; muted?: boolean; active?: boolean; keep_open?: boolean; separator?: boolean;}
+---@alias MenuDataValue {title?: string; hint?: string; icon?: string; value: any; bold?: boolean; italic?: boolean; muted?: boolean; active?: boolean; keep_open?: boolean; separator?: boolean; selectable?: boolean; align?: 'left'|'center'|'right'}
 ---@alias MenuOptions {mouse_nav?: boolean; on_open?: fun(); on_close?: fun(); on_back?: fun(); on_move_item?: fun(from_index: integer, to_index: integer, submenu_path: integer[]); on_delete_item?: fun(index: integer, submenu_path: integer[])}
 
 -- Internal data structure created from `Menu`.
 ---@alias MenuStack {id?: string; type?: string; title?: string; hint?: string; selected_index?: number; keep_open?: boolean; separator?: boolean; items: MenuStackItem[]; parent_menu?: MenuStack; submenu_path: integer[]; active?: boolean; width: number; height: number; top: number; scroll_y: number; scroll_height: number; title_width: number; hint_width: number; max_width: number; is_root?: boolean; fling?: Fling}
 ---@alias MenuStackItem MenuStackValue|MenuStack
----@alias MenuStackValue {title?: string; hint?: string; icon?: string; value: any; active?: boolean; bold?: boolean; italic?: boolean; muted?: boolean; keep_open?: boolean; separator?: boolean; title_width: number; hint_width: number}
+---@alias MenuStackValue {title?: string; hint?: string; icon?: string; value: any; active?: boolean; bold?: boolean; italic?: boolean; muted?: boolean; keep_open?: boolean; separator?: boolean; selectable?: boolean; align?: 'left'|'center'|'right'; title_width: number; hint_width: number}
 ---@alias Fling {y: number, distance: number, time: number, easing: fun(x: number), duration: number, update_cursor?: boolean}
 
 ---@alias Modifiers {shift?: boolean, ctrl?: boolean, alt?: boolean}
@@ -156,7 +156,7 @@ function Menu:update(data)
 
 		-- Update items
 		local first_active_index = nil
-		menu.items = {} -- {{title = lang._menu_item_empty_title, value = 'ignore', italic = 'true', muted = 'true'}}
+		menu.items = {} -- {{title = lang._menu_item_empty_title, value = 'ignore', italic = 'true', muted = 'true', selectable = false, align = 'center'}}
 
 		for i, item_data in ipairs(menu_data.items or {}) do
 			if item_data.active and not first_active_index then first_active_index = i end
@@ -164,6 +164,7 @@ function Menu:update(data)
 			local item = {}
 			table_assign(item, item_data, {
 				'title', 'icon', 'hint', 'active', 'bold', 'italic', 'muted', 'value', 'keep_open', 'separator',
+				'selectable', 'align'
 			})
 			if item.keep_open == nil then item.keep_open = menu.keep_open end
 
@@ -265,10 +266,14 @@ function Menu:reset_navigation()
 
 	-- Reset indexes and scroll
 	self:scroll_to(menu.scroll_y) -- clamps scroll_y to scroll limits
-	if self.mouse_nav then
-		self:select_item_below_cursor()
+	if menu.items and #menu.items > 0 then
+		-- Normalize existing selected_index always, and force it only in keyboard navigation
+		if not self.mouse_nav and not menu.selected_index then
+			local from = clamp(1, menu.selected_index or 1, #menu.items)
+			self:select_index(itable_find(menu.items, function(item) return item.selectable ~= false end, from), menu)
+		end
 	else
-		self:select_index((menu.items and #menu.items > 0) and clamp(1, menu.selected_index or 1, #menu.items) or nil)
+		self:select_index(nil)
 	end
 
 	-- Walk up the parent menu chain and activate items that lead to current menu
@@ -289,12 +294,6 @@ end
 
 function Menu:fadeout(callback) self:tween_property('opacity', 1, 0, callback) end
 
-function Menu:get_item_index_below_cursor()
-	local menu = self.current
-	if #menu.items < 1 or self.proximity_raw > 0 then return nil end
-	return math.max(1, math.min(math.ceil((cursor.y - self.ay + menu.scroll_y) / self.scroll_step), #menu.items))
-end
-
 function Menu:get_first_active_index(menu)
 	menu = menu or self.current
 	for index, item in ipairs(self.current.items) do
@@ -445,15 +444,31 @@ end
 ---@param menu? MenuStack
 function Menu:prev(menu)
 	menu = menu or self.current
-	menu.selected_index = math.max(menu.selected_index and menu.selected_index - 1 or #menu.items, 1)
-	self:scroll_to_index(menu.selected_index, menu, true)
+	local initial_index = menu.selected_index and menu.selected_index - 1 or #menu.items
+	if initial_index > 0 then
+		menu.selected_index = itable_find(menu.items, function(item) return item.selectable ~= false end, initial_index, 1)
+		self:scroll_to_index(menu.selected_index, menu, true)
+	end
 end
 
 ---@param menu? MenuStack
 function Menu:next(menu)
 	menu = menu or self.current
-	menu.selected_index = math.min(menu.selected_index and menu.selected_index + 1 or 1, #menu.items)
-	self:scroll_to_index(menu.selected_index, menu, true)
+	local initial_index = menu.selected_index and menu.selected_index + 1 or 1
+	if initial_index <= #menu.items then
+		menu.selected_index = itable_find(menu.items, function(item) return item.selectable ~= false end, initial_index)
+		self:scroll_to_index(menu.selected_index, menu, true)
+	end
+end
+
+---@param menu MenuStack One of menus in `self.all`.
+---@param x number `x` coordinate to slide from.
+function Menu:slide_in_menu(menu, x)
+	local current = self.current
+	current.selected_index = nil
+	self:activate_menu(menu)
+	self:tween(-(display.width / 2 - menu.width / 2 - x), 0, function(offset) self:set_offset_x(offset) end)
+	self.opacity = 1 -- in case tween above canceled fade in animation
 end
 
 function Menu:back()
@@ -462,20 +477,17 @@ function Menu:back()
 		if self.is_closed then return end
 	end
 
-	local menu = self.current
-	local parent = menu.parent_menu
+	local current = self.current
+	local parent = current.parent_menu
 
 	if parent then
-		menu.selected_index = nil
-		self:activate_menu(parent)
-		self:tween(self.offset_x - menu.width / 2, 0, function(offset) self:set_offset_x(offset) end)
-		self.opacity = 1 -- in case tween above canceled fade in animation
+		self:slide_in_menu(parent, display.width / 2 - current.width / 2 - parent.width / 2 + self.offset_x)
 	else
 		self:close()
 	end
 end
 
----@param opts? {keep_open?: boolean, preselect_submenu_item?: boolean}
+---@param opts? {keep_open?: boolean, preselect_first_item?: boolean}
 function Menu:open_selected_item(opts)
 	opts = opts or {}
 	local menu = self.current
@@ -483,7 +495,7 @@ function Menu:open_selected_item(opts)
 		local item = menu.items[menu.selected_index]
 		-- Is submenu
 		if item.items then
-			if opts.preselect_submenu_item then
+			if opts.preselect_first_item then
 				item.selected_index = #item.items > 0 and 1 or nil
 			end
 			self:activate_menu(item)
@@ -497,8 +509,7 @@ function Menu:open_selected_item(opts)
 end
 
 function Menu:open_selected_item_soft() self:open_selected_item({keep_open = true}) end
-function Menu:open_selected_item_preselect() self:open_selected_item({preselect_submenu_item = true}) end
-function Menu:select_item_below_cursor() self.current.selected_index = self:get_item_index_below_cursor() end
+function Menu:open_selected_item_preselect() self:open_selected_item({preselect_first_item = true}) end
 
 ---@param index integer
 function Menu:move_selected_item_to(index)
@@ -531,8 +542,7 @@ function Menu:handle_cursor_down()
 		self.drag_data = {{y = cursor.y, time = mp.get_time()}}
 		self.current.fling = nil
 	else
-		if cursor.x < self.ax and self.current.parent_menu then self:back()
-		else self:close() end
+		self:close()
 	end
 end
 
@@ -548,8 +558,7 @@ end
 
 function Menu:handle_cursor_up()
 	if self.proximity_raw == 0 and self.drag_data and not self.is_dragging then
-		self:select_item_below_cursor()
-		self:open_selected_item({preselect_submenu_item = false, keep_open = self.modifiers and self.modifiers.shift})
+		self:open_selected_item({preselect_first_item = false, keep_open = self.modifiers and self.modifiers.shift})
 	end
 	if self.is_dragging then
 		local distance = self:fling_distance()
@@ -564,7 +573,6 @@ function Menu:handle_cursor_up()
 	self.drag_data = nil
 end
 
-
 function Menu:on_global_mouse_move()
 	self.mouse_nav = true
 	if self.drag_data then
@@ -573,8 +581,6 @@ function Menu:on_global_mouse_move()
 		if distance ~= 0 then self:set_scroll_by(distance) end
 		self.drag_data[#self.drag_data + 1] = {y = cursor.y, time = mp.get_time()}
 	end
-	if self.proximity_raw == 0 or self.is_dragging then self:select_item_below_cursor()
-	else self.current.selected_index = nil end
 	request_render()
 end
 
@@ -673,17 +679,14 @@ function Menu:create_key_action(name, modifiers)
 end
 
 function Menu:render()
-	local update_cursor = false
 	for _, menu in ipairs(self.all) do
 		if menu.fling then
-			update_cursor = update_cursor or menu.fling.update_cursor or false
 			local time_delta = state.render_last_time - menu.fling.time
 			local progress = menu.fling.easing(math.min(time_delta / menu.fling.duration, 1))
 			self:set_scroll_to(round(menu.fling.y + menu.fling.distance * progress), menu)
 			if progress < 1 then request_render() else menu.fling = nil end
 		end
 	end
-	if update_cursor then self:select_item_below_cursor() end
 
 	cursor.on_primary_down = function() self:handle_cursor_down() end
 	cursor.on_primary_up = function() self:handle_cursor_up() end
@@ -696,28 +699,44 @@ function Menu:render()
 	local opacity = options.menu_opacity * self.opacity
 	local spacing = self.item_padding
 	local icon_size = self.font_size
-
-	function draw_menu(menu, x, y, opacity)
-		local ax, ay, bx, by = x, y, x + menu.width, y + menu.height
+	local menu_gap, menu_padding = 2, 2
+
+	---@param menu MenuStack
+	---@param x number
+	---@param pos number Horizontal position index. 0 = current menu, <0 parent menus, >0 submenu.
+	local function draw_menu(menu, x, pos)
+		local is_current, is_parent, is_submenu = pos == 0, pos < 0, pos > 0
+		local menu_opacity = pos == 0 and opacity or opacity * (options.menu_parent_opacity ^ math.abs(pos))
+		local ax, ay, bx, by = x, menu.top, x + menu.width, menu.top + menu.height
 		local draw_title = menu.is_root and menu.title
 		local scroll_clip = '\\clip(0,' .. ay .. ',' .. display.width .. ',' .. by .. ')'
 		local start_index = math.floor(menu.scroll_y / self.scroll_step) + 1
 		local end_index = math.ceil((menu.scroll_y + menu.height) / self.scroll_step)
-		local selected_index = menu.selected_index or -1
-		-- remove menu_opacity to start off with full opacity, but still decay for parent menus
-		local text_opacity = opacity / options.menu_opacity
+		-- Remove menu_opacity to start off with full, but still decay for parent menus
+		local text_opacity = menu_opacity / options.menu_opacity
+		local menu_rect = {ax = ax, ay = ay - (draw_title and self.item_height or 0) - 2, bx = bx, by = by + 2}
+		local blur_selected_index = is_current and self.mouse_nav
 
 		-- Background
-		ass:rect(ax, ay - (draw_title and self.item_height or 0) - 2, bx, by + 2, {
-			color = bg, opacity = opacity, radius = 4,
-		})
+		ass:rect(menu_rect.ax, menu_rect.ay, menu_rect.bx, menu_rect.by, {color = bg, opacity = menu_opacity, radius = 4})
+
+		if is_parent and get_point_to_rectangle_proximity(cursor, menu_rect) == 0 then
+			cursor.on_primary_down = function() self:slide_in_menu(menu, x) end
+		end
+
+		-- Draw submenu if selected
+		local submenu_rect, current_item = nil, is_current and menu.selected_index and menu.items[menu.selected_index]
+		local submenu_is_hovered = false
+		if current_item and current_item.items then
+			submenu_rect = draw_menu(current_item, menu_rect.bx + menu_gap, 1)
+			submenu_is_hovered = get_point_to_rectangle_proximity(cursor, submenu_rect) == 0
+			if submenu_is_hovered then
+				cursor.on_primary_down = function() self:open_selected_item({preselect_first_item = false}) end
+			end
+		end
 
 		for index = start_index, end_index, 1 do
 			local item = menu.items[index]
-			local next_item = menu.items[index + 1]
-			local is_highlighted = selected_index == index or item.active
-			local next_is_active = next_item and next_item.active
-			local next_is_highlighted = selected_index == index + 1 or next_is_active
 
 			if not item then break end
 
@@ -726,24 +745,47 @@ function Menu:render()
 			local item_center_y = item_ay + (self.item_height / 2)
 			local item_clip = (item_ay < ay or item_by > by) and scroll_clip or nil
 			local content_ax, content_bx = ax + spacing, bx - spacing
+			local is_selected = menu.selected_index == index or item.active
+
+			-- Select hovered item
+			if is_current and self.mouse_nav then
+				if submenu_rect and cursor.direction_to_rectangle_distance(submenu_rect) then
+					blur_selected_index = false
+				else
+					local item_rect_hitbox = {
+						ax = menu_rect.ax + menu_padding,
+						ay = item_ay,
+						bx = menu_rect.bx + (item.items and menu_gap or -menu_padding), -- to bridge the gap with cursor
+						by = item_by
+					}
+					if submenu_is_hovered or get_point_to_rectangle_proximity(cursor, item_rect_hitbox) == 0 then
+						blur_selected_index = false
+						menu.selected_index = index
+					end
+				end
+			end
+
+			local next_item = menu.items[index + 1]
+			local next_is_active = next_item and next_item.active
+			local next_is_highlighted = menu.selected_index == index + 1 or next_is_active
 			local font_color = item.active and fgt or bgt
 			local shadow_color = item.active and fg or bg
 
 			-- Separator
 			local separator_ay = item.separator and item_by - 1 or item_by
 			local separator_by = item_by + (item.separator and 2 or 1)
-			if is_highlighted then separator_ay = item_by + 1 end
+			if is_selected then separator_ay = item_by + 1 end
 			if next_is_highlighted then separator_by = item_by end
 			if separator_by - separator_ay > 0 and item_by < by then
 				ass:rect(ax + spacing / 2, separator_ay, bx - spacing / 2, separator_by, {
-					color = fg, opacity = opacity * (item.separator and 0.08 or 0.06),
+					color = fg, opacity = menu_opacity * (item.separator and 0.08 or 0.06),
 				})
 			end
 
 			-- Highlight
-			local highlight_opacity = 0 + (item.active and 0.8 or 0) + (selected_index == index and 0.15 or 0)
-			if highlight_opacity > 0 then
-				ass:rect(ax + 2, item_ay, bx - 2, item_by, {
+			local highlight_opacity = 0 + (item.active and 0.8 or 0) + (menu.selected_index == index and 0.15 or 0)
+			if not is_submenu and highlight_opacity > 0 then
+				ass:rect(ax + menu_padding, item_ay, bx - menu_padding, item_by, {
 					radius = 2, color = fg, opacity = highlight_opacity * text_opacity,
 					clip = item_clip,
 				})
@@ -777,7 +819,7 @@ function Menu:render()
 				local clip = '\\clip(' .. title_cut_x .. ',' ..
 					math.max(item_ay, ay) .. ',' .. bx .. ',' .. math.min(item_by, by) .. ')'
 				ass:txt(content_bx, item_center_y, 6, item.ass_safe_hint, {
-					size = self.font_size_hint, color = font_color, wrap = 2, opacity = 0.5 * opacity, clip = clip,
+					size = self.font_size_hint, color = font_color, wrap = 2, opacity = 0.5 * menu_opacity, clip = clip,
 					shadow = 1, shadow_color = shadow_color,
 				})
 			end
@@ -787,7 +829,13 @@ function Menu:render()
 				item.ass_safe_title = item.ass_safe_title or ass_escape(item.title)
 				local clip = '\\clip(' .. ax .. ',' .. math.max(item_ay, ay) .. ','
 					.. title_cut_x .. ',' .. math.min(item_by, by) .. ')'
-				ass:txt(content_ax, item_center_y, 4, item.ass_safe_title, {
+				local title_x, align = content_ax, 4
+				if item.align == 'right' then
+					title_x, align = title_cut_x, 6
+				elseif item.align == 'center' then
+					title_x, align = content_ax + (title_cut_x - content_ax) / 2, 5
+				end
+				ass:txt(title_x, item_center_y, align, item.ass_safe_title, {
 					size = self.font_size, color = font_color, italic = item.italic, bold = item.bold, wrap = 2,
 					opacity = text_opacity * (item.muted and 0.5 or 1), clip = clip,
 					shadow = 1, shadow_color = shadow_color,
@@ -803,15 +851,15 @@ function Menu:render()
 
 			-- Background
 			ass:rect(ax + 2, title_ay, bx - 2, title_ay + title_height, {
-				color = fg, opacity = opacity * 0.8, radius = 2,
+				color = fg, opacity = menu_opacity * 0.8, radius = 2,
 			})
 			ass:texture(ax + 2, title_ay, bx - 2, title_ay + title_height, 'n', {
-				size = 80, color = bg, opacity = opacity * 0.1,
+				size = 80, color = bg, opacity = menu_opacity * 0.1,
 			})
 
 			-- Title
 			ass:txt(ax + menu.width / 2, title_ay + (title_height / 2), 5, menu.ass_safe_title, {
-				size = self.font_size, bold = true, color = bg, wrap = 2, opacity = opacity,
+				size = self.font_size, bold = true, color = bg, wrap = 2, opacity = menu_opacity,
 				clip = '\\clip(' .. ax .. ',' .. title_ay .. ',' .. bx .. ',' .. ay .. ')',
 			})
 		end
@@ -821,33 +869,31 @@ function Menu:render()
 			local groove_height = menu.height - 2
 			local thumb_height = math.max((menu.height / (menu.scroll_height + menu.height)) * groove_height, 40)
 			local thumb_y = ay + 1 + ((menu.scroll_y / menu.scroll_height) * (groove_height - thumb_height))
-			ass:rect(bx - 3, thumb_y, bx - 1, thumb_y + thumb_height, {color = fg, opacity = opacity * 0.8})
+			ass:rect(bx - 3, thumb_y, bx - 1, thumb_y + thumb_height, {color = fg, opacity = menu_opacity * 0.8})
+		end
+
+		-- We are in mouse nav and cursor isn't hovering any item
+		if blur_selected_index then
+			menu.selected_index = nil
 		end
+
+		return menu_rect
 	end
 
 	-- Main menu
-	draw_menu(self.current, self.ax, self.ay, opacity)
+	draw_menu(self.current, self.ax, 0)
 
 	-- Parent menus
 	local parent_menu = self.current.parent_menu
-	local parent_offset_x = self.ax
-	local parent_opacity_factor = options.menu_parent_opacity
-	local menu_gap = 2
+	local parent_offset_x, parent_horizontal_index = self.ax, -1
 
 	while parent_menu do
 		parent_offset_x = parent_offset_x - parent_menu.width - menu_gap
-		draw_menu(parent_menu, parent_offset_x, parent_menu.top, parent_opacity_factor * opacity)
-		parent_opacity_factor = parent_opacity_factor * parent_opacity_factor
+		draw_menu(parent_menu, parent_offset_x, parent_horizontal_index)
+		parent_horizontal_index = parent_horizontal_index - 1
 		parent_menu = parent_menu.parent_menu
 	end
 
-	-- Selected menu
-	local selected_menu = self.current.items[self.current.selected_index]
-
-	if selected_menu and selected_menu.items then
-		draw_menu(selected_menu, self.bx + menu_gap, selected_menu.top, options.menu_parent_opacity * opacity)
-	end
-
 	return ass
 end
 
diff --git a/portable_config/scripts/uosc/elements/Timeline.lua b/portable_config/scripts/uosc/elements/Timeline.lua
index ebcd2684..29708bfb 100644
--- a/portable_config/scripts/uosc/elements/Timeline.lua
+++ b/portable_config/scripts/uosc/elements/Timeline.lua
@@ -411,7 +411,7 @@ function Timeline:render()
 
 		-- Chapter title
 		if #state.chapters > 0 then
-			local _, chapter = itable_find(state.chapters, function(c) return hovered_seconds >= c.time end, true)
+			local _, chapter = itable_find(state.chapters, function(c) return hovered_seconds >= c.time end, #state.chapters, 1)
 			if chapter and not chapter.is_end_only then
 				ass:tooltip(tooltip_anchor, chapter.title_wrapped, {
 					size = self.font_size, offset = 10, responsive = false, bold = true,
diff --git a/portable_config/scripts/uosc/elements/TopBar.lua b/portable_config/scripts/uosc/elements/TopBar.lua
index b15121fc..85366682 100644
--- a/portable_config/scripts/uosc/elements/TopBar.lua
+++ b/portable_config/scripts/uosc/elements/TopBar.lua
@@ -197,13 +197,15 @@ function TopBar:render()
 				}
 				local bx = math.min(max_bx, title_ax + text_width(main_title, opts) + padding * 2)
 				local by = self.by - bg_margin
-				local rect = {ax = title_ax, ay = self.ay, bx = self.title_bx, by = self.by}
+				local title_rect = {ax = title_ax, ay = title_ay, bx = bx, by = by}
 
-				if get_point_to_rectangle_proximity(cursor, rect) == 0 then
+				if options.top_bar_alt_title_place == 'toggle'
+					and get_point_to_rectangle_proximity(cursor, title_rect) == 0 then
 					cursor.on_primary_down = function() self:toggle_title() end
+					cursor.allow_dragging = true
 				end
 
-				ass:rect(title_ax, title_ay, bx, by, {
+				ass:rect(title_rect.ax, title_rect.ay, title_rect.bx, title_rect.by, {
 					color = bg, opacity = visibility * options.top_bar_title_opacity, radius = 2,
 				})
 				ass:txt(title_ax + padding, self.ay + (self.size / 2), 4, main_title, opts)
diff --git a/portable_config/scripts/uosc/lib/std.lua b/portable_config/scripts/uosc/lib/std.lua
index 12616661..c72ccb14 100644
--- a/portable_config/scripts/uosc/lib/std.lua
+++ b/portable_config/scripts/uosc/lib/std.lua
@@ -75,15 +75,25 @@ function itable_index_of(itable, value)
 	end
 end
 
+---@param itable table
+---@param value any
+---@return boolean
+function itable_has(itable, value)
+	return itable_index_of(itable, value) ~= nil
+end
+
 ---@param itable table
 ---@param compare fun(value: any, index: number)
----@param from_end? boolean Search from the end of the table.
+---@param from? number Where to start search, defaults to `1`.
+---@param to? number Where to end search, defaults to `#itable`.
 ---@return number|nil index
 ---@return any|nil value
-function itable_find(itable, compare, from_end)
-	local from, to, step = from_end and #itable or 1, from_end and 1 or #itable, from_end and -1 or 1
-	for index = from, to, step do
-		if compare(itable[index], index) then return index, itable[index] end
+function itable_find(itable, compare, from, to)
+	from, to = from or 1, to or #itable
+	for index = from, to, from < to and 1 or -1 do
+		if index > 0 and index <= #itable and compare(itable[index], index) then
+			return index, itable[index]
+		end
 	end
 end
 
@@ -99,8 +109,21 @@ end
 
 ---@param itable table
 ---@param value any
-function itable_remove(itable, value)
-	return itable_filter(itable, function(item) return item ~= value end)
+function itable_delete_value(itable, value)
+	for index = #itable, 1, -1 do -- backwards, so `table.remove` never shifts items not yet visited
+		if itable[index] == value then table.remove(itable, index) end
+	end
+	return itable -- the same table, mutated in place, with every occurrence of `value` removed
+end
+
+---@param itable table Table walked with `ipairs`; it is not modified.
+---@param transformer fun(value: any, index: number) : any Called once per item; its result is stored at the same index.
+function itable_map(itable, transformer)
+	local result = {}
+	for index, value in ipairs(itable) do
+		result[index] = transformer(value, index)
+	end
+	return result -- new table; a `nil` returned by `transformer` leaves a hole at that index
+end
 
 ---@param itable table
diff --git a/portable_config/scripts/uosc/lib/text.lua b/portable_config/scripts/uosc/lib/text.lua
index eca4de2a..0af8a6c5 100644
--- a/portable_config/scripts/uosc/lib/text.lua
+++ b/portable_config/scripts/uosc/lib/text.lua
@@ -51,11 +51,12 @@ local osd_width, osd_height = 100, 100
 ---@return integer
 local function utf8_char_bytes(str, i)
 	local char_byte = str:byte(i)
-	if char_byte < 0xC0 then return 1
-	elseif char_byte < 0xE0 then return 2
-	elseif char_byte < 0xF0 then return 3
-	elseif char_byte < 0xF8 then return 4
-	else return 1 end
+	local max_bytes = #str - i + 1
+	if char_byte < 0xC0 then return math.min(max_bytes, 1)
+	elseif char_byte < 0xE0 then return math.min(max_bytes, 2)
+	elseif char_byte < 0xF0 then return math.min(max_bytes, 3)
+	elseif char_byte < 0xF8 then return math.min(max_bytes, 4)
+	else return math.min(max_bytes, 1) end
 end
 
 ---Creates an iterator for an utf-8 encoded string
@@ -87,9 +88,7 @@ local function utf8_to_unicode(str, i)
 		unicode = char_byte * (2 ^ 6) ^ (byte_count - 1)
 	end
 	for j = 2, byte_count do
-		if i + j - 1 <= #str then -- 临时修复 https://github.com/tomasklaen/uosc/issues/515
-			char_byte = str:byte(i + j - 1) - 0x80
-		end
+		char_byte = str:byte(i + j - 1) - 0x80
 		unicode = unicode + char_byte * (2 ^ 6) ^ (byte_count - j)
 	end
 	return round(unicode)
diff --git a/portable_config/scripts/uosc/lib/utils.lua b/portable_config/scripts/uosc/lib/utils.lua
index 43892c98..134d65cb 100644
--- a/portable_config/scripts/uosc/lib/utils.lua
+++ b/portable_config/scripts/uosc/lib/utils.lua
@@ -100,6 +100,73 @@ function get_point_to_point_proximity(point_a, point_b)
 	return math.sqrt(dx * dx + dy * dy)
 end
 
+---@param lax number X of the first segment's start point.
+---@param lay number Y of the first segment's start point.
+---@param lbx number X of the first segment's end point.
+---@param lby number Y of the first segment's end point.
+---@param max number X of the second segment's start point.
+---@param may number Y of the second segment's start point.
+---@param mbx number X of the second segment's end point.
+---@param mby number Y of the second segment's end point.
+function get_line_to_line_intersection(lax, lay, lbx, lby, max, may, mbx, mby)
+	-- uA/uB: relative position of the crossing point along segment l/m (0 = start, 1 = end)
+	local uA = ((mbx-max)*(lay-may) - (mby-may)*(lax-max)) / ((mby-may)*(lbx-lax) - (mbx-max)*(lby-lay))
+	local uB = ((lbx-lax)*(lay-may) - (lby-lay)*(lax-max)) / ((mby-may)*(lbx-lax) - (mbx-max)*(lby-lay))
+
+	-- Both within 0-1 means the segments cross; parallel lines divide by zero (inf/NaN) and fall through
+	if uA >= 0 and uA <= 1 and uB >= 0 and uB <= 1 then
+		return lax + (uA * (lbx-lax)), lay + (uA * (lby-lay))
+	end
+
+	return nil, nil
+end
+
+-- Returns distance from the start (rax, ray) of a finite ray ending at (rbx, rby) to the
+-- point where it crosses the segment (lax, lay)-(lbx, lby), or `nil` if they don't cross.
+---@param rax number
+---@param ray number
+---@param rbx number
+---@param rby number
+---@param lax number
+---@param lay number
+---@param lbx number
+---@param lby number
+function get_ray_to_line_distance(rax, ray, rbx, rby, lax, lay, lbx, lby)
+	local x, y = get_line_to_line_intersection(rax, ray, rbx, rby, lax, lay, lbx, lby)
+	if x then
+		return math.sqrt((rax - x) ^ 2 + (ray - y) ^ 2) -- Euclidean distance from ray start to the crossing point
+	end
+	return nil
+end
+
+-- Returns distance from the start (ax, ay) of a finite ray ending at (bx, by) to the nearest
+-- rectangle edge it crosses, `0` if it starts inside the rectangle, `nil` if it never hits it.
+---@param  ax number
+---@param  ay number
+---@param  bx number
+---@param  by number
+---@param  rect {ax: number; ay: number; bx: number; by: number}
+---@return number|nil
+function get_ray_to_rectangle_distance(ax, ay, bx, by, rect)
+	-- Is inside
+	if ax >= rect.ax and ax <= rect.bx and ay >= rect.ay and ay <= rect.by then
+		return 0
+	end
+
+	local closest = nil
+
+	local function updateDistance(distance) -- keeps the smallest non-nil distance seen so far
+		if distance and (not closest or distance < closest) then closest = distance end
+	end
+
+	updateDistance(get_ray_to_line_distance(ax, ay, bx, by, rect.ax, rect.ay, rect.bx, rect.ay)) -- edge along y = rect.ay
+	updateDistance(get_ray_to_line_distance(ax, ay, bx, by, rect.bx, rect.ay, rect.bx, rect.by)) -- edge along x = rect.bx
+	updateDistance(get_ray_to_line_distance(ax, ay, bx, by, rect.ax, rect.by, rect.bx, rect.by)) -- edge along y = rect.by
+	updateDistance(get_ray_to_line_distance(ax, ay, bx, by, rect.ax, rect.ay, rect.ax, rect.by)) -- edge along x = rect.ax
+
+	return closest
+end
+
 -- Call function with args if it exists
 function call_maybe(fn, ...)
 	if type(fn) == 'function' then fn(...) end
@@ -350,28 +417,39 @@ end
 
 -- Navigates in a list, using delta or, when `state.shuffle` is enabled,
 -- randomness to determine the next item. Loops around if `loop-playlist` is enabled.
----@param list table
+---@param paths table
 ---@param current_index number
 ---@param delta number
-function decide_navigation_in_list(list, current_index, delta)
-	if #list < 2 then return #list, list[#list] end
+function decide_navigation_in_list(paths, current_index, delta)
+	if #paths < 2 then return #paths, paths[#paths] end
 
+	-- Shuffle looks at the played files history trimmed to 80% length of the paths
+	-- and removes all paths in it from the potential shuffle pool. This guarantees
+	-- no path repetition until at least 80% of the playlist has been exhausted.
 	if state.shuffle then
-		local new_index = current_index
+		local trimmed_history = itable_slice(state.history, -math.floor(#paths * 0.8))
+		local shuffle_pool = {}
+
+		for index, value in ipairs(paths) do
+			if not itable_has(trimmed_history, value) then
+				shuffle_pool[#shuffle_pool + 1] = index
+			end
+		end
+		if #shuffle_pool == 0 then return end -- duplicate paths can empty the pool; math.random(0) throws on Lua 5.1/LuaJIT
 		math.randomseed(os.time())
-		while current_index == new_index do new_index = math.random(#list) end
-		return new_index, list[new_index]
+		local next_index = shuffle_pool[math.random(#shuffle_pool)]
+		return next_index, paths[next_index]
 	end
 
 	local new_index = current_index + delta
 	if mp.get_property_native('loop-playlist') then
-		if new_index > #list then new_index = new_index % #list
-		elseif new_index < 1 then new_index = #list - new_index end
-	elseif new_index < 1 or new_index > #list then
+		-- Wrap into 1..#paths for any delta; Lua's `%` is floored, so indexes below 1 wrap correctly too
+		new_index = (new_index - 1) % #paths + 1
+	elseif new_index < 1 or new_index > #paths then
 		return
 	end
 
-	return new_index, list[new_index]
+	return new_index, paths[new_index]
 end
 
 ---@param delta number
@@ -389,7 +467,8 @@ end
 function navigate_playlist(delta)
 	local playlist, pos = mp.get_property_native('playlist'), mp.get_property_native('playlist-pos-1')
 	if playlist and #playlist > 1 and pos then
-		local index = decide_navigation_in_list(playlist, pos, delta)
+		local paths = itable_map(playlist, function(item) return normalize_path(item.filename) end)
+		local index = decide_navigation_in_list(paths, pos, delta)
 		if index then mp.commandv('playlist-play-index', index - 1) return true end
 	end
 	return false
diff --git a/portable_config/scripts/uosc/main.lua b/portable_config/scripts/uosc/main.lua
index b411b9c0..73fd62b5 100644
--- a/portable_config/scripts/uosc/main.lua
+++ b/portable_config/scripts/uosc/main.lua
@@ -1,6 +1,6 @@
 --[[
 SOURCE_ https://github.com/tomasklaen/uosc/tree/main/scripts
-COMMIT_ 5e2c93055155bc9aec7534d13804d4f0d7f8a72d
+COMMIT_ c8ad77a1a92d0667e1e66f11e84692cd03796ec8
 文档_ https://github.com/hooke007/MPV_lazy/discussions/186
 
 极简主义设计驱动的多功能界面脚本群组，兼容 thumbfast 新缩略图引擎
@@ -347,10 +347,14 @@ cursor = {
 	on_primary_up = nil,
 	on_wheel_down = nil,
 	on_wheel_up = nil,
+	allow_dragging = false,
+	history = {}, -- {x, y}[] history
+	history_size = 10,
 	-- Called at the beginning of each render
 	reset_handlers = function()
 		cursor.on_primary_down, cursor.on_primary_up = nil, nil
 		cursor.on_wheel_down, cursor.on_wheel_up = nil, nil
+		cursor.allow_dragging = false
 	end,
 	mbtn_left_enabled = nil,
 	wheel_enabled = nil,
@@ -359,7 +363,8 @@ cursor = {
 		local enable_mbtn_left = (cursor.on_primary_down or cursor.on_primary_up) ~= nil
 		local enable_wheel = (cursor.on_wheel_down or cursor.on_wheel_up) ~= nil
 		if enable_mbtn_left ~= cursor.mbtn_left_enabled then
-			mp[(enable_mbtn_left and 'enable' or 'disable') .. '_key_bindings']('mbtn_left')
+			local flags = cursor.allow_dragging and 'allow-vo-dragging' or nil
+			mp[(enable_mbtn_left and 'enable' or 'disable') .. '_key_bindings']('mbtn_left', flags)
 			cursor.mbtn_left_enabled = enable_mbtn_left
 		end
 		if enable_wheel ~= cursor.wheel_enabled then
@@ -381,6 +386,17 @@ cursor = {
 			cursor.autohide_timer:kill()
 			cursor.autohide_timer:resume()
 		end
+	end,
+	-- Calculates distance in which cursor reaches rectangle if it continues moving in the same path.
+	-- Returns `nil` if cursor is not moving towards the rectangle, `false` if it is hidden or has no history yet.
+	direction_to_rectangle_distance = function(rect)
+		if cursor.hidden or not cursor.history[1] then
+			return false
+		end
+
+		local prev_x, prev_y = cursor.history[1][1], cursor.history[1][2]
+		local end_x, end_y = cursor.x + (cursor.x - prev_x) * 1e10, cursor.y + (cursor.y - prev_y) * 1e10 -- stretch the movement vector far ahead
+		return get_ray_to_rectangle_distance(cursor.x, cursor.y, end_x, end_y, rect)
+	end
 }
 state = {
@@ -397,6 +413,7 @@ state = {
 	end)(),
 	cwd = mp.get_property('working-directory'),
 	path = nil, -- current file path or URL
+	history = {}, -- history of last played files stored as full paths
 	title = nil,
 	alt_title = nil,
 	time = nil, -- current media playback time
@@ -579,18 +596,24 @@ function update_cursor_position(x, y)
 		else x, y = INFINITY, INFINITY end
 	end
 
-	-- add 0.5 to be in the middle of the pixel
+	-- Add 0.5 to be in the middle of the pixel
 	cursor.x, cursor.y = (x + 0.5) / display.scale_x, (y + 0.5) / display.scale_y
 
 	if old_x ~= cursor.x or old_y ~= cursor.y then
 		Elements:update_proximities()
 
 		if cursor.x == INFINITY or cursor.y == INFINITY then
-			cursor.hidden = true
+			cursor.hidden, cursor.history = true, {}
 			Elements:trigger('global_mouse_leave')
 		elseif cursor.hidden then
-			cursor.hidden = false
+			cursor.hidden, cursor.history = false, {}
 			Elements:trigger('global_mouse_enter')
+		else
+			-- Update cursor history
+			for i = 1, cursor.history_size - 1, 1 do
+				cursor.history[i] = cursor.history[i + 1]
+			end
+			cursor.history[cursor.history_size] = {x, y}
 		end
 
 		Elements:proximity_trigger('mouse_move')
@@ -658,7 +681,7 @@ end
 function select_current_chapter()
 	local current_chapter
 	if state.time and state.chapters then
-		_, current_chapter = itable_find(state.chapters, function(c) return state.time >= c.time end, true)
+		_, current_chapter = itable_find(state.chapters, function(c) return state.time >= c.time end, #state.chapters, 1)
 	end
 	set_state('current_chapter', current_chapter)
 end
@@ -699,7 +722,10 @@ end
 mp.observe_property('mouse-pos', 'native', handle_mouse_pos)
 mp.observe_property('osc', 'bool', function(name, value) if value == true then mp.set_property('osc', 'no') end end)
 mp.register_event('file-loaded', function()
-	set_state('path', normalize_path(mp.get_property_native('path')))
+	local path = normalize_path(mp.get_property_native('path'))
+	itable_delete_value(state.history, path)
+	state.history[#state.history + 1] = path
+	set_state('path', path)
 	Elements:flash({'top_bar'})
 end)
 mp.register_event('end-file', function(event)
diff --git a/portable_config/shaders/nlmeans.glsl b/portable_config/shaders/nlmeans.glsl
index 655da37d..5af94a5d 100644
--- a/portable_config/shaders/nlmeans.glsl
+++ b/portable_config/shaders/nlmeans.glsl
@@ -21,299 +21,1121 @@
 
 // Description: nlmeans.glsl: Default profile, general purpose, tuned for low noise
 
-/* The recommended usage of this shader and its variant profiles is to add them 
- * to input.conf and then dispatch the appropriate shader via a keybind during 
- * media playback. Here is an example input.conf entry:
+/* This shader is highly configurable via user variables below. Although the 
+ * default settings should offer good quality at a reasonable speed, you are 
+ * encouraged to tweak them to your preferences.
+ */
+
+// The following is shader code injected from ../LQ/nlmeans.glsl
+/* vi: ft=c
  *
- * F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl"; show-text "Non-local means (LUMA only)"
+ * Based on vf_nlmeans.c from FFmpeg.
  *
- * These shaders can also be enabled by default in mpv.conf, for example:
+ * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
+ * Copyright (c) 2016 Clément Bœsch <u pkh me>
  *
- * glsl-shaders='~~/shaders/nlmeans.glsl'
+ * This program is free software: you can redistribute it and/or modify it 
+ * under the terms of the GNU Lesser General Public License as published by 
+ * the Free Software Foundation, either version 2.1 of the License, or (at 
+ * your option) any later version.
  *
- * Both of the examples above assume the shaders are located in a subdirectory 
- * named "shaders" within mpv's config directory. Refer to the mpv 
- * documentation for more details.
+ * This program is distributed in the hope that it will be useful, but WITHOUT 
+ * ANY WARRANTY;  without even the implied warranty of MERCHANTABILITY or 
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
+ * for more details.
  *
- * This shader is highly configurable via user variables below. Although the 
+ * You should have received a copy of the GNU Lesser General Public License 
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+// Description: nlmeans.glsl: Faster, but lower quality.
+
+/* This shader is highly configurable via user variables below. Although the 
  * default settings should offer good quality at a reasonable speed, you are 
- * encouraged to tweak them to your preferences. Be mindful that certain 
- * settings may greatly affect speed.
+ * encouraged to tweak them to your preferences.
+ */
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!BIND HOOKED
+//!DESC Non-local means (nlmeans.glsl)
+//!SAVE RF_LUMA
+
+// User variables
+
+// It is generally preferable to denoise luma and chroma differently, so the 
+// user variables for luma and chroma are split.
+
+// Denoising factor (level of blur, higher means more blur)
+#ifdef LUMA_raw
+#define S 3.5968056672833097
+#else
+#define S 5.191526541606411
+#endif
+
+/* Adaptive sharpening
+ *
+ * Performs an unsharp mask by subtracting the spatial kernel's blur from the 
+ * NLM blur. For sharpen+denoise the sharpening is limited to edge areas and 
+ * denoising is done everywhere else.
  *
- * Denoising is most useful for noisy content. If there is no perceptible 
- * noise, you probably won't see a positive difference.
+ * Use V=4 to visualize which areas are sharpened (black means sharpen).
+ *
+ * AS:
+ * 	 - 0: disable
+ * 	 - 1: sharpen+denoise
+ * 	 - 2: sharpen only
+ * ASF: Higher numbers make a sharper image
+ * ASA: Anti-ringing, higher numbers increase strength
+ * ASP: Power, lower numbers increase sharpening on lower frequency detail
+ */
+#ifdef LUMA_raw
+#define AS 0
+#define ASF 0.1625
+#define ASA 5.0
+#define ASP 0.5
+#else
+#define AS 0
+#define ASF 0.1625
+#define ASA 5.0
+#define ASP 0.5
+#endif
+
+/* Starting weight
  *
- * The default settings are generally tuned for low noise and high detail 
- * preservation. The "medium" and "heavy" profiles are tuned for higher levels 
- * of noise.
+ * Also known as the center weight. This represents the weight of the 
+ * pixel-of-interest. Lower numbers may help handle heavy noise & ringing.
  *
- * The denoiser will not work properly if the content has been upscaled 
- * beforehand (whether it was done by you or not). In such cases, consider 
- * issuing a command to downscale in the mpv console (backtick ` key):
+ * EPSILON should be used instead of zero to avoid divide-by-zero errors.
+ */
+#ifdef LUMA_raw
+#define SW 0.7392620481427672
+#else
+#define SW 0.6448288408806067
+#endif
+
+/* Weight discard
  *
- * vf toggle scale=-2:720
+ * Reduces weights that fall below a fraction of the average weight. This culls 
+ * the most dissimilar samples from the blur, which can yield a better result, 
+ * especially around edges.
+ * 
+ * WD:
+ * 	 - 2: Mean. Better quality, but slower and requires GLSL 4.0 or later
+ * 	 - 1: Moving cumulative average. Fast but inaccurate, blurs directionally.
+ * 	 - 0: Disable
  *
- * ...replacing 720 with whatever resolution seems appropriate. Rerun the 
- * command to undo the downscale. It may take some trial-and-error to find the 
- * proper resolution.
+ * WDT: Threshold coefficient, higher numbers discard more
+ * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes
+ * WDS (not for WDK=is_zero): Higher numbers are more eager to reduce weights
  */
+#ifdef LUMA_raw
+#define WD 1
+#define WDT 0.580415381682815
+#define WDP 5.381278367349288
+#define WDS 1.0
+#else
+#define WD 1
+#define WDT 0.913447511792627
+#define WDP 5.832936323930807
+#define WDS 1.0
+#endif
 
-/* Regarding speed
+/* Extremes preserve
  *
- * Speed may vary wildly for different vo and gpu-api settings. Generally 
- * vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this 
- * may be different for your system.
+ * Reduce denoising in very bright/dark areas.
  *
- * If your GPU doesn't support textureGather, or if you are on a version of mpv 
- * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ profile
+ * Disabled by default now. If you want to reenable this, set EP=3/ in 
+ * Makefile.nlm and rebuild.
  *
- * If you plan on tinkering with NLM's settings, read below:
+ * The downscaling factor of the EP shader stage affects what is considered a 
+ * bright/dark area.
  *
- * textureGather only applies to luma and limited to the these configurations:
+ * This is incompatible with RGB. If you have RGB hooks enabled then you will 
+ * have to delete the EP shader stage or specify EP=0 through shader_cfg.
  *
- * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}
- *   - Default, very fast, rotations and reflections should be free
- *   - If this is unusually slow then try changing gpu-api and vo
- *   - If it's still slow, try setting RI/RFI to 0.
+ * EP: 1 to enable, 0 to disable
+ * DP: EP strength on dark areas, 0 to fully denoise
+ * BP: EP strength on bright areas, 0 to fully denoise
+ */
+#ifdef LUMA_raw
+#define EP 0
+#define BP 0.75
+#define DP 0.25
+#else
+#define EP 0
+#define BP 0.0
+#define DP 0.0
+#endif
+
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+
+/* textureGather applicable configurations:
  *
- * - PS=6:RI={0,1,3}:RFI={0,1,2}
+ * - PS={0,3,7,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2}
+ * - PS={0,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2}
+ * - PS=6:RI=0:RFI=0
  *   - Currently the only scalable variant
- *   - Patch shape is asymmetric on two axis
- *   - Rotations should have very little speed impact
- *   - Reflections may have a significant speed impact
  *
  * Options which always disable textureGather:
- * 	- PD
- * 	- NG
+ * 	 - NG
+ * 	 - SAMPLE
+ * 	 - PD
+ *
+ * Running without textureGather may be much slower.
  */
 
-// The following is shader code injected from guided.glsl
-/* vi: ft=c
+/* Patch & research sizes
  *
- * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
+ * P should be an odd number. Higher values are slower and not always better.
  *
- * This program is free software: you can redistribute it and/or modify it 
- * under the terms of the GNU Lesser General Public License as published by 
- * the Free Software Foundation, either version 2.1 of the License, or (at 
- * your option) any later version.
+ * R should be an odd number greater than or equal to 3. Higher values are 
+ * generally better, but slower, blurrier, and gives diminishing returns.
+ */
+#ifdef LUMA_raw
+#define P 3
+#define R 5
+#else
+#define P 3
+#define R 5
+#endif
+
+/* Patch and research shapes
  *
- * This program is distributed in the hope that it will be useful, but WITHOUT 
- * ANY WARRANTY;  without even the implied warranty of MERCHANTABILITY or 
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
- * for more details.
+ * Different shapes have different speed and quality characteristics. Every 
+ * shape (besides square) is smaller than square.
  *
- * You should have received a copy of the GNU Lesser General Public License 
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ * PS applies to patches, RS applies to research zones.
+ *
+ * 0: square (symmetrical)
+ * 1: horizontal line (asymmetric)
+ * 2: vertical line (asymmetric)
+ * 3: diamond (symmetrical)
+ * 4: triangle (asymmetric, pointing upward)
+ * 5: truncated triangle (asymmetric on two axes, last row halved)
+ * 6: even sized square (asymmetric on two axes)
+ * 7: plus (symmetrical)
+ * 8: plus X (symmetrical)
  */
+#ifdef LUMA_raw
+#define RS 3
+#define PS 4
+#else
+#define RS 3
+#define PS 3
+#endif
 
-// Description: guided.glsl: Guided by the downscaled image
+/* Robust filtering
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
+ *
+ * Computes weights on a guide, which could be a downscaled image or the output 
+ * of another shader, and applies the weights to the original image
+ */
+#define RF_LUMA 0
+#define RF 0
 
-/* The radius can be adjusted with the MEANI stage's downscaling factor. 
- * Higher numbers give a bigger radius.
+/* Rotational/reflectional invariance
+ *
+ * Number of rotations/reflections to try for each patch comparison. Can be 
+ * slow, but improves feature preservation. More rotations/reflections gives 
+ * diminishing returns. The most similar rotation/reflection will be used.
  *
- * The E variable can be found in the A stage.
+ * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
+ * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
  *
- * The subsampling (fast guided filter) can be adjusted with the I stage's 
- * downscaling factor. Higher numbers are faster.
+ * Consider setting SAMPLE=1 if setting RI to a setting that would require 
+ * sampling between pixels.
  *
- * The guide's subsampling can be adjusted with the PREI stage's downscaling 
- * factor. Higher numbers downscale more.
+ * RI: Rotational invariance
+ * RFI (0 to 2): Reflectional invariance
  */
+#ifdef LUMA_raw
+#define RI 0
+#define RFI 0
+#else
+#define RI 0
+#define RFI 0
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!BIND HOOKED
-//!WIDTH HOOKED.w 1.25 /
-//!HEIGHT HOOKED.h 1.25 /
-//!DESC Guided filter (PREI)
-//!SAVE _INJ_PREI
+/* Temporal denoising
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
+ *
+ * Caveats:
+ * 	 - Slower:
+ * 	 	 - Each frame needs to be researched (more samples & more math)
+ * 	 	 - Gather optimizations only apply to the current frame
+ * 	 - Requires vo=gpu-next
+ * 	 - Luma-only (this is a bug)
+ * 	 - Buggy
+ *
+ * May cause motion blur and may struggle more with noise that persists across 
+ * multiple frames (e.g., from compression or duplicate frames), but can work 
+ * very well on high quality video.
+ *
+ * Motion estimation (ME) should improve quality without impacting speed.
+ *
+ * T: number of frames used
+ * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg
+ * MEF: estimate factor, compensates for ME being one frame behind
+ * TRF: compare against the denoised frames
+ */
+#ifdef LUMA_raw
+#define T 0
+#define ME 1
+#define MEF 2
+#define TRF 0
+#else
+#define T 0
+#define ME 0
+#define MEF 2
+#define TRF 0
+#endif
+
+/* Spatial kernel
+ *
+ * Increasing the spatial denoising factor (SS) reduces the weight of further 
+ * pixels.
+ *
+ * Spatial distortion instructs the spatial kernel to view that axis as 
+ * closer/further, for instance SD=(1,1,0.5) would make the temporal axis 
+ * appear closer and increase blur between frames.
+ *
+ * The intra-patch variants are supposed to help with larger patch sizes.
+ *
+ * SST: enables spatial kernel if R>=PST, 0 fully disables
+ * SS: spatial sigma
+ * SD: spatial distortion (X, Y, time)
+ * PSS: intra-patch spatial sigma
+ * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables
+ * PSD: intra-patch spatial distortion (X, Y)
+ */
+#ifdef LUMA_raw
+#define SST 1
+#define SS 0.49764743714339127
+#define SD vec3(1,1,1)
+#define PST 0
+#define PSS 0.0
+#define PSD vec2(1,1)
+#else
+#define SST 1
+#define SS 0.32091162692066677
+#define SD vec3(1,1,1)
+#define PST 0
+#define PSS 0.0
+#define PSD vec2(1,1)
+#endif
+
+/* Kernels
+ *
+ * SK: spatial kernel
+ * RK: range kernel (takes patch differences)
+ * PSK: intra-patch spatial kernel
+ * WDK: weight discard kernel
+ * WD1TK (WD=1 only): weight discard tolerance kernel
+ *
+ * List of available kernels:
+ *
+ * bicubic
+ * cos
+ * gaussian
+ * lanczos
+ * quadratic_ (unclamped)
+ * sinc
+ * sinc_ (unclamped)
+ * sinc3
+ * sphinx
+ * sphinx_ (unclamped)
+ * triangle_ (unclamped)
+ * triangle
+ */
+#ifdef LUMA_raw
+#define SK gaussian
+#define RK gaussian
+#define PSK gaussian
+#define WDK is_zero
+#define WD1TK gaussian
+#else
+#define SK gaussian
+#define RK gaussian
+#define PSK gaussian
+#define WDK is_zero
+#define WD1TK gaussian
+#endif
+
+/* Sampling method
+ *
+ * In most cases this shouldn't make any difference, only set to bilinear if 
+ * it's necessary to sample between pixels (e.g., RI=2).
+ *
+ * 0: nearest neighbor
+ * 1: bilinear
+ */
+#ifdef LUMA_raw
+#define SAMPLE 0
+#else
+#define SAMPLE 0
+#endif
+
+/* Research scaling factor
+ *
+ * Higher numbers sample more sparsely as the distance from the POI grows.
+ */
+#ifdef LUMA_raw
+#define RSF 0.0
+#else
+#define RSF 0.0
+#endif
+
+// Scaling factor (should match WIDTH/HEIGHT)
+#ifdef LUMA_raw
+#define SF 1
+#else
+#define SF 1
+#endif
+
+// Use the guide image as the input image
+#ifdef LUMA_raw
+#define GUIDE_INPUT 0
+#else
+#define GUIDE_INPUT 0
+#endif
+
+/* Visualization
+ *
+ * 0: off
+ * 1: absolute difference between input/output to the power of 0.25
+ * 2: difference between input/output centered on 0.5
+ * 3: post-WD weight map
+ * 4: pre-WD weight map
+ * 5: unsharp mask
+ * 6: EP
+ */
+#ifdef LUMA_raw
+#define V 0
+#else
+#define V 0
+#endif
+
+// Blur factor (0.0 returns the input image, 1.0 returns the output image)
+#ifdef LUMA_raw
+#define BF 1.0
+#else
+#define BF 1.0
+#endif
+
+// Force disable textureGather
+#ifdef LUMA_raw
+#define NG 0
+#else
+#define NG 0
+#endif
+
+// Patch donut (probably useless)
+#ifdef LUMA_raw
+#define PD 0
+#else
+#define PD 0
+#endif
+
+// Duplicate 1st weight (for luma-guided-chroma)
+#ifdef LUMA_raw
+#define D1W 0
+#else
+#define D1W 0
+#endif
+
+// Skip patch comparison
+#ifdef LUMA_raw
+#define SKIP_PATCH 0
+#else
+#define SKIP_PATCH 0
+#endif
+
+// Shader code
+
+#define EPSILON 1.2e-38
+#define M_PI 3.14159265358979323846
+#define POW2(x) ((x)*(x))
+#define POW3(x) ((x)*(x)*(x))
+#define bicubic_(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0))))
+#define bicubic(x) bicubic_(clamp((x), 0.0, 2.0))
+#define gaussian(x) exp(-1 * POW2(x))
+#define quadratic_(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5))
+#define quadratic(x) quadratic_(clamp((x), 0.0, 1.5))
+#define sinc_(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI))
+#define sinc(x) sinc_(clamp((x), 0.0, 1.0))
+#define sinc3(x) sinc_(clamp((x), 0.0, 3.0))
+#define lanczos(x) (sinc3(x) * sinc(x))
+#define sphinx_(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI))
+#define sphinx(x) sphinx_(clamp((x), 0.0, 1.4302966531242027))
+#define triangle_(x) (1 - (x))
+#define triangle(x) triangle_(clamp((x), 0.0, 1.0))
+#define is_zero(x) int(x == 0)
+
+// XXX could maybe be better optimized on LGC
+#if defined(LUMA_raw)
+#define val float
+#define val_swizz(v) (v.x)
+#define unval(v) vec4(v.x, 0, 0, poi_.a)
+#define val_packed val
+#define val_pack(v) (v)
+#define val_unpack(v) (v)
+#elif defined(CHROMA_raw)
+#define val vec2
+#define val_swizz(v) (v.xy)
+#define unval(v) vec4(v.x, v.y, 0, poi_.a)
+#define val_packed uint
+#define val_pack(v) packUnorm2x16(v)
+#define val_unpack(v) unpackUnorm2x16(v)
+#else
+#define val vec3
+#define val_swizz(v) (v.xyz)
+#define unval(v) vec4(v.x, v.y, v.z, poi_.a)
+#define val_packed val
+#define val_pack(v) (v)
+#define val_unpack(v) (v)
+#endif
+
+#if PS == 6
+const int hp = P/2; 
+#else
+const float hp = int(P/2) - 0.5*(1-(P%2));  // sample between pixels for even patch sizes
+#endif
+
+#if RS == 6
+const int hr = R/2; 
+#else
+const float hr = int(R/2) - 0.5*(1-(R%2));  // sample between pixels for even research sizes
+#endif
+
+// patch/research shapes
+// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R)
+// dots (.) represent samples (pixels) and X represents the pixel-of-interest
+
+// Z    .....
+// Z    .....
+// Z    ..X..
+// Z    .....
+// Z    .....
+#define S_SQUARE(z,hz,incr) for (z.x = -hz;  z.x <= hz;  z.x++) for (z.y = -hz;  z.y <= hz;  incr)
+
+// (in this instance Z=4)
+// Z    ....
+// Z    ....
+// Z    ..X.
+// Z    ....
+#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz;  z.x < hz;  z.x++) for (z.y = -hz;  z.y < hz;  incr)
+
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
+#define S_TRIANGLE(z,hz,incr) for (z.y = -hz;  z.y <= 0;  z.y++) for (z.x = -abs(abs(z.y) - hz);  z.x <= abs(abs(z.y) - hz);  incr)
+
+// Z-4    .
+// Z-2   ...
+// hz+1 ..X
+#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz;  z.y <= 0;  z.y++) for (z.x = -abs(abs(z.y) - hz);  z.x <= abs(abs(z.y) - hz)*int(z.y!=0);  incr)
+#define S_TRIANGLE_A(hz,Z) int(hz*hz+Z)
+
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
+// Z-2   ...
+// Z-4    .
+#define S_DIAMOND(z,hz,incr) for (z.x = -hz;  z.x <= hz;  z.x++) for (z.y = -abs(abs(z.x) - hz);  z.y <= abs(abs(z.x) - hz);  incr)
+#define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z)
+
+//
+// Z    ..X..
+//
+#define S_HORIZONTAL(z,hz,incr) for (z.y = 0;  z.y <= 0;  z.y++) for (z.x = -hz;  z.x <= hz;  incr)
+
+// 90 degree rotation of S_HORIZONTAL
+#define S_VERTICAL(z,hz,incr) for (z.x = 0;  z.x <= 0;  z.x++) for (z.y = -hz;  z.y <= hz;  incr)
+
+// 1      .
+// 1      . 
+// Z    ..X..
+// 1      . 
+// 1      .
+#define S_PLUS(z,hz,incr) for (z.x = -hz;  z.x <= hz;  z.x++) for (z.y = -hz * int(z.x == 0);  z.y <= hz * int(z.x == 0);  incr)
+#define S_PLUS_A(hz,Z) (Z*2 - 1)
+
+// 3    . . .
+// 3     ...
+// Z    ..X..
+// 3     ...
+// 3    . . .
+#define S_PLUS_X(z,hz,incr) for (z.x = -hz;  z.x <= hz;  z.x++) for (z.y = -abs(z.x) + -hz * int(z.x == 0);  z.y <= abs(z.x) + hz * int(z.x == 0);  incr)
+#define S_PLUS_X_A(hz,Z) (Z*4 - 3)
+
+// 1x1 square
+#define S_1X1(z) for (z = vec3(0);  z.x <= 0;  z.x++)
+
+#define T1 (T+1)
+#define FOR_FRAME(r) for (r.z = 0;  r.z < T1;  r.z++)
+
+#ifdef LUMA_raw
+#define RF_ RF_LUMA
+#else
+#define RF_ RF
+#endif
+
+// donut increment, increments without landing on (0,0,0)
+// much faster than a continue statement
+#define DINCR(z,c,a) ((z.c += a),(z.c += int(z == vec3(0))))
+
+#define R_AREA(a) (a * T1 - 1)
+
+// research shapes
+// XXX would be nice to have the option of temporally-varying research sizes
+#if R == 0 || R == 1
+#define FOR_RESEARCH(r) S_1X1(r)
+const int r_area = R_AREA(1); 
+#elif RS == 8
+#define FOR_RESEARCH(r) S_PLUS_X(r,hr,DINCR(r,y,max(1,abs(r.x))))
+const int r_area = R_AREA(S_PLUS_X_A(hr,R)); 
+#elif RS == 7
+#define FOR_RESEARCH(r) S_PLUS(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(S_PLUS_A(hr,R)); 
+#elif RS == 6
+#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(R*R); 
+#elif RS == 5
+#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,DINCR(r,x,1))
+const int r_area = R_AREA(S_TRIANGLE_A(hr,hr+1)); // truncated triangle: hr*hr samples above the POI row plus hr+1 on it
+#elif RS == 4
+#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,DINCR(r,x,1))
+const int r_area = R_AREA(S_TRIANGLE_A(hr,R)); 
+#elif RS == 3
+#define FOR_RESEARCH(r) S_DIAMOND(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(S_DIAMOND_A(hr,R)); 
+#elif RS == 2
+#define FOR_RESEARCH(r) S_VERTICAL(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(R); 
+#elif RS == 1
+#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,DINCR(r,x,1))
+const int r_area = R_AREA(R); 
+#elif RS == 0
+#define FOR_RESEARCH(r) S_SQUARE(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(R*R); 
+#endif
+
+#define RI1 (RI+1)
+#define RFI1 (RFI+1)
+
+#if RI
+#define FOR_ROTATION for (float ri = 0;  ri < 360;  ri+=360.0/RI1)
+#else
+#define FOR_ROTATION
+#endif
+
+#if RFI
+#define FOR_REFLECTION for (int rfi = 0;  rfi < RFI1;  rfi++)
+#else
+#define FOR_REFLECTION
+#endif
+
+#if PD
+#define PINCR DINCR
+#else
+#define PINCR(z,c,a) (z.c += a)
+#endif
+
+#define P_AREA(a) (a - PD)
+
+// patch shapes
+#if P == 0 || P == 1
+#define FOR_PATCH(p) S_1X1(p)
+const int p_area = P_AREA(1); 
+#elif PS == 8
+#define FOR_PATCH(p) S_PLUS_X(p,hp,PINCR(p,y,max(1,abs(p.x))))
+const int p_area = P_AREA(S_PLUS_X_A(hp,P)); 
+#elif PS == 7
+#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(S_PLUS_A(hp,P)); 
+#elif PS == 6
+#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(P*P); 
+#elif PS == 5
+#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x,1))
+const int p_area = P_AREA(S_TRIANGLE_A(hp,hp+1)); // truncated triangle: hp*hp samples above the POI row plus hp+1 on it
+#elif PS == 4
+#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x,1))
+const int p_area = P_AREA(S_TRIANGLE_A(hp,P)); 
+#elif PS == 3
+#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(S_DIAMOND_A(hp,P)); 
+#elif PS == 2
+#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(P); 
+#elif PS == 1
+#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x,1))
+const int p_area = P_AREA(P); 
+#elif PS == 0
+#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(P*P); 
+#endif
+
+const float r_scale = 1.0/r_area; 
+const float p_scale = 1.0/p_area; 
+
+#if SAMPLE == 0
+#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * (vec2(off) + 0.5 - fract((pos) * (size))))
+#else
+#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * vec2(off))
+#endif
+
+#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off)
+
+#if RF_ && defined(LUMA_raw)
+#define load2_(off) sample(RF_LUMA_tex, RF_LUMA_pos, RF_LUMA_size, RF_LUMA_pt, off)
+#define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr)))
+#define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0)
+#elif RF_ && D1W
+#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
+#define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr)))
+#define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0)
+#elif RF_
+#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
+#else
+#define load2_(off) load_(off)
+#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr)))
+#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0)
+#endif
+
+#if T
+val load(vec3 off)
+{
+	 switch (min(int(off.z), frame)) {
+	 case 0: return val_swizz(load_(off)); 
+
+	 }
+}
+val load2(vec3 off)
+{
+	 return off.z == 0 ? val_swizz(load2_(off)) : load(off); 
+}
+#else
+#define load(off) val_swizz(load_(off))
+#define load2(off) val_swizz(load2_(off))
+#endif
+
+val poi2 = load2(vec3(0));  // guide pixel-of-interest
+#if GUIDE_INPUT
+#define poi poi2
+#else
+vec4 poi_ = load_(vec3(0)); 
+val poi = val_swizz(poi_);  // pixel-of-interest
+#endif
+
+#if RI // rotation
+vec2 rot(vec2 p, float d) // rotate offset p by d degrees about the origin (standard 2D rotation matrix)
+{
+	 return vec2(
+	 	 p.x * cos(radians(d)) - p.y * sin(radians(d)),
+	 	 p.x * sin(radians(d)) + p.y * cos(radians(d)) // y' = x*sin + y*cos; the x/y operands were previously swapped
+	 ); 
+}
+#else
+#define rot(p, d) (p)
+#endif
+
+#if RFI // reflection
+vec2 ref(vec2 p, int d) // reflect offset p according to mode d
+{
+	 switch (d) {
+	 case 0: return p;  // identity
+	 case 1: return p * vec2(1, -1);  // negate y
+	 case 2: return p * vec2(-1, 1);  // negate x
+	 } // NOTE(review): no default case; presumably d stays within [0, RFI] with RFI <= 2 - confirm
+}
+#else
+#define ref(p, d) (p)
+#endif
+
+#if SST && R >= SST
+float spatial_r(vec3 v) // spatial weight for research offset v
+{
+	 v.xy += 0.5 - fract(HOOKED_pos*HOOKED_size);  // add the sub-texel offset from HOOKED_pos to the texel center
+	 return SK(length(v*SD)*SS);  // kernel SK of the SD-scaled length of v, multiplied by SS
+}
+#else
+#define spatial_r(v) (1)
+#endif
+
+#if PST && P >= PST
+#define spatial_p(v) PSK(length(v*PSD)*PSS)
+#else
+#define spatial_p(v) (1)
+#endif
+
+val range(val pdiff_sq) // maps a patch difference to a range-kernel (RK) weight, per channel
+{
+	 const float h = max(S, 0.0) * 0.013;  // negative S is clamped to 0
+	 const float pdiff_scale = 1.0/(h*h);  // NOTE(review): h is 0 when S <= 0, making this a division by zero
+	 pdiff_sq = sqrt(pdiff_sq * pdiff_scale);  // for h > 0 this is sqrt(pdiff_sq)/h, so the value is no longer squared
+#if defined(LUMA_raw)
+	 return RK(pdiff_sq);  // val is float here
+#elif defined(CHROMA_raw)
+	 return vec2(RK(pdiff_sq.x), RK(pdiff_sq.y));  // RK applied to each component separately
+#else
+	 return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z)); 
+#endif
+}
 
-vec4 hook()
+val patch_comparison(vec3 r, vec3 r2)
 {
-	 return HOOKED_texOff(0); 
+	 vec3 p; 
+	 val min_rot = val(p_area); 
+
+	 FOR_ROTATION FOR_REFLECTION {
+	 	 val pdiff_sq = val(0); 
+	 	 FOR_PATCH(p) {
+	 	 	 vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z); 
+	 	 	 val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF); 
+	 	 	 diff_sq *= diff_sq; 
+	 	 	 diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy); 
+	 	 	 pdiff_sq += diff_sq; 
+	 	 }
+	 	 min_rot = min(min_rot, pdiff_sq); 
+	 }
+
+	 return min_rot * p_scale; 
 }
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!BIND _INJ_PREI
-//!WIDTH HOOKED.w
-//!HEIGHT HOOKED.h
-//!DESC Guided filter (I)
-//!SAVE _INJ_I
+#define NO_GATHER (PD == 0 && NG == 0 && SAMPLE == 0) // never textureGather if any of these conditions are false
+#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3 || RI == 7)
 
-vec4 hook()
+#if (defined(LUMA_gather) || D1W) && ((PS == 0 || ((PS == 3 || PS == 7) && RI != 7) || PS == 8) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER
+// 3x3 diamond/plus patch_comparison_gather
+// XXX extend to support arbitrary sizes (probably requires code generation)
+// XXX support PSS
+const ivec2 offsets_adj[4] = { ivec2(0,-1), ivec2(1,0), ivec2(0,1), ivec2(-1,0) }; 
+const ivec2 offsets_adj_sf[4] = { ivec2(0,-1) * SF, ivec2(1,0) * SF, ivec2(0,1) * SF, ivec2(-1,0) * SF }; 
+vec4 poi_patch_adj = gather_offs(0, offsets_adj); 
+#if PS == 0 || PS == 8
+const ivec2 offsets_diag[4] = { ivec2(-1,-1), ivec2(1,-1), ivec2(1,1), ivec2(-1,1) }; 
+const ivec2 offsets_diag_sf[4] = { ivec2(-1,-1) * SF, ivec2(1,-1) * SF, ivec2(1,1) * SF, ivec2(-1,1) * SF }; 
+vec4 poi_patch_diag = gather_offs(0, offsets_diag); 
+#endif
+float patch_comparison_gather(vec3 r, vec3 r2)
 {
-return _INJ_PREI_texOff(0);
-}
-
+	 float min_rot = p_area - 1; 
+	 vec4 transformer_adj = gather_offs(r, offsets_adj_sf); 
+#if PS == 0 || PS == 8
+	 vec4 transformer_diag = gather_offs(r, offsets_diag_sf); 
+#endif
+	 FOR_ROTATION {
+	 	 FOR_REFLECTION {
+#if RFI
+	 	 	 /* xxy
+	 	 	  * w y
+	 	 	  * wzz
+	 	 	  */
+	 	 	 switch(rfi) {
+	 	 	 case 1:
+	 	 	 	 transformer_adj = transformer_adj.zyxw; 
+#if PS == 0 || PS == 8
+	 	 	 	 transformer_diag = transformer_diag.zyxw; 
+#endif
+	 	 	 	 break; 
+	 	 	 case 2:
+	 	 	 	 transformer_adj = transformer_adj.xwzy; 
+#if PS == 0 || PS == 8
+	 	 	 	 transformer_diag = transformer_diag.xwzy; 
+#endif
+	 	 	 	 break; 
+	 	 	 }
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (P)
-//!BIND HOOKED
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_P
+	 	 	 vec4 diff = poi_patch_adj - transformer_adj; 
+	 	 	 float diff_sq = dot(diff, diff); // sum of squared differences over the 4 adjacent samples
+#if PS == 0 || PS == 8
+	 	 	 diff = poi_patch_diag - transformer_diag; diff_sq += dot(diff, diff); // diagonals are squared on their own, not added to the adjacent differences first
+#endif
+	 	 	 min_rot = min(diff_sq, min_rot); 
 
-vec4 hook()
+// un-reflect
+#if RFI
+	 	 	 switch(rfi) {
+	 	 	 case 1:
+	 	 	 	 transformer_adj = transformer_adj.zyxw; 
+#if PS == 0 || PS == 8
+	 	 	 	 transformer_diag = transformer_diag.zyxw; 
+#endif
+	 	 	 	 break; 
+	 	 	 case 2:
+	 	 	 	 transformer_adj = transformer_adj.xwzy; 
+#if PS == 0 || PS == 8
+	 	 	 	 transformer_diag = transformer_diag.xwzy; 
+#endif
+	 	 	 	 break; 
+	 	 	 }
+#endif
+	 	 } // FOR_REFLECTION
+#if RI == 7
+	 	 transformer_adj = transformer_adj.wxyz; 
+	 	 // swap adjacents for diagonals
+	 	 transformer_adj += transformer_diag; 
+	 	 transformer_diag = transformer_adj - transformer_diag; 
+	 	 transformer_adj -= transformer_diag; 
+#elif RI == 3
+	 	 transformer_adj = transformer_adj.wxyz; 
+#elif RI == 1
+	 	 transformer_adj = transformer_adj.zwxy; 
+#endif
+#if RI == 3 && (PS == 0 || PS == 8)
+	 	 transformer_diag = transformer_diag.wxyz; 
+#elif RI == 1 && (PS == 0 || PS == 8)
+	 	 transformer_diag = transformer_diag.zwxy; 
+#endif
+	 } // FOR_ROTATION
+	 float center_diff = poi2.x - load2(r).x; 
+	 return (center_diff * center_diff + min_rot) * p_scale; 
+}
+#elif (defined(LUMA_gather) || D1W) && PS == 4 && P == 3 && RI == 0 && RFI == 0 && NO_GATHER
+const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,0), ivec2(1,0) }; 
+const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,0) * SF, ivec2(1,0) * SF }; 
+vec4 poi_patch = gather_offs(0, offsets); 
+float patch_comparison_gather(vec3 r, vec3 r2)
 {
-	 return HOOKED_texOff(0); 
+	 vec4 pdiff = poi_patch - gather_offs(r, offsets_sf); 
+	 return dot(pdiff * pdiff, vec4(1)) * p_scale; 
 }
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (MEANI)
-//!BIND _INJ_I
-//!WIDTH _INJ_I.w 1.5 /
-//!HEIGHT _INJ_I.h 1.5 /
-//!SAVE _INJ_MEANI
-
-vec4 hook()
+#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER
+// tiled even square patch_comparison_gather
+// XXX extend to support odd square?
+float patch_comparison_gather(vec3 r, vec3 r2)
 {
-return _INJ_I_texOff(0);
+	 vec2 tile; 
+	 float min_rot = p_area; 
+
+	 /* gather order:
+	  * w z
+	  * x y
+	  */
+	 float pdiff_sq = 0; 
+	 for (tile.x = -hp;  tile.x < hp;  tile.x+=2) for (tile.y = -hp;  tile.y < hp;  tile.y+=2) {
+	 	 vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy); 
+	 	 diff_sq *= diff_sq; 
+	 	 diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)),
+	 	 	                                  spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0))); 
+	 	 pdiff_sq += dot(diff_sq, vec4(1)); 
+	 }
+	 min_rot = min(min_rot, pdiff_sq); 
+
+	 return min_rot * p_scale; 
 }
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (MEANP)
-//!BIND _INJ_P
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_MEANP
+#else
+#define patch_comparison_gather patch_comparison
+#endif
 
 vec4 hook()
 {
-return _INJ_P_texOff(0);
-}
+	 val total_weight = val(0); 
+	 val sum = val(0); 
+	 val result = val(0); 
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (_INJ_I_SQ)
-//!BIND _INJ_I
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_I_SQ
+	 vec3 r = vec3(0); 
+	 vec3 p = vec3(0); 
+	 vec3 me = vec3(0); 
 
-vec4 hook()
-{
-return _INJ_I_texOff(0) * _INJ_I_texOff(0);
-}
+#if T && ME == 1 // temporal & motion estimation
+	 vec3 me_tmp = vec3(0); 
+	 float maxweight = 0; 
+#elif T && ME == 2 // temporal & motion estimation
+	 vec3 me_sum = vec3(0); 
+	 float me_weight = 0; 
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (_INJ_IXP)
-//!BIND _INJ_I
-//!BIND _INJ_P
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_IXP
+#if AS
+	 val total_weight_s = val(0); 
+	 val sum_s = val(0); 
+#endif
 
-vec4 hook()
-{
-return _INJ_I_texOff(0) * _INJ_P_texOff(0);
-}
+#if WD == 2 // weight discard (mean)
+	 int r_index = 0; 
+	 val_packed all_weights[r_area]; 
+	 val_packed all_pixels[r_area]; 
+#elif WD == 1 // weight discard (moving cumulative average)
+	 int r_iter = 1; 
+	 val wd_total_weight = val(0); 
+	 val wd_sum = val(0); 
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (CORRI)
-//!BIND _INJ_I_SQ
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_CORRI
+	 FOR_FRAME(r) {
+	 // XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
+#if T && ME == 1 // temporal & motion estimation max weight
+	 if (r.z > 0) {
+	 	 me += me_tmp * MEF; 
+	 	 me_tmp = vec3(0); 
+	 	 maxweight = 0; 
+	 }
+#elif T && ME == 2 // temporal & motion estimation weighted average
+	 if (r.z > 0) {
+	 	 me += round(me_sum / me_weight * MEF); 
+	 	 me_sum = vec3(0); 
+	 	 me_weight = 0; 
+	 }
+#endif
+	 FOR_RESEARCH(r) {
+	 	 // r coords with appropriate transformations applied
+	 	 vec3 tr = vec3(r.xy + floor(r.xy * RSF), r.z); 
+	 	 float spatial_weight = spatial_r(tr); 
+	 	 tr.xy += me.xy; 
 
-vec4 hook()
-{
-return _INJ_I_SQ_texOff(0);
-}
+	 	 val px = load(tr); 
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (CORRP)
-//!BIND _INJ_IXP
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_CORRP
+#if SKIP_PATCH
+	 	 val weight = val(1); 
+#else
+	 	 val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(tr, vec3(0))) : patch_comparison(tr, vec3(0)); 
+	 	 val weight = range(pdiff_sq); 
+#endif
 
-vec4 hook()
-{
-return _INJ_IXP_texOff(0);
-}
+#if T && ME == 1 // temporal & motion estimation max weight
+	 	 me_tmp = vec3(tr.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x)); 
+	 	 maxweight = max(maxweight, weight.x); 
+#elif T && ME == 2 // temporal & motion estimation weighted average
+	 	 me_sum += vec3(tr.xy,0) * weight.x; 
+	 	 me_weight += weight.x; 
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (A)
-//!BIND _INJ_MEANI
-//!BIND _INJ_MEANP
-//!BIND _INJ_CORRI
-//!BIND _INJ_CORRP
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_A
+#if D1W
+	 	 weight = val(weight.x); 
+#endif
 
-#define E 0.0013
+	 	 weight *= spatial_weight; 
 
-vec4 hook()
-{
-vec4 var = _INJ_CORRI_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANI_texOff(0);
-vec4 cov = _INJ_CORRP_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANP_texOff(0);
-	 return cov / (var + E); 
-}
+#if AS
+	 	 spatial_weight *= int(r.z == 0);  // ignore temporal
+	 	 sum_s += px * spatial_weight; 
+	 	 total_weight_s += spatial_weight; 
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (B)
-//!BIND _INJ_A
-//!BIND _INJ_MEANI
-//!BIND _INJ_MEANP
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_B
+#if WD == 2 // weight discard (mean)
+	 	 all_weights[r_index] = val_pack(weight); 
+	 	 all_pixels[r_index] = val_pack(px); 
+	 	 r_index++; 
+#elif WD == 1 // weight discard (moving cumulative average)
+	 	 val wd_scale = val(1.0/r_iter); 
+	 	 val below_threshold = WDS * abs(min(val(0.0), weight - (total_weight * wd_scale * WDT * WD1TK(sqrt(wd_scale*WDP))))); 
+#if defined(LUMA_raw)
+	 	 val wdkf = WDK(below_threshold); 
+#elif defined(CHROMA_raw)
+	 	 val wdkf = vec2(WDK(below_threshold.x), WDK(below_threshold.y)); 
+#else
+	 	 val wdkf = vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); // third channel reads .z (was a duplicated .y)
+#endif
+	 	 wd_sum += px * weight * wdkf; 
+	 	 wd_total_weight += weight * wdkf; 
+	 	 r_iter++; 
+#endif
 
-vec4 hook()
-{
-return _INJ_MEANP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANI_texOff(0);
-}
+	 	 sum += px * weight; 
+	 	 total_weight += weight; 
+	 } // FOR_RESEARCH
+	 } // FOR_FRAME
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (MEANA)
-//!BIND _INJ_A
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_MEANA
+	 val avg_weight = total_weight * r_scale; 
+	 val old_avg_weight = avg_weight; 
 
-vec4 hook()
-{
-return _INJ_A_texOff(0);
-}
+#if WD == 2 // weight discard (mean)
+	 total_weight = val(0); 
+	 sum = val(0); 
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (MEANB)
-//!BIND _INJ_B
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_MEANB
+	 for (int i = 0;  i < r_area;  i++) {
+	 	 val weight = val_unpack(all_weights[i]); 
+	 	 val px = val_unpack(all_pixels[i]); 
 
-vec4 hook()
-{
-return _INJ_B_texOff(0);
-}
+	 	 val below_threshold = WDS * abs(min(val(0.0), weight - (avg_weight * WDT))); 
+#if defined(LUMA_raw)
+	 	 weight *= WDK(below_threshold); 
+#elif defined(CHROMA_raw)
+	 	 weight *= vec2(WDK(below_threshold.x), WDK(below_threshold.y)); 
+#else
+	 	 weight *= vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); 
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter
-//!BIND HOOKED
-//!BIND _INJ_MEANA
-//!BIND _INJ_MEANB
-//!SAVE RF_LUMA
+	 	 sum += px * weight; 
+	 	 total_weight += weight; 
+	 }
+#elif WD == 1 // weight discard (moving cumulative average)
+	 total_weight = wd_total_weight; 
+	 sum = wd_sum; 
+#endif
+#if WD // weight discard
+	 avg_weight = total_weight * r_scale; 
+#endif
 
-vec4 hook()
-{
-return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0);
+	 total_weight += SW * spatial_r(vec3(0)); 
+	 sum += poi * SW * spatial_r(vec3(0)); 
+	 result = val(sum / total_weight); 
+
+	 // store frames for temporal
+#if T > 1
+
+#endif
+#if T && TRF
+	 imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result)); 
+#elif T
+	 imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2)); 
+#endif
+
+#if AS == 1 // sharpen+denoise
+#define AS_base result
+#elif AS == 2 // sharpen only
+#define AS_base poi
+#endif
+#if AS
+	 val usm = result - sum_s/total_weight_s; 
+	 usm = exp(log(abs(usm))*ASP) * sign(usm);  // avoiding pow() since it's buggy on nvidia
+	 usm *= gaussian(abs((AS_base + usm - 0.5) / 1.5) * ASA); 
+	 usm *= ASF; 
+	 result = AS_base + usm; 
+#endif
+
+#if EP // extremes preserve
+	 float luminance = EP_texOff(0).x; 
+	 // EPSILON is needed since pow(0,0) is undefined
+	 float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP)); 
+	 result = mix(poi, result, ep_weight); 
+#else
+	 float ep_weight = 0; 
+#endif
+
+#if V == 1
+	 result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0); 
+#elif V == 2
+	 result = (poi - result) * 0.5 + 0.5; 
+#elif V == 3 // post-WD weight map
+	 result = avg_weight; 
+#elif V == 4 // pre-WD edge map
+	 result = old_avg_weight; 
+#elif V == 5
+	 result = 0.5 + usm; 
+#elif V == 6
+	 result = val(1 - ep_weight); 
+#endif
+
+// XXX visualize chroma for these
+#if defined(CHROMA_raw) && (V == 3 || V == 4 || V == 6)
+	 return vec4(0.5); 
+#endif
+
+	 return unval(mix(poi, result, BF)); 
 }
 
-// End of source code injected from guided.glsl 
+// End of source code injected from ../LQ/nlmeans.glsl 
 
 //!HOOK LUMA
 //!HOOK CHROMA
@@ -328,25 +1150,11 @@ vec4 hook()
 	return RF_LUMA_texOff(0);
 }
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!BIND LUMA
-//!WIDTH LUMA.w 3 /
-//!HEIGHT LUMA.h 3 /
-//!DESC Non-local means (EP)
-//!SAVE EP
-
-vec4 hook()
-{
-	return LUMA_texOff(0);
-}
-
 //!HOOK LUMA
 //!HOOK CHROMA
 //!BIND HOOKED
 //!BIND RF_LUMA
 //!BIND RF
-//!BIND EP
 //!DESC Non-local means (nlmeans.glsl)
 
 // User variables
@@ -356,49 +1164,37 @@ vec4 hook()
 
 // Denoising factor (level of blur, higher means more blur)
 #ifdef LUMA_raw
-#define S 2.0
+#define S 2.0522687499802097
 #else
-#define S 5.0
+#define S 2.5168955531436197
 #endif
 
 /* Adaptive sharpening
  *
- * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
- * weight map to restrict the sharpening to edges.
- *
- * If you just want to increase/decrease sharpness then you want to change ASF.
+ * Performs an unsharp mask by subtracting the spatial kernel's blur from the 
+ * NLM blur. For sharpen+denoise the sharpening is limited to edge areas and 
+ * denoising is done everywhere else.
  *
  * Use V=4 to visualize which areas are sharpened (black means sharpen).
  *
  * AS:
- * 	- 0 to disable
- * 	- 1 to sharpen+denoise
- * 	- 2 to sharpen only
+ * 	- 0: disable
+ * 	- 1: sharpen+denoise
+ * 	- 2: sharpen only
  * ASF: Higher numbers make a sharper image
- * ASP: Higher numbers use more of the sharp image
- * ASW:
- * 	- 0 to use pre-WD weights
- * 	- 1 to use post-WD weights (ASP should be ~2x to compensate)
- * ASK: Weight kernel:
- * 	- 0 for power. This is the old method.
- * 	- 1 for sigmoid. This is generally recommended.
- * 	- 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image)
- * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map
+ * ASA: Anti-ringing, higher numbers increase strength
+ * ASP: Power, lower numbers increase sharpening on lower frequency detail
  */
 #ifdef LUMA_raw
 #define AS 0
-#define ASF 3.0
-#define ASP 1.0
-#define ASW 0
-#define ASK 1
-#define ASC 0.0
+#define ASF 0.1625
+#define ASA 5.0
+#define ASP 0.5
 #else
 #define AS 0
-#define ASF 3.0
-#define ASP 1.0
-#define ASW 0
-#define ASK 1
-#define ASC 0.0
+#define ASF 0.1625
+#define ASA 5.0
+#define ASP 0.5
 #endif
 
 /* Starting weight
@@ -409,52 +1205,57 @@ vec4 hook()
  * EPSILON should be used instead of zero to avoid divide-by-zero errors.
  */
 #ifdef LUMA_raw
-#define SW 1.0
+#define SW 1.3011446081346498
 #else
-#define SW 0.5
+#define SW 1.2219854377433914
 #endif
 
 /* Weight discard
  *
- * Discard weights that fall below a fraction of the average weight. This culls 
- * the most dissimilar samples from the blur, yielding a much more pleasant 
- * result, especially around edges.
+ * Reduces weights that fall below a fraction of the average weight. This culls 
+ * the most dissimilar samples from the blur, which can yield a better result, 
+ * especially around edges.
  * 
  * WD:
- * 	- 2: True average. Better quality, but slower and requires GLSL 4.0 or later
- * 	- 1: Moving cumulative average. Inaccurate, tends to blur directionally.
+ * 	- 2: Mean. Better quality, but slower and requires GLSL 4.0 or later
+ * 	- 1: Moving cumulative average. Fast but inaccurate, blurs directionally.
  * 	- 0: Disable
  *
  * WDT: Threshold coefficient, higher numbers discard more
  * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes
+ * WDS (not for WDK=is_zero): Higher numbers are more eager to reduce weights
  */
 #ifdef LUMA_raw
 #define WD 2
-#define WDT 0.5
-#define WDP 6.0
+#define WDT 0.11671341022864548
+#define WDP 5.381278367349288
+#define WDS 1.0
 #else
-#define WD 2
-#define WDT 0.75
-#define WDP 6.0
+#define WD 0
+#define WDT 0.002713346103131793
+#define WDP 5.832936323930807
+#define WDS 1.0
 #endif
 
 /* Extremes preserve
  *
- * Reduces denoising around very bright/dark areas.
+ * Reduce denoising in very bright/dark areas.
+ *
+ * Disabled by default now. If you want to reenable this, set EP=3/ in 
+ * Makefile.nlm and rebuild.
  *
  * The downscaling factor of the EP shader stage affects what is considered a 
- * bright/dark area. The default of 3 should be fine, it's not recommended to 
- * change this.
+ * bright/dark area.
  *
  * This is incompatible with RGB. If you have RGB hooks enabled then you will 
  * have to delete the EP shader stage or specify EP=0 through shader_cfg.
  *
  * EP: 1 to enable, 0 to disable
- * DP: EP strength on dark patches, 0 to fully denoise
- * BP: EP strength on bright patches, 0 to fully denoise
+ * DP: EP strength on dark areas, 0 to fully denoise
+ * BP: EP strength on bright areas, 0 to fully denoise
  */
 #ifdef LUMA_raw
-#define EP 1
+#define EP 0
 #define BP 0.75
 #define DP 0.25
 #else
@@ -469,12 +1270,26 @@ vec4 hook()
 /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
 /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
 
+/* textureGather applicable configurations:
+ *
+ * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}
+ * - PS={0,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2}
+ * - PS=6:RI=0:RFI=0
+ *   - Currently the only scalable variant
+ *
+ * Options which always disable textureGather:
+ * 	- NG
+ * 	- SAMPLE
+ * 	- PD
+ *
+ * Running without textureGather may be much slower.
+ */
+
 /* Patch & research sizes
  *
- * Patch size should be an odd number greater than or equal to 3. Higher values 
- * are slower and not always better.
+ * P should be an odd number. Higher values are slower and not always better.
  *
- * Research size be an odd number greater than or equal to 3. Higher values are 
+ * R should be an odd number greater than or equal to 3. Higher values are 
  * generally better, but slower, blurrier, and gives diminishing returns.
  */
 #ifdef LUMA_raw
@@ -492,8 +1307,6 @@ vec4 hook()
  *
  * PS applies applies to patches, RS applies to research zones.
  *
- * Be wary of gather optimizations (see the Regarding Speed comment at the top)
- *
  * 0: square (symmetrical)
  * 1: horizontal line (asymmetric)
  * 2: vertical line (asymmetric)
@@ -502,6 +1315,7 @@ vec4 hook()
  * 5: truncated triangle (asymmetric on two axis, last row halved)
  * 6: even sized square (asymmetric on two axis)
  * 7: plus (symmetrical)
+ * 8: plus X (symmetrical)
  */
 #ifdef LUMA_raw
 #define RS 3
@@ -516,8 +1330,8 @@ vec4 hook()
  * This setting is dependent on code generation from shader_cfg, so this 
  * setting can only be enabled via shader_cfg.
  *
- * Compares the pixel-of-interest against a guide, which could be a downscaled 
- * image or the output of another shader
+ * Computes weights on a guide, which could be a downscaled image or the output 
+ * of another shader, and applies the weights to the original image
  */
 #define RF_LUMA 1
 #define RF 1
@@ -531,11 +1345,14 @@ vec4 hook()
  * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
  * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
  *
+ * Consider setting SAMPLE=1 if setting RI to a setting that would require 
+ * sampling between pixels.
+ *
  * RI: Rotational invariance
  * RFI (0 to 2): Reflectional invariance
  */
 #ifdef LUMA_raw
-#define RI 3
+#define RI 0
 #define RFI 2
 #else
 #define RI 0
@@ -598,14 +1415,14 @@ vec4 hook()
  */
 #ifdef LUMA_raw
 #define SST 1
-#define SS 0.25
+#define SS 0.5296176863733414
 #define SD vec3(1,1,1)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #else
 #define SST 1
-#define SS 0.25
+#define SS 0.26295970436981203
 #define SD vec3(1,1,1)
 #define PST 0
 #define PSS 0.0
@@ -617,6 +1434,8 @@ vec4 hook()
  * SK: spatial kernel
  * RK: range kernel (takes patch differences)
  * PSK: intra-patch spatial kernel
+ * WDK: weight discard kernel
+ * WD1TK (WD=1 only): weight discard tolerance kernel
  *
  * List of available kernels:
  *
@@ -624,18 +1443,51 @@ vec4 hook()
  * cos
  * gaussian
  * lanczos
- * quadratic
+ * quadratic_ (unclamped)
  * sinc
+ * sinc_ (unclamped)
+ * sinc3
  * sphinx
+ * sphinx_ (unclamped)
+ * triangle_ (unclamped)
+ * triangle
  */
 #ifdef LUMA_raw
 #define SK gaussian
 #define RK gaussian
 #define PSK gaussian
+#define WDK is_zero
+#define WD1TK gaussian
 #else
 #define SK gaussian
 #define RK gaussian
 #define PSK gaussian
+#define WDK is_zero
+#define WD1TK gaussian
+#endif
+
+/* Sampling method
+ *
+ * In most cases this shouldn't make any difference, only set to bilinear if 
+ * it's necessary to sample between pixels (e.g., RI=2).
+ *
+ * 0: nearest neighbor
+ * 1: bilinear
+ */
+#ifdef LUMA_raw
+#define SAMPLE 0
+#else
+#define SAMPLE 0
+#endif
+
+/* Research scaling factor
+ *
+ * Higher numbers sample more sparsely as the distance from the POI grows.
+ */
+#ifdef LUMA_raw
+#define RSF 0.0
+#else
+#define RSF 0.0
 #endif
 
 // Scaling factor (should match WIDTH/HEIGHT)
@@ -645,13 +1497,22 @@ vec4 hook()
 #define SF 1
 #endif
 
+// Use the guide image as the input image
+#ifdef LUMA_raw
+#define GUIDE_INPUT 0
+#else
+#define GUIDE_INPUT 0
+#endif
+
 /* Visualization
  *
  * 0: off
  * 1: absolute difference between input/output to the power of 0.25
  * 2: difference between input/output centered on 0.5
- * 3: avg_weight
- * 4: edge map (based on the relevant AS settings)
+ * 3: post-WD weight map
+ * 4: pre-WD weight map
+ * 5: unsharp mask
+ * 6: EP
  */
 #ifdef LUMA_raw
 #define V 0
@@ -696,37 +1557,44 @@ vec4 hook()
 
 // Shader code
 
-#define EPSILON 0.00000000001
+#define EPSILON 1.2e-38
 #define M_PI 3.14159265358979323846
 #define POW2(x) ((x)*(x))
 #define POW3(x) ((x)*(x)*(x))
-#define bicubic(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0))))
+#define bicubic_(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0))))
+#define bicubic(x) bicubic_(clamp((x), 0.0, 2.0))
 #define gaussian(x) exp(-1 * POW2(x))
-#define lanczos(x) POW2(sinc(x))
-#define quadratic(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5))
-#define sinc(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI))
-#define sphinx(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI))
+#define quadratic_(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5))
+#define quadratic(x) quadratic_(clamp((x), 0.0, 1.5))
+#define sinc_(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI))
+#define sinc(x) sinc_(clamp((x), 0.0, 1.0))
+#define sinc3(x) sinc_(clamp((x), 0.0, 3.0))
+#define lanczos(x) (sinc3(x) * sinc(x))
+#define sphinx_(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI))
+#define sphinx(x) sphinx_(clamp((x), 0.0, 1.4302966531242027))
+#define triangle_(x) (1 - (x))
+#define triangle(x) triangle_(clamp((x), 0.0, 1.0))
+#define is_zero(x) int(x == 0)
 
 // XXX could maybe be better optimized on LGC
-// XXX return original alpha component instead of 1.0
 #if defined(LUMA_raw)
 #define val float
 #define val_swizz(v) (v.x)
-#define unval(v) vec4(v.x, 0, 0, 1.0)
+#define unval(v) vec4(v.x, 0, 0, poi_.a)
 #define val_packed val
 #define val_pack(v) (v)
 #define val_unpack(v) (v)
 #elif defined(CHROMA_raw)
 #define val vec2
 #define val_swizz(v) (v.xy)
-#define unval(v) vec4(v.x, v.y, 0, 1.0)
+#define unval(v) vec4(v.x, v.y, 0, poi_.a)
 #define val_packed uint
 #define val_pack(v) packUnorm2x16(v)
 #define val_unpack(v) unpackUnorm2x16(v)
 #else
 #define val vec3
 #define val_swizz(v) (v.xyz)
-#define unval(v) vec4(v.x, v.y, v.z, 1.0)
+#define unval(v) vec4(v.x, v.y, v.z, poi_.a)
 #define val_packed val
 #define val_pack(v) (v)
 #define val_unpack(v) (v)
@@ -744,10 +1612,6 @@ const int hr = R/2;
 const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes
 #endif
 
-// donut increment, increments without landing on (0,0,0)
-// much faster than a continue statement
-#define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0))))
-
 // patch/research shapes
 // each shape is depicted in a comment, where Z=5 (Z corresponds to P or R)
 // dots (.) represent samples (pixels) and X represents the pixel-of-interest
@@ -788,7 +1652,7 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res
 //
 // Z    ..X..
 //
-#define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++)
+#define S_HORIZONTAL(z,hz,incr) for (z.y = 0; z.y <= 0; z.y++) for (z.x = -hz; z.x <= hz; incr)
 
 // 90 degree rotation of S_HORIZONTAL
 #define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
@@ -801,19 +1665,13 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res
 #define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr)
 #define S_PLUS_A(hz,Z) (Z*2 - 1)
 
-// XXX implement S_PLUS w/ an X overlayed:
 // 3    . . .
 // 3     ...
 // Z    ..X..
 // 3     ...
 // 3    . . .
-
-// XXX implement an X shape:
-// 2    .   .
-// 2     . .
-// 1      X  
-// 2     . .
-// 2    .   .
+#define S_PLUS_X(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(z.x) + -hz * int(z.x == 0); z.y <= abs(z.x) + hz * int(z.x == 0); incr)
+#define S_PLUS_X_A(hz,Z) (Z*4 - 3)
 
 // 1x1 square
 #define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
@@ -827,43 +1685,43 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res
 #define RF_ RF
 #endif
 
-// Skip comparing the pixel-of-interest against itself, unless RF is enabled
-#if RF_
-#define RINCR(z,c) (z.c++)
-#else
-#define RINCR DINCR
-#endif
+// donut increment, increments without landing on (0,0,0)
+// much faster than a continue statement
+#define DINCR(z,c,a) ((z.c += a),(z.c += int(z == vec3(0))))
 
-#define R_AREA(a) (a * T1 + RF_-1)
+#define R_AREA(a) (a * T1 - 1)
 
 // research shapes
 // XXX would be nice to have the option of temporally-varying research sizes
 #if R == 0 || R == 1
 #define FOR_RESEARCH(r) S_1X1(r)
 const int r_area = R_AREA(1);
+#elif RS == 8
+#define FOR_RESEARCH(r) S_PLUS_X(r,hr,DINCR(r,y,max(1,abs(r.x))))
+const int r_area = R_AREA(S_PLUS_X_A(hr,R));
 #elif RS == 7
-#define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y))
+#define FOR_RESEARCH(r) S_PLUS(r,hr,DINCR(r,y,1))
 const int r_area = R_AREA(S_PLUS_A(hr,R));
 #elif RS == 6
-#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y))
+#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,DINCR(r,y,1))
 const int r_area = R_AREA(R*R);
 #elif RS == 5
-#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x))
+#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,DINCR(r,x,1))
 const int r_area = R_AREA(S_TRIANGLE_A(hr,hr));
 #elif RS == 4
-#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x))
+#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,DINCR(r,x,1))
 const int r_area = R_AREA(S_TRIANGLE_A(hr,R));
 #elif RS == 3
-#define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y))
+#define FOR_RESEARCH(r) S_DIAMOND(r,hr,DINCR(r,y,1))
 const int r_area = R_AREA(S_DIAMOND_A(hr,R));
 #elif RS == 2
-#define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y))
+#define FOR_RESEARCH(r) S_VERTICAL(r,hr,DINCR(r,y,1))
 const int r_area = R_AREA(R);
 #elif RS == 1
-#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x))
+#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,DINCR(r,x,1))
 const int r_area = R_AREA(R);
 #elif RS == 0
-#define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y))
+#define FOR_RESEARCH(r) S_SQUARE(r,hr,DINCR(r,y,1))
 const int r_area = R_AREA(R*R);
 #endif
 
@@ -885,7 +1743,7 @@ const int r_area = R_AREA(R*R);
 #if PD
 #define PINCR DINCR
 #else
-#define PINCR(z,c) (z.c++)
+#define PINCR(z,c,a) (z.c += a)
 #endif
 
 #define P_AREA(a) (a - PD)
@@ -894,36 +1752,44 @@ const int r_area = R_AREA(R*R);
 #if P == 0 || P == 1
 #define FOR_PATCH(p) S_1X1(p)
 const int p_area = P_AREA(1);
+#elif PS == 8
+#define FOR_PATCH(p) S_PLUS_X(p,hp,PINCR(p,y,max(1,abs(p.x))))
+const int p_area = P_AREA(S_PLUS_X_A(hp,P));
 #elif PS == 7
-#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y))
+#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y,1))
 const int p_area = P_AREA(S_PLUS_A(hp,P));
 #elif PS == 6
-#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y))
+#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y,1))
 const int p_area = P_AREA(P*P);
 #elif PS == 5
-#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x))
+#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x,1))
 const int p_area = P_AREA(S_TRIANGLE_A(hp,hp));
 #elif PS == 4
-#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x))
+#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x,1))
 const int p_area = P_AREA(S_TRIANGLE_A(hp,P));
 #elif PS == 3
-#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y))
+#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y,1))
 const int p_area = P_AREA(S_DIAMOND_A(hp,P));
 #elif PS == 2
-#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y))
+#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y,1))
 const int p_area = P_AREA(P);
 #elif PS == 1
-#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x))
+#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x,1))
 const int p_area = P_AREA(P);
 #elif PS == 0
-#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y))
+#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y,1))
 const int p_area = P_AREA(P*P);
 #endif
 
 const float r_scale = 1.0/r_area;
 const float p_scale = 1.0/p_area;
 
-#define sample(tex, pos, size, pt, off) tex(pos + pt * (vec2(off) + 0.5 - fract(pos*size)))
+#if SAMPLE == 0
+#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * (vec2(off) + 0.5 - fract((pos) * (size))))
+#else
+#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * vec2(off))
+#endif
+
 #define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off)
 
 #if RF_ && defined(LUMA_raw)
@@ -959,8 +1825,13 @@ val load2(vec3 off)
 #define load2(off) val_swizz(load2_(off))
 #endif
 
-val poi = load(vec3(0)); // pixel-of-interest
 val poi2 = load2(vec3(0)); // guide pixel-of-interest
+#if GUIDE_INPUT
+#define poi poi2
+#else
+vec4 poi_ = load_(vec3(0));
+val poi = val_swizz(poi_); // pixel-of-interest
+#endif
 
 #if RI // rotation
 vec2 rot(vec2 p, float d)
@@ -1005,7 +1876,7 @@ float spatial_r(vec3 v)
 
 val range(val pdiff_sq)
 {
-	const float h = S*0.013;
+	const float h = max(S, 0.0) * 0.013;
 	const float pdiff_scale = 1.0/(h*h);
 	pdiff_sq = sqrt(pdiff_sq * pdiff_scale);
 #if defined(LUMA_raw)
@@ -1015,10 +1886,6 @@ val range(val pdiff_sq)
 #else
 	return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z));
 #endif
-	//return exp(-pdiff_sq * pdiff_scale);
-
-	// weight function from the NLM paper, it's not very good
-	//return exp(-max(pdiff_sq - 2*S*S, 0.0) * pdiff_scale);
 }
 
 val patch_comparison(vec3 r, vec3 r2)
@@ -1041,42 +1908,104 @@ val patch_comparison(vec3 r, vec3 r2)
 	return min_rot * p_scale;
 }
 
-#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
-#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
+#define NO_GATHER (PD == 0 && NG == 0 && SAMPLE == 0) // never textureGather if any of these conditions are false
+#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3 || RI == 7)
 
-#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER
+#if (defined(LUMA_gather) || D1W) && ((PS == 0 || ((PS == 3 || PS == 7) && RI != 7) || PS == 8) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER
 // 3x3 diamond/plus patch_comparison_gather
 // XXX extend to support arbitrary sizes (probably requires code generation)
-// XXX extend to support 3x3 square
 // XXX support PSS
-const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
-const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
-vec4 poi_patch = gather_offs(0, offsets);
+const ivec2 offsets_adj[4] = { ivec2(0,-1), ivec2(1,0), ivec2(0,1), ivec2(-1,0) };
+const ivec2 offsets_adj_sf[4] = { ivec2(0,-1) * SF, ivec2(1,0) * SF, ivec2(0,1) * SF, ivec2(-1,0) * SF };
+vec4 poi_patch_adj = gather_offs(0, offsets_adj);
+#if PS == 0 || PS == 8
+const ivec2 offsets_diag[4] = { ivec2(-1,-1), ivec2(1,-1), ivec2(1,1), ivec2(-1,1) };
+const ivec2 offsets_diag_sf[4] = { ivec2(-1,-1) * SF, ivec2(1,-1) * SF, ivec2(1,1) * SF, ivec2(-1,1) * SF };
+vec4 poi_patch_diag = gather_offs(0, offsets_diag);
+#endif
 float patch_comparison_gather(vec3 r, vec3 r2)
 {
 	float min_rot = p_area - 1;
-	vec4 transformer = gather_offs(r, offsets_sf);
+	vec4 transformer_adj = gather_offs(r, offsets_adj_sf);
+#if PS == 0 || PS == 8
+	vec4 transformer_diag = gather_offs(r, offsets_diag_sf);
+#endif
 	FOR_ROTATION {
 		FOR_REFLECTION {
-			float diff_sq = dot((poi_patch - transformer) * (poi_patch - transformer), vec4(1));
+#if RFI
+			/* xxy
+			 * w y
+			 * wzz
+			 */
+			switch(rfi) {
+			case 1:
+				transformer_adj = transformer_adj.zyxw;
+#if PS == 0 || PS == 8
+				transformer_diag = transformer_diag.zyxw;
+#endif
+				break;
+			case 2:
+				transformer_adj = transformer_adj.xwzy;
+#if PS == 0 || PS == 8
+				transformer_diag = transformer_diag.xwzy;
+#endif
+				break;
+			}
+#endif
+
+			vec4 sq_diffs = (poi_patch_adj - transformer_adj) * (poi_patch_adj - transformer_adj); // square per sample first, so errors of opposite sign cannot cancel
+#if PS == 0 || PS == 8
+			sq_diffs += (poi_patch_diag - transformer_diag) * (poi_patch_diag - transformer_diag); // diagonal samples, squared individually as well
+#endif
+			float diff_sq = dot(sq_diffs, vec4(1)); // sum of squared differences over every gathered sample
 			min_rot = min(diff_sq, min_rot);
+
+// un-reflect
 #if RFI
 			switch(rfi) {
-			case 0: transformer = transformer.zyxw; break;
-			case 1: transformer = transformer.zwxy; break; // undoes last mirror, performs another mirror
-			case 2: transformer = transformer.zyxw; break; // undoes last mirror
+			case 1:
+				transformer_adj = transformer_adj.zyxw;
+#if PS == 0 || PS == 8
+				transformer_diag = transformer_diag.zyxw;
+#endif
+				break;
+			case 2:
+				transformer_adj = transformer_adj.xwzy;
+#if PS == 0 || PS == 8
+				transformer_diag = transformer_diag.xwzy;
+#endif
+				break;
 			}
 #endif
-		}
-#if RI == 3
-		transformer = transformer.wxyz;
+		} // FOR_REFLECTION
+#if RI == 7
+		// rotate the adjacents one slot, then swap adjacents for diagonals
+		vec4 transformer_rot = transformer_adj.wxyz;
+		transformer_adj = transformer_diag;
+		transformer_diag = transformer_rot;
+		// (swapped through a temporary: an add/subtract swap is inexact in floating point)
+#elif RI == 3
+		transformer_adj = transformer_adj.wxyz;
 #elif RI == 1
-		transformer = transformer.zwxy;
+		transformer_adj = transformer_adj.zwxy;
 #endif
-	}
-	float center_diff_sq = poi2.x - load2(r).x;
-	center_diff_sq *= center_diff_sq;
-	return (min_rot + center_diff_sq) * p_scale;
+#if RI == 3 && (PS == 0 || PS == 8)
+		transformer_diag = transformer_diag.wxyz;
+#elif RI == 1 && (PS == 0 || PS == 8)
+		transformer_diag = transformer_diag.zwxy;
+#endif
+	} // FOR_ROTATION
+	float center_diff = poi2.x - load2(r).x;
+	return (center_diff * center_diff + min_rot) * p_scale;
+}
+#elif (defined(LUMA_gather) || D1W) && PS == 4 && P == 3 && RI == 0 && RFI == 0 && NO_GATHER
+const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,0), ivec2(1,0) };
+const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,0) * SF, ivec2(1,0) * SF };
+vec4 poi_patch = gather_offs(0, offsets);
+float patch_comparison_gather(vec3 r, vec3 r2)
+{
+	vec4 pdiff = poi_patch - gather_offs(r, offsets_sf);
+	return dot(pdiff * pdiff, vec4(1)) * p_scale;
 }
 #elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER
 // tiled even square patch_comparison_gather
@@ -1124,18 +2053,23 @@ vec4 hook()
 	float me_weight = 0;
 #endif
 
-#if WD == 2 // weight discard
+#if AS
+	val total_weight_s = val(0);
+	val sum_s = val(0);
+#endif
+
+#if WD == 2 // weight discard (mean)
 	int r_index = 0;
 	val_packed all_weights[r_area];
 	val_packed all_pixels[r_area];
-#elif WD == 1 // weight discard
-	val no_weights = val(0);
-	val discard_total_weight = val(0);
-	val discard_sum = val(0);
+#elif WD == 1 // weight discard (moving cumulative average)
+	int r_iter = 1;
+	val wd_total_weight = val(0);
+	val wd_sum = val(0);
 #endif
 
 	FOR_FRAME(r) {
-	// XXX ME is always a frame behind, should have to option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
+	// XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
 #if T && ME == 1 // temporal & motion estimation max weight
 	if (r.z > 0) {
 		me += me_tmp * MEF;
@@ -1149,19 +2083,26 @@ vec4 hook()
 		me_weight = 0;
 	}
 #endif
-	FOR_RESEARCH(r) { // main NLM logic
+	FOR_RESEARCH(r) {
+		// r coords with appropriate transformations applied
+		vec3 tr = vec3(r.xy + floor(r.xy * RSF), r.z);
+		float spatial_weight = spatial_r(tr);
+		tr.xy += me.xy;
+
+		val px = load(tr);
+
 #if SKIP_PATCH
 		val weight = val(1);
 #else
-		val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(r+me, vec3(0))) : patch_comparison(r+me, vec3(0));
+		val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(tr, vec3(0))) : patch_comparison(tr, vec3(0));
 		val weight = range(pdiff_sq);
 #endif
 
 #if T && ME == 1 // temporal & motion estimation max weight
-		me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x));
+		me_tmp = vec3(tr.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x));
 		maxweight = max(maxweight, weight.x);
 #elif T && ME == 2 // temporal & motion estimation weighted average
-		me_sum += vec3(r.xy,0) * weight.x;
+		me_sum += vec3(tr.xy,0) * weight.x;
 		me_weight += weight.x;
 #endif
 
@@ -1169,21 +2110,34 @@ vec4 hook()
 		weight = val(weight.x);
 #endif
 
-		weight *= spatial_r(r);
+		weight *= spatial_weight;
 
-#if WD == 2 // weight discard
+#if AS
+		spatial_weight *= int(r.z == 0); // ignore temporal
+		sum_s += px * spatial_weight;
+		total_weight_s += spatial_weight;
+#endif
+
+#if WD == 2 // weight discard (mean)
 		all_weights[r_index] = val_pack(weight);
-		all_pixels[r_index] = val_pack(load(r+me));
+		all_pixels[r_index] = val_pack(px);
 		r_index++;
-#elif WD == 1 // weight discard
-		val wd_scale = 1.0/max(no_weights, 1);
-		val keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
-		discard_sum += load(r+me) * weight * (1 - keeps);
-		discard_total_weight += weight * (1 - keeps);
-		no_weights += keeps;
+#elif WD == 1 // weight discard (moving cumulative average)
+		val wd_scale = val(1.0/r_iter);
+		val below_threshold = WDS * abs(min(val(0.0), weight - (total_weight * wd_scale * WDT * WD1TK(sqrt(wd_scale*WDP)))));
+#if defined(LUMA_raw)
+		val wdkf = WDK(below_threshold); // kernel factor applied to this sample's weight below
+#elif defined(CHROMA_raw)
+		val wdkf = vec2(WDK(below_threshold.x), WDK(below_threshold.y));
+#else
+		val wdkf = vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); // one factor per channel (third channel reads .z, matching the WD == 2 path)
+#endif
+		wd_sum += px * weight * wdkf;
+		wd_total_weight += weight * wdkf;
+		r_iter++;
 #endif
 
-		sum += load(r+me) * weight;
+		sum += px * weight;
 		total_weight += weight;
 	} // FOR_RESEARCH
 	} // FOR_FRAME
@@ -1191,37 +2145,37 @@ vec4 hook()
 	val avg_weight = total_weight * r_scale;
 	val old_avg_weight = avg_weight;
 
-#if WD == 2 // true average
+#if WD == 2 // weight discard (mean)
 	total_weight = val(0);
 	sum = val(0);
-	val no_weights = val(0);
 
 	for (int i = 0; i < r_area; i++) {
-		val w = val_unpack(all_weights[i]);
+		val weight = val_unpack(all_weights[i]);
 		val px = val_unpack(all_pixels[i]);
-		val keeps = step(avg_weight*WDT, w);
 
-		w *= keeps;
-		sum += px * w;
-		total_weight += w;
-		no_weights += keeps;
+		val below_threshold = WDS * abs(min(val(0.0), weight - (avg_weight * WDT)));
+#if defined(LUMA_raw)
+		weight *= WDK(below_threshold);
+#elif defined(CHROMA_raw)
+		weight *= vec2(WDK(below_threshold.x), WDK(below_threshold.y));
+#else
+		weight *= vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z));
+#endif
+
+		sum += px * weight;
+		total_weight += weight;
 	}
-#elif WD == 1 // moving cumulative average
-	total_weight -= discard_total_weight;
-	sum -= discard_sum;
+#elif WD == 1 // weight discard (moving cumulative average)
+	total_weight = wd_total_weight;
+	sum = wd_sum;
 #endif
 #if WD // weight discard
-	avg_weight = total_weight / no_weights;
+	avg_weight = total_weight * r_scale;
 #endif
 
 	total_weight += SW * spatial_r(vec3(0));
 	sum += poi * SW * spatial_r(vec3(0));
-
-#if V == 3 // weight map
-	result = val(avg_weight);
-#else // mean
 	result = val(sum / total_weight);
-#endif
 
 	// store frames for temporal
 #if T > 1
@@ -1233,27 +2187,17 @@ vec4 hook()
 	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2));
 #endif
 
-#if ASW == 0 // pre-WD weights
-#define AS_weight old_avg_weight
-#elif ASW == 1 // post-WD weights
-#define AS_weight avg_weight
-#endif
-
-#if ASK == 0
-	val sharpening_strength = pow(AS_weight, val(ASP));
-#elif ASK == 1
-	val sharpening_strength = mix(
-			pow(smoothstep(0.0, 1.0, AS_weight), val(ASP)),
-			AS_weight, ASC);
-	// XXX normalize the result to account for a negative ASC?
-#elif ASK == 2
-	val sharpening_strength = val(ASP);
-#endif
-
 #if AS == 1 // sharpen+denoise
-	val sharpened = result + (poi - result) * ASF;
+#define AS_base result
 #elif AS == 2 // sharpen only
-	val sharpened = poi + (poi - result) * ASF;
+#define AS_base poi
+#endif
+#if AS
+	val usm = result - sum_s/total_weight_s;
+	usm = exp(log(abs(usm))*ASP) * sign(usm); // avoiding pow() since it's buggy on nvidia
+	usm *= gaussian(abs((AS_base + usm - 0.5) / 1.5) * ASA);
+	usm *= ASF;
+	result = AS_base + usm;
 #endif
 
 #if EP // extremes preserve
@@ -1261,26 +2205,27 @@ vec4 hook()
 	// EPSILON is needed since pow(0,0) is undefined
 	float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP));
 	result = mix(poi, result, ep_weight);
-#endif
-
-#if AS == 1 // sharpen+denoise
-	result = mix(sharpened, result, sharpening_strength);
-#elif AS == 2 // sharpen only
-	result = mix(sharpened, poi, sharpening_strength);
-#endif
-
-#if V == 4 // edge map
-	result = sharpening_strength;
-#endif
-
-#if (V == 3 || V == 4) && defined(CHROMA_raw) // drop chroma for these visualizations
-	return vec4(0.5);
+#else
+	float ep_weight = 0;
 #endif
 
 #if V == 1
 	result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0);
 #elif V == 2
 	result = (poi - result) * 0.5 + 0.5;
+#elif V == 3 // post-WD weight map
+	result = avg_weight;
+#elif V == 4 // pre-WD edge map
+	result = old_avg_weight;
+#elif V == 5
+	result = 0.5 + usm;
+#elif V == 6
+	result = val(1 - ep_weight);
+#endif
+
+// XXX visualize chroma for these
+#if defined(CHROMA_raw) && (V == 3 || V == 4 || V == 6)
+	return vec4(0.5);
 #endif
 
 	return unval(mix(poi, result, BF));
diff --git a/portable_config/shaders/nlmeans_2x.glsl b/portable_config/shaders/nlmeans_2x.glsl
deleted file mode 100644
index 737f245e..00000000
--- a/portable_config/shaders/nlmeans_2x.glsl
+++ /dev/null
@@ -1,1247 +0,0 @@
-/* vi: ft=c
- *
- * Based on vf_nlmeans.c from FFmpeg.
- *
- * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
- * Copyright (c) 2016 Clément Bœsch <u pkh me>
- *
- * This program is free software: you can redistribute it and/or modify it 
- * under the terms of the GNU Lesser General Public License as published by 
- * the Free Software Foundation, either version 2.1 of the License, or (at 
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT 
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
- * for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License 
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
- */
-
-// Description: nlmeans_2x.glsl: Experimental upscaler
-
-/* The recommended usage of this shader and its variant profiles is to add them 
- * to input.conf and then dispatch the appropriate shader via a keybind during 
- * media playback. Here is an example input.conf entry:
- *
- * F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl"; show-text "Non-local means (LUMA only)"
- *
- * These shaders can also be enabled by default in mpv.conf, for example:
- *
- * glsl-shaders='~~/shaders/nlmeans.glsl'
- *
- * Both of the examples above assume the shaders are located in a subdirectory 
- * named "shaders" within mpv's config directory. Refer to the mpv 
- * documentation for more details.
- *
- * This shader is highly configurable via user variables below. Although the 
- * default settings should offer good quality at a reasonable speed, you are 
- * encouraged to tweak them to your preferences. Be mindful that certain 
- * settings may greatly affect speed.
- *
- * Denoising is most useful for noisy content. If there is no perceptible 
- * noise, you probably won't see a positive difference.
- *
- * The default settings are generally tuned for low noise and high detail 
- * preservation. The "medium" and "heavy" profiles are tuned for higher levels 
- * of noise.
- *
- * The denoiser will not work properly if the content has been upscaled 
- * beforehand (whether it was done by you or not). In such cases, consider 
- * issuing a command to downscale in the mpv console (backtick ` key):
- *
- * vf toggle scale=-2:720
- *
- * ...replacing 720 with whatever resolution seems appropriate. Rerun the 
- * command to undo the downscale. It may take some trial-and-error to find the 
- * proper resolution.
- */
-
-/* Regarding speed
- *
- * Speed may vary wildly for different vo and gpu-api settings. Generally 
- * vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this 
- * may be different for your system.
- *
- * If your GPU doesn't support textureGather, or if you are on a version of mpv 
- * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ profile
- *
- * If you plan on tinkering with NLM's settings, read below:
- *
- * textureGather only applies to luma and limited to the these configurations:
- *
- * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}
- *   - Default, very fast, rotations and reflections should be free
- *   - If this is unusually slow then try changing gpu-api and vo
- *   - If it's still slow, try setting RI/RFI to 0.
- *
- * - PS=6:RI={0,1,3}:RFI={0,1,2}
- *   - Currently the only scalable variant
- *   - Patch shape is asymmetric on two axis
- *   - Rotations should have very little speed impact
- *   - Reflections may have a significant speed impact
- *
- * Options which always disable textureGather:
- * 	- PD
- * 	- NG
- */
-
-// The following is shader code injected from guided.glsl
-/* vi: ft=c
- *
- * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
- *
- * This program is free software: you can redistribute it and/or modify it 
- * under the terms of the GNU Lesser General Public License as published by 
- * the Free Software Foundation, either version 2.1 of the License, or (at 
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT 
- * ANY WARRANTY;  without even the implied warranty of MERCHANTABILITY or 
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
- * for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License 
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
- */
-
-// Description: guided.glsl: Guided by the downscaled image
-
-/* The radius can be adjusted with the MEANI stage's downscaling factor. 
- * Higher numbers give a bigger radius.
- *
- * The E variable can be found in the A stage.
- *
- * The subsampling (fast guided filter) can be adjusted with the I stage's 
- * downscaling factor. Higher numbers are faster.
- *
- * The guide's subsampling can be adjusted with the PREI stage's downscaling 
- * factor. Higher numbers downscale more.
- */
-
-//!HOOK LUMA
-//!BIND HOOKED
-//!WIDTH HOOKED.w 1.25 /
-//!HEIGHT HOOKED.h 1.25 /
-//!DESC Guided filter (PREI)
-//!SAVE _INJ_PREI
-
-vec4 hook()
-{
-	 return HOOKED_texOff(0); 
-}
-
-//!HOOK LUMA
-//!BIND _INJ_PREI
-//!WIDTH HOOKED.w
-//!HEIGHT HOOKED.h
-//!DESC Guided filter (I)
-//!SAVE _INJ_I
-
-vec4 hook()
-{
-return _INJ_PREI_texOff(0);
-}
-
-
-//!HOOK LUMA
-//!DESC Guided filter (P)
-//!BIND HOOKED
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_P
-
-vec4 hook()
-{
-	 return HOOKED_texOff(0); 
-}
-
-//!HOOK LUMA
-//!DESC Guided filter (MEANI)
-//!BIND _INJ_I
-//!WIDTH _INJ_I.w 1.5 /
-//!HEIGHT _INJ_I.h 1.5 /
-//!SAVE _INJ_MEANI
-
-vec4 hook()
-{
-return _INJ_I_texOff(0);
-}
-
-//!HOOK LUMA
-//!DESC Guided filter (MEANP)
-//!BIND _INJ_P
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_MEANP
-
-vec4 hook()
-{
-return _INJ_P_texOff(0);
-}
-
-//!HOOK LUMA
-//!DESC Guided filter (_INJ_I_SQ)
-//!BIND _INJ_I
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_I_SQ
-
-vec4 hook()
-{
-return _INJ_I_texOff(0) * _INJ_I_texOff(0);
-}
-
-//!HOOK LUMA
-//!DESC Guided filter (_INJ_IXP)
-//!BIND _INJ_I
-//!BIND _INJ_P
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_IXP
-
-vec4 hook()
-{
-return _INJ_I_texOff(0) * _INJ_P_texOff(0);
-}
-
-//!HOOK LUMA
-//!DESC Guided filter (CORRI)
-//!BIND _INJ_I_SQ
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_CORRI
-
-vec4 hook()
-{
-return _INJ_I_SQ_texOff(0);
-}
-
-//!HOOK LUMA
-//!DESC Guided filter (CORRP)
-//!BIND _INJ_IXP
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_CORRP
-
-vec4 hook()
-{
-return _INJ_IXP_texOff(0);
-}
-
-//!HOOK LUMA
-//!DESC Guided filter (A)
-//!BIND _INJ_MEANI
-//!BIND _INJ_MEANP
-//!BIND _INJ_CORRI
-//!BIND _INJ_CORRP
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_A
-
-#define E 0.0013
-
-vec4 hook()
-{
-vec4 var = _INJ_CORRI_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANI_texOff(0);
-vec4 cov = _INJ_CORRP_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANP_texOff(0);
-	 return cov / (var + E); 
-}
-
-//!HOOK LUMA
-//!DESC Guided filter (B)
-//!BIND _INJ_A
-//!BIND _INJ_MEANI
-//!BIND _INJ_MEANP
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_B
-
-vec4 hook()
-{
-return _INJ_MEANP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANI_texOff(0);
-}
-
-//!HOOK LUMA
-//!DESC Guided filter (MEANA)
-//!BIND _INJ_A
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_MEANA
-
-vec4 hook()
-{
-return _INJ_A_texOff(0);
-}
-
-//!HOOK LUMA
-//!DESC Guided filter (MEANB)
-//!BIND _INJ_B
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_MEANB
-
-vec4 hook()
-{
-return _INJ_B_texOff(0);
-}
-
-//!HOOK LUMA
-//!DESC Guided filter
-//!BIND HOOKED
-//!BIND _INJ_MEANA
-//!BIND _INJ_MEANB
-//!SAVE RF_LUMA
-
-vec4 hook()
-{
-return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0);
-}
-
-// End of source code injected from guided.glsl 
-
-//!HOOK LUMA
-//!BIND HOOKED
-//!BIND RF_LUMA
-//!DESC Non-local means (nlmeans_2x.glsl)
-//!WIDTH HOOKED.w 2 *
-//!HEIGHT HOOKED.h 2 *
-
-// User variables
-
-// It is generally preferable to denoise luma and chroma differently, so the 
-// user variables for luma and chroma are split.
-
-// Denoising factor (level of blur, higher means more blur)
-#ifdef LUMA_raw
-#define S 12.8125
-#else
-#define S 12.8125
-#endif
-
-/* Adaptive sharpening
- *
- * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
- * weight map to restrict the sharpening to edges.
- *
- * If you just want to increase/decrease sharpness then you want to change ASF.
- *
- * Use V=4 to visualize which areas are sharpened (black means sharpen).
- *
- * AS:
- * 	- 0 to disable
- * 	- 1 to sharpen+denoise
- * 	- 2 to sharpen only
- * ASF: Higher numbers make a sharper image
- * ASP: Higher numbers use more of the sharp image
- * ASW:
- * 	- 0 to use pre-WD weights
- * 	- 1 to use post-WD weights (ASP should be ~2x to compensate)
- * ASK: Weight kernel:
- * 	- 0 for power. This is the old method.
- * 	- 1 for sigmoid. This is generally recommended.
- * 	- 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image)
- * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map
- */
-#ifdef LUMA_raw
-#define AS 0
-#define ASF 3.0
-#define ASP 1.0
-#define ASW 0
-#define ASK 1
-#define ASC 0.0
-#else
-#define AS 0
-#define ASF 3.0
-#define ASP 1.0
-#define ASW 0
-#define ASK 1
-#define ASC 0.0
-#endif
-
-/* Starting weight
- *
- * Also known as the center weight. This represents the weight of the 
- * pixel-of-interest. Lower numbers may help handle heavy noise & ringing.
- *
- * EPSILON should be used instead of zero to avoid divide-by-zero errors.
- */
-#ifdef LUMA_raw
-#define SW 0.14876
-#else
-#define SW 0.14876
-#endif
-
-/* Weight discard
- *
- * Discard weights that fall below a fraction of the average weight. This culls 
- * the most dissimilar samples from the blur, yielding a much more pleasant 
- * result, especially around edges.
- * 
- * WD:
- * 	- 2: True average. Better quality, but slower and requires GLSL 4.0 or later
- * 	- 1: Moving cumulative average. Inaccurate, tends to blur directionally.
- * 	- 0: Disable
- *
- * WDT: Threshold coefficient, higher numbers discard more
- * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes
- */
-#ifdef LUMA_raw
-#define WD 2
-#define WDT 0.63888239592
-#define WDP 6.0
-#else
-#define WD 2
-#define WDT 0.63888239592
-#define WDP 6.0
-#endif
-
-/* Extremes preserve
- *
- * Reduces denoising around very bright/dark areas.
- *
- * The downscaling factor of the EP shader stage affects what is considered a 
- * bright/dark area. The default of 3 should be fine, it's not recommended to 
- * change this.
- *
- * This is incompatible with RGB. If you have RGB hooks enabled then you will 
- * have to delete the EP shader stage or specify EP=0 through shader_cfg.
- *
- * EP: 1 to enable, 0 to disable
- * DP: EP strength on dark patches, 0 to fully denoise
- * BP: EP strength on bright patches, 0 to fully denoise
- */
-#ifdef LUMA_raw
-#define EP 0
-#define BP 0.75
-#define DP 0.25
-#else
-#define EP 0
-#define BP 0.0
-#define DP 0.0
-#endif
-
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-
-/* Patch & research sizes
- *
- * Patch size should be an odd number greater than or equal to 3. Higher values 
- * are slower and not always better.
- *
- * Research size be an odd number greater than or equal to 3. Higher values are 
- * generally better, but slower, blurrier, and gives diminishing returns.
- */
-#ifdef LUMA_raw
-#define P 3
-#define R 5
-#else
-#define P 3
-#define R 5
-#endif
-
-/* Patch and research shapes
- *
- * Different shapes have different speed and quality characteristics. Every 
- * shape (besides square) is smaller than square.
- *
- * PS applies applies to patches, RS applies to research zones.
- *
- * Be wary of gather optimizations (see the Regarding Speed comment at the top)
- *
- * 0: square (symmetrical)
- * 1: horizontal line (asymmetric)
- * 2: vertical line (asymmetric)
- * 3: diamond (symmetrical)
- * 4: triangle (asymmetric, pointing upward)
- * 5: truncated triangle (asymmetric on two axis, last row halved)
- * 6: even sized square (asymmetric on two axis)
- * 7: plus (symmetrical)
- */
-#ifdef LUMA_raw
-#define RS 3
-#define PS 3
-#else
-#define RS 3
-#define PS 3
-#endif
-
-/* Robust filtering
- *
- * This setting is dependent on code generation from shader_cfg, so this 
- * setting can only be enabled via shader_cfg.
- *
- * Compares the pixel-of-interest against a guide, which could be a downscaled 
- * image or the output of another shader
- */
-#define RF_LUMA 1
-#define RF 0
-
-/* Rotational/reflectional invariance
- *
- * Number of rotations/reflections to try for each patch comparison. Can be 
- * slow, but improves feature preservation. More rotations/reflections gives 
- * diminishing returns. The most similar rotation/reflection will be used.
- *
- * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
- * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
- *
- * RI: Rotational invariance
- * RFI (0 to 2): Reflectional invariance
- */
-#ifdef LUMA_raw
-#define RI 3
-#define RFI 2
-#else
-#define RI 0
-#define RFI 0
-#endif
-
-/* Temporal denoising
- *
- * This setting is dependent on code generation from shader_cfg, so this 
- * setting can only be enabled via shader_cfg.
- *
- * Caveats:
- * 	- Slower:
- * 		- Each frame needs to be researched (more samples & more math)
- * 		- Gather optimizations only apply to the current frame
- * 	- Requires vo=gpu-next
- * 	- Luma-only (this is a bug)
- * 	- Buggy
- *
- * May cause motion blur and may struggle more with noise that persists across 
- * multiple frames (e.g., from compression or duplicate frames), but can work 
- * very well on high quality video.
- *
- * Motion estimation (ME) should improve quality without impacting speed.
- *
- * T: number of frames used
- * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg
- * MEF: estimate factor, compensates for ME being one frame behind
- * TRF: compare against the denoised frames
- */
-#ifdef LUMA_raw
-#define T 0
-#define ME 1
-#define MEF 2
-#define TRF 0
-#else
-#define T 0
-#define ME 0
-#define MEF 2
-#define TRF 0
-#endif
-
-/* Spatial kernel
- *
- * Increasing the spatial denoising factor (SS) reduces the weight of further 
- * pixels.
- *
- * Spatial distortion instructs the spatial kernel to view that axis as 
- * closer/further, for instance SD=(1,1,0.5) would make the temporal axis 
- * appear closer and increase blur between frames.
- *
- * The intra-patch variants are supposed to help with larger patch sizes.
- *
- * SST: enables spatial kernel if R>=PST, 0 fully disables
- * SS: spatial sigma
- * SD: spatial distortion (X, Y, time)
- * PSS: intra-patch spatial sigma
- * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables
- * PSD: intra-patch spatial distortion (X, Y)
- */
-#ifdef LUMA_raw
-#define SST 1
-#define SS 0.5547703803256947
-#define SD vec3(1,1,1)
-#define PST 0
-#define PSS 0.0
-#define PSD vec2(1,1)
-#else
-#define SST 1
-#define SS 0.5547703803256947
-#define SD vec3(1,1,1)
-#define PST 0
-#define PSS 0.0
-#define PSD vec2(1,1)
-#endif
-
-/* Kernels
- *
- * SK: spatial kernel
- * RK: range kernel (takes patch differences)
- * PSK: intra-patch spatial kernel
- *
- * List of available kernels:
- *
- * bicubic
- * cos
- * gaussian
- * lanczos
- * quadratic
- * sinc
- * sphinx
- */
-#ifdef LUMA_raw
-#define SK lanczos
-#define RK gaussian
-#define PSK gaussian
-#else
-#define SK lanczos
-#define RK gaussian
-#define PSK gaussian
-#endif
-
-// Scaling factor (should match WIDTH/HEIGHT)
-#ifdef LUMA_raw
-#define SF 1
-#else
-#define SF 1
-#endif
-
-/* Visualization
- *
- * 0: off
- * 1: absolute difference between input/output to the power of 0.25
- * 2: difference between input/output centered on 0.5
- * 3: avg_weight
- * 4: edge map (based on the relevant AS settings)
- */
-#ifdef LUMA_raw
-#define V 0
-#else
-#define V 0
-#endif
-
-// Blur factor (0.0 returns the input image, 1.0 returns the output image)
-#ifdef LUMA_raw
-#define BF 1.0
-#else
-#define BF 1.0
-#endif
-
-// Force disable textureGather
-#ifdef LUMA_raw
-#define NG 0
-#else
-#define NG 0
-#endif
-
-// Patch donut (probably useless)
-#ifdef LUMA_raw
-#define PD 0
-#else
-#define PD 0
-#endif
-
-// Duplicate 1st weight (for luma-guided-chroma)
-#ifdef LUMA_raw
-#define D1W 0
-#else
-#define D1W 0
-#endif
-
-// Skip patch comparison
-#ifdef LUMA_raw
-#define SKIP_PATCH 0
-#else
-#define SKIP_PATCH 0
-#endif
-
-// Shader code
-
-#define EPSILON 0.00000000001
-#define M_PI 3.14159265358979323846
-#define POW2(x) ((x)*(x))
-#define POW3(x) ((x)*(x)*(x))
-#define bicubic(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0))))
-#define gaussian(x) exp(-1 * POW2(x))
-#define lanczos(x) POW2(sinc(x))
-#define quadratic(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5))
-#define sinc(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI))
-#define sphinx(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI))
-
-// XXX could maybe be better optimized on LGC
-// XXX return original alpha component instead of 1.0
-#if defined(LUMA_raw)
-#define val float
-#define val_swizz(v) (v.x)
-#define unval(v) vec4(v.x, 0, 0, 1.0)
-#define val_packed val
-#define val_pack(v) (v)
-#define val_unpack(v) (v)
-#elif defined(CHROMA_raw)
-#define val vec2
-#define val_swizz(v) (v.xy)
-#define unval(v) vec4(v.x, v.y, 0, 1.0)
-#define val_packed uint
-#define val_pack(v) packUnorm2x16(v)
-#define val_unpack(v) unpackUnorm2x16(v)
-#else
-#define val vec3
-#define val_swizz(v) (v.xyz)
-#define unval(v) vec4(v.x, v.y, v.z, 1.0)
-#define val_packed val
-#define val_pack(v) (v)
-#define val_unpack(v) (v)
-#endif
-
-#if PS == 6
-const int hp = P/2;
-#else
-const float hp = int(P/2) - 0.5*(1-(P%2)); // sample between pixels for even patch sizes
-#endif
-
-#if RS == 6
-const int hr = R/2;
-#else
-const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes
-#endif
-
-// donut increment, increments without landing on (0,0,0)
-// much faster than a continue statement
-#define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0))))
-
-// patch/research shapes
-// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R)
-// dots (.) represent samples (pixels) and X represents the pixel-of-interest
-
-// Z    .....
-// Z    .....
-// Z    ..X..
-// Z    .....
-// Z    .....
-#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr)
-
-// (in this instance Z=4)
-// Z    ....
-// Z    ....
-// Z    ..X.
-// Z    ....
-#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr)
-
-// Z-4    .
-// Z-2   ...
-// Z    ..X..
-#define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr)
-
-// Z-4    .
-// Z-2   ...
-// hz+1 ..X
-#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr)
-#define S_TRIANGLE_A(hz,Z) int(hz*hz+Z)
-
-// Z-4    .
-// Z-2   ...
-// Z    ..X..
-// Z-2   ...
-// Z-4    .
-#define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr)
-#define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z)
-
-//
-// Z    ..X..
-//
-#define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++)
-
-// 90 degree rotation of S_HORIZONTAL
-#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
-
-// 1      .
-// 1      . 
-// Z    ..X..
-// 1      . 
-// 1      .
-#define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr)
-#define S_PLUS_A(hz,Z) (Z*2 - 1)
-
-// XXX implement S_PLUS w/ an X overlayed:
-// 3    . . .
-// 3     ...
-// Z    ..X..
-// 3     ...
-// 3    . . .
-
-// XXX implement an X shape:
-// 2    .   .
-// 2     . .
-// 1      X  
-// 2     . .
-// 2    .   .
-
-// 1x1 square
-#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
-
-#define T1 (T+1)
-#define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++)
-
-#ifdef LUMA_raw
-#define RF_ RF_LUMA
-#else
-#define RF_ RF
-#endif
-
-// Skip comparing the pixel-of-interest against itself, unless RF is enabled
-#if RF_
-#define RINCR(z,c) (z.c++)
-#else
-#define RINCR DINCR
-#endif
-
-#define R_AREA(a) (a * T1 + RF_-1)
-
-// research shapes
-// XXX would be nice to have the option of temporally-varying research sizes
-#if R == 0 || R == 1
-#define FOR_RESEARCH(r) S_1X1(r)
-const int r_area = R_AREA(1);
-#elif RS == 7
-#define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y))
-const int r_area = R_AREA(S_PLUS_A(hr,R));
-#elif RS == 6
-#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R*R);
-#elif RS == 5
-#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x))
-const int r_area = R_AREA(S_TRIANGLE_A(hr,hr));
-#elif RS == 4
-#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x))
-const int r_area = R_AREA(S_TRIANGLE_A(hr,R));
-#elif RS == 3
-#define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y))
-const int r_area = R_AREA(S_DIAMOND_A(hr,R));
-#elif RS == 2
-#define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R);
-#elif RS == 1
-#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x))
-const int r_area = R_AREA(R);
-#elif RS == 0
-#define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R*R);
-#endif
-
-#define RI1 (RI+1)
-#define RFI1 (RFI+1)
-
-#if RI
-#define FOR_ROTATION for (float ri = 0; ri < 360; ri+=360.0/RI1)
-#else
-#define FOR_ROTATION
-#endif
-
-#if RFI
-#define FOR_REFLECTION for (int rfi = 0; rfi < RFI1; rfi++)
-#else
-#define FOR_REFLECTION
-#endif
-
-#if PD
-#define PINCR DINCR
-#else
-#define PINCR(z,c) (z.c++)
-#endif
-
-#define P_AREA(a) (a - PD)
-
-// patch shapes
-#if P == 0 || P == 1
-#define FOR_PATCH(p) S_1X1(p)
-const int p_area = P_AREA(1);
-#elif PS == 7
-#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y))
-const int p_area = P_AREA(S_PLUS_A(hp,P));
-#elif PS == 6
-#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P*P);
-#elif PS == 5
-#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x))
-const int p_area = P_AREA(S_TRIANGLE_A(hp,hp));
-#elif PS == 4
-#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x))
-const int p_area = P_AREA(S_TRIANGLE_A(hp,P));
-#elif PS == 3
-#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y))
-const int p_area = P_AREA(S_DIAMOND_A(hp,P));
-#elif PS == 2
-#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P);
-#elif PS == 1
-#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x))
-const int p_area = P_AREA(P);
-#elif PS == 0
-#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P*P);
-#endif
-
-const float r_scale = 1.0/r_area;
-const float p_scale = 1.0/p_area;
-
-#define sample(tex, pos, size, pt, off) tex(pos + pt * (vec2(off) + 0.5 - fract(pos*size)))
-#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off)
-
-#if RF_ && defined(LUMA_raw)
-#define load2_(off) sample(RF_LUMA_tex, RF_LUMA_pos, RF_LUMA_size, RF_LUMA_pt, off)
-#define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr)))
-#define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0)
-#elif RF_ && D1W
-#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
-#define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr)))
-#define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0)
-#elif RF_
-#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
-#else
-#define load2_(off) load_(off)
-#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr)))
-#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0)
-#endif
-
-#if T
-val load(vec3 off)
-{
-	switch (min(int(off.z), frame)) {
-	case 0: return val_swizz(load_(off));
-
-	}
-}
-val load2(vec3 off)
-{
-	return off.z == 0 ? val_swizz(load2_(off)) : load(off);
-}
-#else
-#define load(off) val_swizz(load_(off))
-#define load2(off) val_swizz(load2_(off))
-#endif
-
-val poi = load(vec3(0)); // pixel-of-interest
-val poi2 = load2(vec3(0)); // guide pixel-of-interest
-
-#if RI // rotation
-vec2 rot(vec2 p, float d)
-{
-	return vec2(
-		p.x * cos(radians(d)) - p.y * sin(radians(d)),
-		p.y * sin(radians(d)) + p.x * cos(radians(d))
-	);
-}
-#else
-#define rot(p, d) (p)
-#endif
-
-#if RFI // reflection
-vec2 ref(vec2 p, int d)
-{
-	switch (d) {
-	case 0: return p;
-	case 1: return p * vec2(1, -1);
-	case 2: return p * vec2(-1, 1);
-	}
-}
-#else
-#define ref(p, d) (p)
-#endif
-
-#if SST && R >= SST
-float spatial_r(vec3 v)
-{
-	v.xy += 0.5 - fract(HOOKED_pos*HOOKED_size);
-	return SK(length(v*SD)*SS);
-}
-#else
-#define spatial_r(v) (1)
-#endif
-
-#if PST && P >= PST
-#define spatial_p(v) PSK(length(v*PSD)*PSS)
-#else
-#define spatial_p(v) (1)
-#endif
-
-val range(val pdiff_sq)
-{
-	const float h = S*0.013;
-	const float pdiff_scale = 1.0/(h*h);
-	pdiff_sq = sqrt(pdiff_sq * pdiff_scale);
-#if defined(LUMA_raw)
-	return RK(pdiff_sq);
-#elif defined(CHROMA_raw)
-	return vec2(RK(pdiff_sq.x), RK(pdiff_sq.y));
-#else
-	return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z));
-#endif
-	//return exp(-pdiff_sq * pdiff_scale);
-
-	// weight function from the NLM paper, it's not very good
-	//return exp(-max(pdiff_sq - 2*S*S, 0.0) * pdiff_scale);
-}
-
-val patch_comparison(vec3 r, vec3 r2)
-{
-	vec3 p;
-	val min_rot = val(p_area);
-
-	FOR_ROTATION FOR_REFLECTION {
-		val pdiff_sq = val(0);
-		FOR_PATCH(p) {
-			vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z);
-			val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF);
-			diff_sq *= diff_sq;
-			diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy);
-			pdiff_sq += diff_sq;
-		}
-		min_rot = min(min_rot, pdiff_sq);
-	}
-
-	return min_rot * p_scale;
-}
-
-#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
-#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
-
-#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER
-// 3x3 diamond/plus patch_comparison_gather
-// XXX extend to support arbitrary sizes (probably requires code generation)
-// XXX extend to support 3x3 square
-// XXX support PSS
-const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
-const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
-vec4 poi_patch = gather_offs(0, offsets);
-float patch_comparison_gather(vec3 r, vec3 r2)
-{
-	float min_rot = p_area - 1;
-	vec4 transformer = gather_offs(r, offsets_sf);
-	FOR_ROTATION {
-		FOR_REFLECTION {
-			float diff_sq = dot((poi_patch - transformer) * (poi_patch - transformer), vec4(1));
-			min_rot = min(diff_sq, min_rot);
-#if RFI
-			switch(rfi) {
-			case 0: transformer = transformer.zyxw; break;
-			case 1: transformer = transformer.zwxy; break; // undoes last mirror, performs another mirror
-			case 2: transformer = transformer.zyxw; break; // undoes last mirror
-			}
-#endif
-		}
-#if RI == 3
-		transformer = transformer.wxyz;
-#elif RI == 1
-		transformer = transformer.zwxy;
-#endif
-	}
-	float center_diff_sq = poi2.x - load2(r).x;
-	center_diff_sq *= center_diff_sq;
-	return (min_rot + center_diff_sq) * p_scale;
-}
-#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER
-// tiled even square patch_comparison_gather
-// XXX extend to support odd square?
-float patch_comparison_gather(vec3 r, vec3 r2)
-{
-	vec2 tile;
-	float min_rot = p_area;
-
-	/* gather order:
-	 * w z
-	 * x y
-	 */
-	float pdiff_sq = 0;
-	for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) {
-		vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy);
-		diff_sq *= diff_sq;
-		diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)),
-			                                 spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0)));
-		pdiff_sq += dot(diff_sq, vec4(1));
-	}
-	min_rot = min(min_rot, pdiff_sq);
-
-	return min_rot * p_scale;
-}
-#else
-#define patch_comparison_gather patch_comparison
-#endif
-
-vec4 hook()
-{
-	val total_weight = val(0);
-	val sum = val(0);
-	val result = val(0);
-
-	vec3 r = vec3(0);
-	vec3 p = vec3(0);
-	vec3 me = vec3(0);
-
-#if T && ME == 1 // temporal & motion estimation
-	vec3 me_tmp = vec3(0);
-	float maxweight = 0;
-#elif T && ME == 2 // temporal & motion estimation
-	vec3 me_sum = vec3(0);
-	float me_weight = 0;
-#endif
-
-#if WD == 2 // weight discard
-	int r_index = 0;
-	val_packed all_weights[r_area];
-	val_packed all_pixels[r_area];
-#elif WD == 1 // weight discard
-	val no_weights = val(0);
-	val discard_total_weight = val(0);
-	val discard_sum = val(0);
-#endif
-
-	FOR_FRAME(r) {
-	// XXX ME is always a frame behind, should have to option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
-#if T && ME == 1 // temporal & motion estimation max weight
-	if (r.z > 0) {
-		me += me_tmp * MEF;
-		me_tmp = vec3(0);
-		maxweight = 0;
-	}
-#elif T && ME == 2 // temporal & motion estimation weighted average
-	if (r.z > 0) {
-		me += round(me_sum / me_weight * MEF);
-		me_sum = vec3(0);
-		me_weight = 0;
-	}
-#endif
-	FOR_RESEARCH(r) { // main NLM logic
-#if SKIP_PATCH
-		val weight = val(1);
-#else
-		val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(r+me, vec3(0))) : patch_comparison(r+me, vec3(0));
-		val weight = range(pdiff_sq);
-#endif
-
-#if T && ME == 1 // temporal & motion estimation max weight
-		me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x));
-		maxweight = max(maxweight, weight.x);
-#elif T && ME == 2 // temporal & motion estimation weighted average
-		me_sum += vec3(r.xy,0) * weight.x;
-		me_weight += weight.x;
-#endif
-
-#if D1W
-		weight = val(weight.x);
-#endif
-
-		weight *= spatial_r(r);
-
-#if WD == 2 // weight discard
-		all_weights[r_index] = val_pack(weight);
-		all_pixels[r_index] = val_pack(load(r+me));
-		r_index++;
-#elif WD == 1 // weight discard
-		val wd_scale = 1.0/max(no_weights, 1);
-		val keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
-		discard_sum += load(r+me) * weight * (1 - keeps);
-		discard_total_weight += weight * (1 - keeps);
-		no_weights += keeps;
-#endif
-
-		sum += load(r+me) * weight;
-		total_weight += weight;
-	} // FOR_RESEARCH
-	} // FOR_FRAME
-
-	val avg_weight = total_weight * r_scale;
-	val old_avg_weight = avg_weight;
-
-#if WD == 2 // true average
-	total_weight = val(0);
-	sum = val(0);
-	val no_weights = val(0);
-
-	for (int i = 0; i < r_area; i++) {
-		val w = val_unpack(all_weights[i]);
-		val px = val_unpack(all_pixels[i]);
-		val keeps = step(avg_weight*WDT, w);
-
-		w *= keeps;
-		sum += px * w;
-		total_weight += w;
-		no_weights += keeps;
-	}
-#elif WD == 1 // moving cumulative average
-	total_weight -= discard_total_weight;
-	sum -= discard_sum;
-#endif
-#if WD // weight discard
-	avg_weight = total_weight / no_weights;
-#endif
-
-	total_weight += SW * spatial_r(vec3(0));
-	sum += poi * SW * spatial_r(vec3(0));
-
-#if V == 3 // weight map
-	result = val(avg_weight);
-#else // mean
-	result = val(sum / total_weight);
-#endif
-
-	// store frames for temporal
-#if T > 1
-
-#endif
-#if T && TRF
-	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result));
-#elif T
-	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2));
-#endif
-
-#if ASW == 0 // pre-WD weights
-#define AS_weight old_avg_weight
-#elif ASW == 1 // post-WD weights
-#define AS_weight avg_weight
-#endif
-
-#if ASK == 0
-	val sharpening_strength = pow(AS_weight, val(ASP));
-#elif ASK == 1
-	val sharpening_strength = mix(
-			pow(smoothstep(0.0, 1.0, AS_weight), val(ASP)),
-			AS_weight, ASC);
-	// XXX normalize the result to account for a negative ASC?
-#elif ASK == 2
-	val sharpening_strength = val(ASP);
-#endif
-
-#if AS == 1 // sharpen+denoise
-	val sharpened = result + (poi - result) * ASF;
-#elif AS == 2 // sharpen only
-	val sharpened = poi + (poi - result) * ASF;
-#endif
-
-#if EP // extremes preserve
-	float luminance = EP_texOff(0).x;
-	// EPSILON is needed since pow(0,0) is undefined
-	float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP));
-	result = mix(poi, result, ep_weight);
-#endif
-
-#if AS == 1 // sharpen+denoise
-	result = mix(sharpened, result, sharpening_strength);
-#elif AS == 2 // sharpen only
-	result = mix(sharpened, poi, sharpening_strength);
-#endif
-
-#if V == 4 // edge map
-	result = sharpening_strength;
-#endif
-
-#if (V == 3 || V == 4) && defined(CHROMA_raw) // drop chroma for these visualizations
-	return vec4(0.5);
-#endif
-
-#if V == 1
-	result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0);
-#elif V == 2
-	result = (poi - result) * 0.5 + 0.5;
-#endif
-
-	return unval(mix(poi, result, BF));
-}
-
diff --git a/portable_config/shaders/nlmeans_hqx.glsl b/portable_config/shaders/nlmeans_hqx.glsl
index d9b0a96e..a90f7876 100644
--- a/portable_config/shaders/nlmeans_hqx.glsl
+++ b/portable_config/shaders/nlmeans_hqx.glsl
@@ -19,301 +19,2249 @@
  * along with this program. If not, see <https://www.gnu.org/licenses/>.
  */
 
-// Description: nlmeans_hqx.glsl: Very slow, should offer the best quality.
+// Description: nlmeans.glsl: Very slow, should offer the best quality.
 
-/* The recommended usage of this shader and its variant profiles is to add them 
- * to input.conf and then dispatch the appropriate shader via a keybind during 
- * media playback. Here is an example input.conf entry:
+/* This shader is highly configurable via user variables below. Although the 
+ * default settings should offer good quality at a reasonable speed, you are 
+ * encouraged to tweak them to your preferences.
+ */
+
+// The following is shader code injected from ../nlmeans.glsl
+/* vi: ft=c
+ *
+ * Based on vf_nlmeans.c from FFmpeg.
+ *
+ * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
+ * Copyright (c) 2016 Clément Bœsch <u pkh me>
+ *
+ * This program is free software: you can redistribute it and/or modify it 
+ * under the terms of the GNU Lesser General Public License as published by 
+ * the Free Software Foundation, either version 2.1 of the License, or (at 
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT 
+ * ANY WARRANTY;  without even the implied warranty of MERCHANTABILITY or 
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License 
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+// Description: nlmeans.glsl: Default profile, general purpose, tuned for low noise
+
+/* This shader is highly configurable via user variables below. Although the 
+ * default settings should offer good quality at a reasonable speed, you are 
+ * encouraged to tweak them to your preferences.
+ */
+
+// The following is shader code injected from ../LQ/nlmeans.glsl
+/* vi: ft=c
  *
- * F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl"; show-text "Non-local means (LUMA only)"
+ * Based on vf_nlmeans.c from FFmpeg.
  *
- * These shaders can also be enabled by default in mpv.conf, for example:
+ * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
+ * Copyright (c) 2016 Clément Bœsch <u pkh me>
  *
- * glsl-shaders='~~/shaders/nlmeans.glsl'
+ * This program is free software: you can redistribute it and/or modify it 
+ * under the terms of the GNU Lesser General Public License as published by 
+ * the Free Software Foundation, either version 2.1 of the License, or (at 
+ * your option) any later version.
  *
- * Both of the examples above assume the shaders are located in a subdirectory 
- * named "shaders" within mpv's config directory. Refer to the mpv 
- * documentation for more details.
+ * This program is distributed in the hope that it will be useful, but WITHOUT 
+ * ANY WARRANTY;   without even the implied warranty of MERCHANTABILITY or 
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
+ * for more details.
  *
- * This shader is highly configurable via user variables below. Although the 
+ * You should have received a copy of the GNU Lesser General Public License 
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+// Description: nlmeans.glsl: Faster, but lower quality.
+
+/* This shader is highly configurable via user variables below. Although the 
  * default settings should offer good quality at a reasonable speed, you are 
- * encouraged to tweak them to your preferences. Be mindful that certain 
- * settings may greatly affect speed.
+ * encouraged to tweak them to your preferences.
+ */
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!BIND HOOKED
+//!DESC Non-local means (nlmeans.glsl)
+//!SAVE _INJ_RF_LUMA
+
+// User variables
+
+// It is generally preferable to denoise luma and chroma differently, so the 
+// user variables for luma and chroma are split.
+
+// Denoising factor (level of blur, higher means more blur)
+#ifdef LUMA_raw
+#define S 3.5968056672833097
+#else
+#define S 5.191526541606411
+#endif
+
+/* Adaptive sharpening
+ *
+ * Performs an unsharp mask by subtracting the spatial kernel's blur from the 
+ * NLM blur. For sharpen+denoise the sharpening is limited to edge areas and 
+ * denoising is done everywhere else.
+ *
+ * Use V=4 to visualize which areas are sharpened (black means sharpen).
  *
- * Denoising is most useful for noisy content. If there is no perceptible 
- * noise, you probably won't see a positive difference.
+ * AS:
+ * 	  - 0: disable
+ * 	  - 1: sharpen+denoise
+ * 	  - 2: sharpen only
+ * ASF: Higher numbers make a sharper image
+ * ASA: Anti-ringing, higher numbers increase strength
+ * ASP: Power, lower numbers increase sharpening on lower frequency detail
+ */
+#ifdef LUMA_raw
+#define AS 0
+#define ASF 0.1625
+#define ASA 5.0
+#define ASP 0.5
+#else
+#define AS 0
+#define ASF 0.1625
+#define ASA 5.0
+#define ASP 0.5
+#endif
+
+/* Starting weight
  *
- * The default settings are generally tuned for low noise and high detail 
- * preservation. The "medium" and "heavy" profiles are tuned for higher levels 
- * of noise.
+ * Also known as the center weight. This represents the weight of the 
+ * pixel-of-interest. Lower numbers may help handle heavy noise & ringing.
  *
- * The denoiser will not work properly if the content has been upscaled 
- * beforehand (whether it was done by you or not). In such cases, consider 
- * issuing a command to downscale in the mpv console (backtick ` key):
+ * EPSILON should be used instead of zero to avoid divide-by-zero errors.
+ */
+#ifdef LUMA_raw
+#define SW 0.7392620481427672
+#else
+#define SW 0.6448288408806067
+#endif
+
+/* Weight discard
  *
- * vf toggle scale=-2:720
+ * Reduces weights that fall below a fraction of the average weight. This culls 
+ * the most dissimilar samples from the blur, which can yield a better result, 
+ * especially around edges.
+ * 
+ * WD:
+ * 	  - 2: Mean. Better quality, but slower and requires GLSL 4.0 or later
+ * 	  - 1: Moving cumulative average. Fast but inaccurate, blurs directionally.
+ * 	  - 0: Disable
  *
- * ...replacing 720 with whatever resolution seems appropriate. Rerun the 
- * command to undo the downscale. It may take some trial-and-error to find the 
- * proper resolution.
+ * WDT: Threshold coefficient, higher numbers discard more
+ * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes
+ * WDS (not for WDK=is_zero): Higher numbers are more eager to reduce weights
  */
+#ifdef LUMA_raw
+#define WD 1
+#define WDT 0.580415381682815
+#define WDP 5.381278367349288
+#define WDS 1.0
+#else
+#define WD 1
+#define WDT 0.913447511792627
+#define WDP 5.832936323930807
+#define WDS 1.0
+#endif
 
-/* Regarding speed
+/* Extremes preserve
  *
- * Speed may vary wildly for different vo and gpu-api settings. Generally 
- * vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this 
- * may be different for your system.
+ * Reduce denoising in very bright/dark areas.
  *
- * If your GPU doesn't support textureGather, or if you are on a version of mpv 
- * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ profile
+ * Disabled by default now. If you want to reenable this, set EP=3/ in 
+ * Makefile.nlm and rebuild.
  *
- * If you plan on tinkering with NLM's settings, read below:
+ * The downscaling factor of the EP shader stage affects what is considered a 
+ * bright/dark area.
  *
- * textureGather only applies to luma and limited to the these configurations:
+ * This is incompatible with RGB. If you have RGB hooks enabled then you will 
+ * have to delete the EP shader stage or specify EP=0 through shader_cfg.
  *
- * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}
- *   - Default, very fast, rotations and reflections should be free
- *   - If this is unusually slow then try changing gpu-api and vo
- *   - If it's still slow, try setting RI/RFI to 0.
+ * EP: 1 to enable, 0 to disable
+ * DP: EP strength on dark areas, 0 to fully denoise
+ * BP: EP strength on bright areas, 0 to fully denoise
+ */
+#ifdef LUMA_raw
+#define EP 0
+#define BP 0.75
+#define DP 0.25
+#else
+#define EP 0
+#define BP 0.0
+#define DP 0.0
+#endif
+
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+
+/* textureGather applicable configurations:
  *
- * - PS=6:RI={0,1,3}:RFI={0,1,2}
+ * - PS={0,3,7,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2}
+ * - PS={0,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2}
+ * - PS=6:RI=0:RFI=0
  *   - Currently the only scalable variant
- *   - Patch shape is asymmetric on two axis
- *   - Rotations should have very little speed impact
- *   - Reflections may have a significant speed impact
  *
  * Options which always disable textureGather:
- * 	- PD
- * 	- NG
+ * 	  - NG
+ * 	  - SAMPLE
+ * 	  - PD
+ *
+ * Running without textureGather may be much slower.
  */
 
-// The following is shader code injected from guided.glsl
-/* vi: ft=c
+/* Patch & research sizes
  *
- * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
+ * P should be an odd number. Higher values are slower and not always better.
  *
- * This program is free software: you can redistribute it and/or modify it 
- * under the terms of the GNU Lesser General Public License as published by 
- * the Free Software Foundation, either version 2.1 of the License, or (at 
- * your option) any later version.
+ * R should be an odd number greater than or equal to 3. Higher values are 
+ * generally better, but slower, blurrier, and give diminishing returns.
+ */
+#ifdef LUMA_raw
+#define P 3
+#define R 5
+#else
+#define P 3
+#define R 5
+#endif
+
+/* Patch and research shapes
  *
- * This program is distributed in the hope that it will be useful, but WITHOUT 
- * ANY WARRANTY;  without even the implied warranty of MERCHANTABILITY or 
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
- * for more details.
+ * Different shapes have different speed and quality characteristics. Every 
+ * shape (besides square) is smaller than square.
  *
- * You should have received a copy of the GNU Lesser General Public License 
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ * PS applies to patches, RS applies to research zones.
+ *
+ * 0: square (symmetrical)
+ * 1: horizontal line (asymmetric)
+ * 2: vertical line (asymmetric)
+ * 3: diamond (symmetrical)
+ * 4: triangle (asymmetric, pointing upward)
+ * 5: truncated triangle (asymmetric on two axes, last row halved)
+ * 6: even sized square (asymmetric on two axes)
+ * 7: plus (symmetrical)
+ * 8: plus X (symmetrical)
  */
+#ifdef LUMA_raw
+#define RS 3
+#define PS 4
+#else
+#define RS 3
+#define PS 3
+#endif
 
-// Description: guided.glsl: Guided by the downscaled image
+/* Robust filtering
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
+ *
+ * Computes weights on a guide, which could be a downscaled image or the output 
+ * of another shader, and applies the weights to the original image
+ */
+#define RF_LUMA 0
+#define RF 0
 
-/* The radius can be adjusted with the MEANI stage's downscaling factor. 
- * Higher numbers give a bigger radius.
+/* Rotational/reflectional invariance
+ *
+ * Number of rotations/reflections to try for each patch comparison. Can be 
+ * slow, but improves feature preservation. More rotations/reflections give 
+ * diminishing returns. The most similar rotation/reflection will be used.
  *
- * The E variable can be found in the A stage.
+ * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
+ * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
  *
- * The subsampling (fast guided filter) can be adjusted with the I stage's 
- * downscaling factor. Higher numbers are faster.
+ * Consider setting SAMPLE=1 if setting RI to a setting that would require 
+ * sampling between pixels.
  *
- * The guide's subsampling can be adjusted with the PREI stage's downscaling 
- * factor. Higher numbers downscale more.
+ * RI: Rotational invariance
+ * RFI (0 to 2): Reflectional invariance
  */
+#ifdef LUMA_raw
+#define RI 0
+#define RFI 0
+#else
+#define RI 0
+#define RFI 0
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!BIND HOOKED
-//!WIDTH HOOKED.w 1.25 /
-//!HEIGHT HOOKED.h 1.25 /
-//!DESC Guided filter (PREI)
-//!SAVE _INJ_PREI
+/* Temporal denoising
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
+ *
+ * Caveats:
+ * 	  - Slower:
+ * 	  	  - Each frame needs to be researched (more samples & more math)
+ * 	  	  - Gather optimizations only apply to the current frame
+ * 	  - Requires vo=gpu-next
+ * 	  - Luma-only (this is a bug)
+ * 	  - Buggy
+ *
+ * May cause motion blur and may struggle more with noise that persists across 
+ * multiple frames (e.g., from compression or duplicate frames), but can work 
+ * very well on high quality video.
+ *
+ * Motion estimation (ME) should improve quality without impacting speed.
+ *
+ * T: number of frames used
+ * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg
+ * MEF: estimate factor, compensates for ME being one frame behind
+ * TRF: compare against the denoised frames
+ */
+#ifdef LUMA_raw
+#define T 0
+#define ME 1
+#define MEF 2
+#define TRF 0
+#else
+#define T 0
+#define ME 0
+#define MEF 2
+#define TRF 0
+#endif
+
+/* Spatial kernel
+ *
+ * Increasing the spatial denoising factor (SS) reduces the weight of further 
+ * pixels.
+ *
+ * Spatial distortion instructs the spatial kernel to view that axis as 
+ * closer/further, for instance SD=(1,1,0.5) would make the temporal axis 
+ * appear closer and increase blur between frames.
+ *
+ * The intra-patch variants are supposed to help with larger patch sizes.
+ *
+ * SST: enables spatial kernel if R>=SST, 0 fully disables
+ * SS: spatial sigma
+ * SD: spatial distortion (X, Y, time)
+ * PSS: intra-patch spatial sigma
+ * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables
+ * PSD: intra-patch spatial distortion (X, Y)
+ */
+#ifdef LUMA_raw
+#define SST 1
+#define SS 0.49764743714339127
+#define SD vec3(1,1,1)
+#define PST 0
+#define PSS 0.0
+#define PSD vec2(1,1)
+#else
+#define SST 1
+#define SS 0.32091162692066677
+#define SD vec3(1,1,1)
+#define PST 0
+#define PSS 0.0
+#define PSD vec2(1,1)
+#endif
+
+/* Kernels
+ *
+ * SK: spatial kernel
+ * RK: range kernel (takes patch differences)
+ * PSK: intra-patch spatial kernel
+ * WDK: weight discard kernel
+ * WD1TK (WD=1 only): weight discard tolerance kernel
+ *
+ * List of available kernels:
+ *
+ * bicubic
+ * cos
+ * gaussian
+ * lanczos
+ * quadratic_ (unclamped)
+ * sinc
+ * sinc_ (unclamped)
+ * sinc3
+ * sphinx
+ * sphinx_ (unclamped)
+ * triangle_ (unclamped)
+ * triangle
+ */
+#ifdef LUMA_raw
+#define SK gaussian
+#define RK gaussian
+#define PSK gaussian
+#define WDK is_zero
+#define WD1TK gaussian
+#else
+#define SK gaussian
+#define RK gaussian
+#define PSK gaussian
+#define WDK is_zero
+#define WD1TK gaussian
+#endif
+
+/* Sampling method
+ *
+ * In most cases this shouldn't make any difference, only set to bilinear if 
+ * it's necessary to sample between pixels (e.g., RI=2).
+ *
+ * 0: nearest neighbor
+ * 1: bilinear
+ */
+#ifdef LUMA_raw
+#define SAMPLE 0
+#else
+#define SAMPLE 0
+#endif
+
+/* Research scaling factor
+ *
+ * Higher numbers sample more sparsely as the distance from the POI grows.
+ */
+#ifdef LUMA_raw
+#define RSF 0.0
+#else
+#define RSF 0.0
+#endif
+
+// Scaling factor (should match WIDTH/HEIGHT)
+#ifdef LUMA_raw
+#define SF 1
+#else
+#define SF 1
+#endif
+
+// Use the guide image as the input image
+#ifdef LUMA_raw
+#define GUIDE_INPUT 0
+#else
+#define GUIDE_INPUT 0
+#endif
+
+/* Visualization
+ *
+ * 0: off
+ * 1: absolute difference between input/output to the power of 0.25
+ * 2: difference between input/output centered on 0.5
+ * 3: post-WD weight map
+ * 4: pre-WD weight map
+ * 5: unsharp mask
+ * 6: EP
+ */
+#ifdef LUMA_raw
+#define V 0
+#else
+#define V 0
+#endif
+
+// Blur factor (0.0 returns the input image, 1.0 returns the output image)
+#ifdef LUMA_raw
+#define BF 1.0
+#else
+#define BF 1.0
+#endif
+
+// Force disable textureGather
+#ifdef LUMA_raw
+#define NG 0
+#else
+#define NG 0
+#endif
+
+// Patch donut (probably useless)
+#ifdef LUMA_raw
+#define PD 0
+#else
+#define PD 0
+#endif
+
+// Duplicate 1st weight (for luma-guided-chroma)
+#ifdef LUMA_raw
+#define D1W 0
+#else
+#define D1W 0
+#endif
+
+// Skip patch comparison
+#ifdef LUMA_raw
+#define SKIP_PATCH 0
+#else
+#define SKIP_PATCH 0
+#endif
+
+// Shader code
+
+#define EPSILON 1.2e-38
+#define M_PI 3.14159265358979323846
+#define POW2(x) ((x)*(x))
+#define POW3(x) ((x)*(x)*(x))
+#define bicubic_(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0))))
+#define bicubic(x) bicubic_(clamp((x), 0.0, 2.0))
+#define gaussian(x) exp(-1 * POW2(x))
+#define quadratic_(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5))
+#define quadratic(x) quadratic_(clamp((x), 0.0, 1.5))
+#define sinc_(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI))
+#define sinc(x) sinc_(clamp((x), 0.0, 1.0))
+#define sinc3(x) sinc_(clamp((x), 0.0, 3.0))
+#define lanczos(x) (sinc3(x) * sinc(x))
+#define sphinx_(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI))
+#define sphinx(x) sphinx_(clamp((x), 0.0, 1.4302966531242027))
+#define triangle_(x) (1 - (x))
+#define triangle(x) triangle_(clamp((x), 0.0, 1.0))
+#define is_zero(x) int(x == 0)
+
+// XXX could maybe be better optimized on LGC
+#if defined(LUMA_raw)
+#define val float
+#define val_swizz(v) (v.x)
+#define unval(v) vec4(v.x, 0, 0, poi_.a)
+#define val_packed val
+#define val_pack(v) (v)
+#define val_unpack(v) (v)
+#elif defined(CHROMA_raw)
+#define val vec2
+#define val_swizz(v) (v.xy)
+#define unval(v) vec4(v.x, v.y, 0, poi_.a)
+#define val_packed uint
+#define val_pack(v) packUnorm2x16(v)
+#define val_unpack(v) unpackUnorm2x16(v)
+#else
+#define val vec3
+#define val_swizz(v) (v.xyz)
+#define unval(v) vec4(v.x, v.y, v.z, poi_.a)
+#define val_packed val
+#define val_pack(v) (v)
+#define val_unpack(v) (v)
+#endif
+
+#if PS == 6
+const int hp = P/2;  
+#else
+const float hp = int(P/2) - 0.5*(1-(P%2));   // sample between pixels for even patch sizes
+#endif
+
+#if RS == 6
+const int hr = R/2;  
+#else
+const float hr = int(R/2) - 0.5*(1-(R%2));   // sample between pixels for even research sizes
+#endif
+
+// patch/research shapes
+// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R)
+// dots (.) represent samples (pixels) and X represents the pixel-of-interest
+
+// Z    .....
+// Z    .....
+// Z    ..X..
+// Z    .....
+// Z    .....
+#define S_SQUARE(z,hz,incr) for (z.x = -hz;   z.x <= hz;   z.x++) for (z.y = -hz;   z.y <= hz;   incr)
+
+// (in this instance Z=4)
+// Z    ....
+// Z    ....
+// Z    ..X.
+// Z    ....
+#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz;   z.x < hz;   z.x++) for (z.y = -hz;   z.y < hz;   incr)
+
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
+#define S_TRIANGLE(z,hz,incr) for (z.y = -hz;   z.y <= 0;   z.y++) for (z.x = -abs(abs(z.y) - hz);   z.x <= abs(abs(z.y) - hz);   incr)
+
+// Z-4    .
+// Z-2   ...
+// hz+1 ..X
+#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz;   z.y <= 0;   z.y++) for (z.x = -abs(abs(z.y) - hz);   z.x <= abs(abs(z.y) - hz)*int(z.y!=0);   incr)
+#define S_TRIANGLE_A(hz,Z) int(hz*hz+Z)
+
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
+// Z-2   ...
+// Z-4    .
+#define S_DIAMOND(z,hz,incr) for (z.x = -hz;   z.x <= hz;   z.x++) for (z.y = -abs(abs(z.x) - hz);   z.y <= abs(abs(z.x) - hz);   incr)
+#define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z)
+
+//
+// Z    ..X..
+//
+#define S_HORIZONTAL(z,hz,incr) for (z.y = 0;   z.y <= 0;   z.y++) for (z.x = -hz;   z.x <= hz;   incr)
+
+// 90 degree rotation of S_HORIZONTAL
+#define S_VERTICAL(z,hz,incr) for (z.x = 0;   z.x <= 0;   z.x++) for (z.y = -hz;   z.y <= hz;   incr)
+
+// 1      .
+// 1      . 
+// Z    ..X..
+// 1      . 
+// 1      .
+#define S_PLUS(z,hz,incr) for (z.x = -hz;   z.x <= hz;   z.x++) for (z.y = -hz * int(z.x == 0);   z.y <= hz * int(z.x == 0);   incr)
+#define S_PLUS_A(hz,Z) (Z*2 - 1)
+
+// 3    . . .
+// 3     ...
+// Z    ..X..
+// 3     ...
+// 3    . . .
+#define S_PLUS_X(z,hz,incr) for (z.x = -hz;   z.x <= hz;   z.x++) for (z.y = -abs(z.x) + -hz * int(z.x == 0);   z.y <= abs(z.x) + hz * int(z.x == 0);   incr)
+#define S_PLUS_X_A(hz,Z) (Z*4 - 3)
+
+// 1x1 square
+#define S_1X1(z) for (z = vec3(0);   z.x <= 0;   z.x++)
+
+#define T1 (T+1)
+#define FOR_FRAME(r) for (r.z = 0;   r.z < T1;   r.z++)
+
+#ifdef LUMA_raw
+#define RF_ RF_LUMA
+#else
+#define RF_ RF
+#endif
+
+// donut increment, increments without landing on (0,0,0)
+// much faster than a continue statement
+#define DINCR(z,c,a) ((z.c += a),(z.c += int(z == vec3(0))))
+
+#define R_AREA(a) (a * T1 - 1)
+
+// research shapes: FOR_RESEARCH(r) walks the research zone, r_area = samples over T1 frames minus the skipped POI
+// XXX would be nice to have the option of temporally-varying research sizes
+#if R == 0 || R == 1
+#define FOR_RESEARCH(r) S_1X1(r)
+const int r_area = R_AREA(1);
+#elif RS == 8
+#define FOR_RESEARCH(r) S_PLUS_X(r,hr,DINCR(r,y,max(1,abs(r.x))))
+const int r_area = R_AREA(S_PLUS_X_A(hr,R));
+#elif RS == 7
+#define FOR_RESEARCH(r) S_PLUS(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(S_PLUS_A(hr,R));
+#elif RS == 6
+#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(R*R);
+#elif RS == 5
+#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,DINCR(r,x,1))
+const int r_area = R_AREA(S_TRIANGLE_A(hr,hr+1)); // hr*hr samples above the last row plus hr+1 in the halved last row
+#elif RS == 4
+#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,DINCR(r,x,1))
+const int r_area = R_AREA(S_TRIANGLE_A(hr,R));
+#elif RS == 3
+#define FOR_RESEARCH(r) S_DIAMOND(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(S_DIAMOND_A(hr,R));
+#elif RS == 2
+#define FOR_RESEARCH(r) S_VERTICAL(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(R);
+#elif RS == 1
+#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,DINCR(r,x,1))
+const int r_area = R_AREA(R);
+#elif RS == 0
+#define FOR_RESEARCH(r) S_SQUARE(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(R*R);
+#endif
+
+#define RI1 (RI+1)
+#define RFI1 (RFI+1)
+
+#if RI
+#define FOR_ROTATION for (float ri = 0;   ri < 360;   ri+=360.0/RI1)
+#else
+#define FOR_ROTATION
+#endif
+
+#if RFI
+#define FOR_REFLECTION for (int rfi = 0;   rfi < RFI1;   rfi++)
+#else
+#define FOR_REFLECTION
+#endif
+
+#if PD
+#define PINCR DINCR
+#else
+#define PINCR(z,c,a) (z.c += a)
+#endif
+
+#define P_AREA(a) (a - PD)
+
+// patch shapes
+#if P == 0 || P == 1
+#define FOR_PATCH(p) S_1X1(p)
+const int p_area = P_AREA(1);  
+#elif PS == 8
+#define FOR_PATCH(p) S_PLUS_X(p,hp,PINCR(p,y,max(1,abs(p.x))))
+const int p_area = P_AREA(S_PLUS_X_A(hp,P));  
+#elif PS == 7
+#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(S_PLUS_A(hp,P));  
+#elif PS == 6
+#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(P*P);  
+#elif PS == 5
+#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x,1))
+const int p_area = P_AREA(S_TRIANGLE_A(hp,hp));  
+#elif PS == 4
+#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x,1))
+const int p_area = P_AREA(S_TRIANGLE_A(hp,P));  
+#elif PS == 3
+#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(S_DIAMOND_A(hp,P));  
+#elif PS == 2
+#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(P);  
+#elif PS == 1
+#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x,1))
+const int p_area = P_AREA(P);  
+#elif PS == 0
+#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(P*P);  
+#endif
+
+const float r_scale = 1.0/r_area;  
+const float p_scale = 1.0/p_area;  
+
+#if SAMPLE == 0
+#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * (vec2(off) + 0.5 - fract((pos) * (size))))
+#else
+#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * vec2(off))
+#endif
+
+#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off)
+
+#if RF_ && defined(LUMA_raw)
+#define load2_(off) sample(_INJ_RF_LUMA_tex, _INJ_RF_LUMA_pos, _INJ_RF_LUMA_size, _INJ_RF_LUMA_pt, off)
+#define gather_offs(off, off_arr) (_INJ_RF_LUMA_mul * vec4(textureGatherOffsets(_INJ_RF_LUMA_raw, _INJ_RF_LUMA_pos + vec2(off) * _INJ_RF_LUMA_pt, off_arr)))
+#define gather(off) _INJ_RF_LUMA_gather(_INJ_RF_LUMA_pos + (off) * _INJ_RF_LUMA_pt, 0)
+#elif RF_ && D1W
+#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
+#define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr)))
+#define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0)
+#elif RF_
+#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
+#else
+#define load2_(off) load_(off)
+#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr)))
+#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0)
+#endif
+
+#if T
+val load(vec3 off) // fetch the input sample at offset off; int(off.z), limited to at most `frame`, selects the case
+{
+	  switch (min(int(off.z), frame)) {
+	  case 0: return val_swizz(load_(off));  
+	  // NOTE(review): only case 0 exists in this build; presumably cases for earlier frames are generated when T > 0 - confirm
+	  }
+}
+val load2(vec3 off) // guide sample: load2_() when off.z == 0, otherwise falls back to load()
+{
+	  return off.z == 0 ? val_swizz(load2_(off)) : load(off);  
+}
+#else
+#define load(off) val_swizz(load_(off))
+#define load2(off) val_swizz(load2_(off))
+#endif
+
+val poi2 = load2(vec3(0));   // guide pixel-of-interest
+#if GUIDE_INPUT
+#define poi poi2
+#else
+vec4 poi_ = load_(vec3(0));  
+val poi = val_swizz(poi_);   // pixel-of-interest
+#endif
+
+#if RI // rotation
+vec2 rot(vec2 p, float d) // rotate p about the origin by d degrees (standard 2D rotation matrix)
+{
+	  return vec2(
+	  	  p.x * cos(radians(d)) - p.y * sin(radians(d)), // x' = x*cos - y*sin
+	  	  p.x * sin(radians(d)) + p.y * cos(radians(d))  // y' = x*sin + y*cos (x and y were swapped here before)
+	  );
+}
+#else
+#define rot(p, d) (p)
+#endif
+
+#if RFI // reflection
+vec2 ref(vec2 p, int d)
+{
+	  // Reflect p: d == 1 negates Y, d == 2 negates X, any other d returns p unchanged.
+	  // Written without a switch so that every control path ends in a return.
+	  if (d == 1) return p * vec2(1, -1);
+	  if (d == 2) return p * vec2(-1, 1);
+	  return p;
+}
+#else
+#define ref(p, d) (p)
+#endif
+
+#if SST && R >= SST
+float spatial_r(vec3 v) // spatial weight of research offset v, evaluated with kernel SK
+{
+	  v.xy += 0.5 - fract(HOOKED_pos*HOOKED_size);   // add the offset from HOOKED_pos to the centre of its texel
+	  return SK(length(v*SD)*SS);   // distance with per-axis distortion SD, scaled by sigma SS
+}
+#else
+#define spatial_r(v) (1)
+#endif
+
+#if PST && P >= PST
+#define spatial_p(v) PSK(length(v*PSD)*PSS)
+#else
+#define spatial_p(v) (1)
+#endif
+
+val range(val pdiff_sq) // range-kernel (RK) weight for a patch difference, applied per component
+{
+	  const float h = max(S, 0.0) * 0.013;   // negative S is treated as 0; NOTE(review): S == 0 makes the next line divide by zero
+	  const float pdiff_scale = 1.0/(h*h);  
+	  pdiff_sq = sqrt(pdiff_sq * pdiff_scale);   // after this line the variable holds sqrt(pdiff_sq)/h, not a squared value
+#if defined(LUMA_raw)
+	  return RK(pdiff_sq);  
+#elif defined(CHROMA_raw)
+	  return vec2(RK(pdiff_sq.x), RK(pdiff_sq.y));  
+#else
+	  return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z));  
+#endif
+}
+
+val patch_comparison(vec3 r, vec3 r2) // scaled sum of squared differences between the patch at r2 and the patch at r
+{
+	  vec3 p = vec3(0); // FOR_PATCH only writes p.x/p.y, so p.z must start defined (it is read below)
+	  val min_rot = val(p_area); // starting value for the running minimum over rotations/reflections
+
+	  FOR_ROTATION FOR_REFLECTION {
+	  	  val pdiff_sq = val(0);
+	  	  FOR_PATCH(p) {
+	  	  	  vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z);
+	  	  	  val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF);
+	  	  	  diff_sq *= diff_sq;
+	  	  	  diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy); // intra-patch spatial weighting; identity when spatial_p is 1
+	  	  	  pdiff_sq += diff_sq;
+	  	  }
+	  	  min_rot = min(min_rot, pdiff_sq); // keep the most similar transform
+	  }
+
+	  return min_rot * p_scale; // p_scale is 1.0/p_area
+}
+
+#define NO_GATHER (PD == 0 && NG == 0 && SAMPLE == 0) // never textureGather if any of these conditions are false
+#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3 || RI == 7)
+
+#if (defined(LUMA_gather) || D1W) && ((PS == 0 || ((PS == 3 || PS == 7) && RI != 7) || PS == 8) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER
+// 3x3 diamond/plus patch_comparison_gather
+// XXX extend to support arbitrary sizes (probably requires code generation)
+// XXX support PSS
+const ivec2 offsets_adj[4] = { ivec2(0,-1), ivec2(1,0), ivec2(0,1), ivec2(-1,0) };  
+const ivec2 offsets_adj_sf[4] = { ivec2(0,-1) * SF, ivec2(1,0) * SF, ivec2(0,1) * SF, ivec2(-1,0) * SF };  
+vec4 poi_patch_adj = gather_offs(0, offsets_adj);  
+#if PS == 0 || PS == 8
+const ivec2 offsets_diag[4] = { ivec2(-1,-1), ivec2(1,-1), ivec2(1,1), ivec2(-1,1) };  
+const ivec2 offsets_diag_sf[4] = { ivec2(-1,-1) * SF, ivec2(1,-1) * SF, ivec2(1,1) * SF, ivec2(-1,1) * SF };  
+vec4 poi_patch_diag = gather_offs(0, offsets_diag);  
+#endif
+float patch_comparison_gather(vec3 r, vec3 r2) // 3x3 patch SSD between the POI and offset r via textureGatherOffsets; r2 is unused
+{
+	  float min_rot = p_area - 1; // the centre pixel is added separately at the end
+	  vec4 transformer_adj = gather_offs(r, offsets_adj_sf);
+#if PS == 0 || PS == 8
+	  vec4 transformer_diag = gather_offs(r, offsets_diag_sf);
+#endif
+	  FOR_ROTATION {
+	  	  FOR_REFLECTION {
+#if RFI
+	  	  	  /* xxy
+	  	  	   * w y
+	  	  	   * wzz
+	  	  	   */
+	  	  	  switch(rfi) {
+	  	  	  case 1:
+	  	  	  	  transformer_adj = transformer_adj.zyxw;
+#if PS == 0 || PS == 8
+	  	  	  	  transformer_diag = transformer_diag.zyxw;
+#endif
+	  	  	  	  break;
+	  	  	  case 2:
+	  	  	  	  transformer_adj = transformer_adj.xwzy;
+#if PS == 0 || PS == 8
+	  	  	  	  transformer_diag = transformer_diag.xwzy;
+#endif
+	  	  	  	  break;
+	  	  	  }
+#endif
+	  	  	  // square each difference on its own: summing adjacent+diagonal first would let opposite signs cancel
+	  	  	  vec4 diff_sq4 = POW2(poi_patch_adj - transformer_adj);
+#if PS == 0 || PS == 8
+	  	  	  diff_sq4 += POW2(poi_patch_diag - transformer_diag);
+#endif
+	  	  	  float diff_sq = dot(diff_sq4, vec4(1)); // horizontal sum of the squared differences
+	  	  	  min_rot = min(diff_sq, min_rot);
+
+// un-reflect
+#if RFI
+	  	  	  switch(rfi) {
+	  	  	  case 1:
+	  	  	  	  transformer_adj = transformer_adj.zyxw;
+#if PS == 0 || PS == 8
+	  	  	  	  transformer_diag = transformer_diag.zyxw;
+#endif
+	  	  	  	  break;
+	  	  	  case 2:
+	  	  	  	  transformer_adj = transformer_adj.xwzy;
+#if PS == 0 || PS == 8
+	  	  	  	  transformer_diag = transformer_diag.xwzy;
+#endif
+	  	  	  	  break;
+	  	  	  }
+#endif
+	  	  } // FOR_REFLECTION
+#if RI == 7
+	  	  transformer_adj = transformer_adj.wxyz;
+	  	  // swap adjacents for diagonals
+	  	  transformer_adj += transformer_diag;
+	  	  transformer_diag = transformer_adj - transformer_diag;
+	  	  transformer_adj -= transformer_diag;
+#elif RI == 3
+	  	  transformer_adj = transformer_adj.wxyz;
+#elif RI == 1
+	  	  transformer_adj = transformer_adj.zwxy;
+#endif
+#if RI == 3 && (PS == 0 || PS == 8)
+	  	  transformer_diag = transformer_diag.wxyz;
+#elif RI == 1 && (PS == 0 || PS == 8)
+	  	  transformer_diag = transformer_diag.zwxy;
+#endif
+	  } // FOR_ROTATION
+	  float center_diff = poi2.x - load2(r).x; // the centre pixel is unaffected by rotation/reflection
+	  return (center_diff * center_diff + min_rot) * p_scale;
+}
+#elif (defined(LUMA_gather) || D1W) && PS == 4 && P == 3 && RI == 0 && RFI == 0 && NO_GATHER
+const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,0), ivec2(1,0) };  
+const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,0) * SF, ivec2(1,0) * SF };  
+vec4 poi_patch = gather_offs(0, offsets);  
+float patch_comparison_gather(vec3 r, vec3 r2) // 4-sample patch SSD using the `offsets` table; r2 is unused
+{
+	  vec4 pdiff = poi_patch - gather_offs(r, offsets_sf);   // one gather fetches all four patch samples at offset r
+	  return dot(pdiff * pdiff, vec4(1)) * p_scale;   // sum of squares, scaled by 1.0/p_area
+}
+#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER
+// tiled even square patch_comparison_gather
+// XXX extend to support odd square?
+float patch_comparison_gather(vec3 r, vec3 r2) // even-square patch SSD between offsets r and r2, gathered in 2x2 tiles
+{
+	  vec2 tile;  
+	  float min_rot = p_area;   // starting value for the min() below
+
+	  /* gather order:
+	   * w z
+	   * x y
+	   */
+	  float pdiff_sq = 0;  
+	  for (tile.x = -hp;   tile.x < hp;   tile.x+=2) for (tile.y = -hp;   tile.y < hp;   tile.y+=2) {
+	  	  vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy);  
+	  	  diff_sq *= diff_sq;  
+	  	  diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)),
+	  	  	                                   spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0)));  
+	  	  pdiff_sq += dot(diff_sq, vec4(1));   // add the four squared differences of this tile
+	  }
+	  min_rot = min(min_rot, pdiff_sq);  
+
+	  return min_rot * p_scale;   // p_scale is 1.0/p_area
+}
+#else
+#define patch_comparison_gather patch_comparison
+#endif
+
+vec4 hook() // main NLM pass: weighted average over the research zone, then optional WD / AS / EP / visualisation
+{
+	  val total_weight = val(0);
+	  val sum = val(0);
+	  val result = val(0);
+
+	  vec3 r = vec3(0);
+	  vec3 p = vec3(0);
+	  vec3 me = vec3(0);
+
+#if T && ME == 1 // temporal & motion estimation
+	  vec3 me_tmp = vec3(0);
+	  float maxweight = 0;
+#elif T && ME == 2 // temporal & motion estimation
+	  vec3 me_sum = vec3(0);
+	  float me_weight = 0;
+#endif
+
+#if AS
+	  val total_weight_s = val(0);
+	  val sum_s = val(0);
+#endif
+
+#if WD == 2 // weight discard (mean)
+	  int r_index = 0;
+	  val_packed all_weights[r_area];
+	  val_packed all_pixels[r_area];
+#elif WD == 1 // weight discard (moving cumulative average)
+	  int r_iter = 1;
+	  val wd_total_weight = val(0);
+	  val wd_sum = val(0);
+#endif
+
+	  FOR_FRAME(r) {
+	  // XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
+#if T && ME == 1 // temporal & motion estimation max weight
+	  if (r.z > 0) {
+	  	  me += me_tmp * MEF;
+	  	  me_tmp = vec3(0);
+	  	  maxweight = 0;
+	  }
+#elif T && ME == 2 // temporal & motion estimation weighted average
+	  if (r.z > 0) {
+	  	  me += round(me_sum / me_weight * MEF);
+	  	  me_sum = vec3(0);
+	  	  me_weight = 0;
+	  }
+#endif
+	  FOR_RESEARCH(r) {
+	  	  // r coords with appropriate transformations applied
+	  	  vec3 tr = vec3(r.xy + floor(r.xy * RSF), r.z);
+	  	  float spatial_weight = spatial_r(tr);
+	  	  tr.xy += me.xy;
+
+	  	  val px = load(tr);
+
+#if SKIP_PATCH
+	  	  val weight = val(1);
+#else
+	  	  val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(tr, vec3(0))) : patch_comparison(tr, vec3(0));
+	  	  val weight = range(pdiff_sq);
+#endif
+
+#if T && ME == 1 // temporal & motion estimation max weight
+	  	  me_tmp = vec3(tr.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x));
+	  	  maxweight = max(maxweight, weight.x);
+#elif T && ME == 2 // temporal & motion estimation weighted average
+	  	  me_sum += vec3(tr.xy,0) * weight.x;
+	  	  me_weight += weight.x;
+#endif
+
+#if D1W
+	  	  weight = val(weight.x);
+#endif
+
+	  	  weight *= spatial_weight;
+
+#if AS
+	  	  spatial_weight *= int(r.z == 0); // ignore temporal
+	  	  sum_s += px * spatial_weight;
+	  	  total_weight_s += spatial_weight;
+#endif
+
+#if WD == 2 // weight discard (mean)
+	  	  all_weights[r_index] = val_pack(weight);
+	  	  all_pixels[r_index] = val_pack(px);
+	  	  r_index++;
+#elif WD == 1 // weight discard (moving cumulative average)
+	  	  val wd_scale = val(1.0/r_iter);
+	  	  val below_threshold = WDS * abs(min(val(0.0), weight - (total_weight * wd_scale * WDT * WD1TK(sqrt(wd_scale*WDP)))));
+#if defined(LUMA_raw)
+	  	  val wdkf = WDK(below_threshold);
+#elif defined(CHROMA_raw)
+	  	  val wdkf = vec2(WDK(below_threshold.x), WDK(below_threshold.y));
+#else
+	  	  val wdkf = vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); // third channel uses .z, matching the WD == 2 path
+#endif
+	  	  wd_sum += px * weight * wdkf;
+	  	  wd_total_weight += weight * wdkf;
+	  	  r_iter++;
+#endif
+
+	  	  sum += px * weight;
+	  	  total_weight += weight;
+	  } // FOR_RESEARCH
+	  } // FOR_FRAME
+
+	  val avg_weight = total_weight * r_scale;
+	  val old_avg_weight = avg_weight; // kept for the pre-WD visualisation (V == 4)
+
+#if WD == 2 // weight discard (mean)
+	  total_weight = val(0);
+	  sum = val(0);
+
+	  for (int i = 0; i < r_area; i++) {
+	  	  val weight = val_unpack(all_weights[i]);
+	  	  val px = val_unpack(all_pixels[i]);
+
+	  	  val below_threshold = WDS * abs(min(val(0.0), weight - (avg_weight * WDT)));
+#if defined(LUMA_raw)
+	  	  weight *= WDK(below_threshold);
+#elif defined(CHROMA_raw)
+	  	  weight *= vec2(WDK(below_threshold.x), WDK(below_threshold.y));
+#else
+	  	  weight *= vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z));
+#endif
+
+	  	  sum += px * weight;
+	  	  total_weight += weight;
+	  }
+#elif WD == 1 // weight discard (moving cumulative average)
+	  total_weight = wd_total_weight;
+	  sum = wd_sum;
+#endif
+#if WD // weight discard
+	  avg_weight = total_weight * r_scale;
+#endif
+
+	  total_weight += SW * spatial_r(vec3(0)); // add the pixel-of-interest itself with starting weight SW
+	  sum += poi * SW * spatial_r(vec3(0));
+	  result = val(sum / total_weight);
+
+	  // store frames for temporal
+#if T > 1
+
+#endif
+#if T && TRF
+	  imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result));
+#elif T
+	  imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2));
+#endif
+
+#if AS == 1 // sharpen+denoise
+#define AS_base result
+#elif AS == 2 // sharpen only
+#define AS_base poi
+#endif
+#if AS
+	  val usm = result - sum_s/total_weight_s;
+	  usm = exp(log(abs(usm))*ASP) * sign(usm); // avoiding pow() since it's buggy on nvidia
+	  usm *= gaussian(abs((AS_base + usm - 0.5) / 1.5) * ASA);
+	  usm *= ASF;
+	  result = AS_base + usm;
+#endif
+
+#if EP // extremes preserve
+	  float luminance = EP_texOff(0).x;
+	  // EPSILON is needed since pow(0,0) is undefined
+	  float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP));
+	  result = mix(poi, result, ep_weight);
+#else
+	  float ep_weight = 0;
+#endif
+
+#if V == 1
+	  result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0);
+#elif V == 2
+	  result = (poi - result) * 0.5 + 0.5;
+#elif V == 3 // post-WD weight map
+	  result = avg_weight;
+#elif V == 4 // pre-WD edge map
+	  result = old_avg_weight;
+#elif V == 5
+	  result = 0.5 + usm;
+#elif V == 6
+	  result = val(1 - ep_weight);
+#endif
+
+// XXX visualize chroma for these
+#if defined(CHROMA_raw) && (V == 3 || V == 4 || V == 6)
+	  return vec4(0.5);
+#endif
+
+	  return unval(mix(poi, result, BF)); // BF blends between the input (0.0) and the filtered result (1.0)
+}
+
+// End of source code injected from ../LQ/nlmeans.glsl 
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!BIND _INJ_RF_LUMA
+//!WIDTH _INJ_RF_LUMA.w
+//!HEIGHT _INJ_RF_LUMA.h
+//!DESC Non-local means (RF, share)
+//!SAVE _INJ_RF
+
+vec4 hook() // pass-through stage: returns the bound _INJ_RF_LUMA texel at offset 0 unchanged
+{
+return _INJ_RF_LUMA_texOff(0);
+}
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!BIND HOOKED
+//!BIND _INJ_RF_LUMA
+//!BIND _INJ_RF
+//!DESC Non-local means (nlmeans.glsl)
+//!SAVE RF_LUMA
+
+// User variables
+
+// It is generally preferable to denoise luma and chroma differently, so the 
+// user variables for luma and chroma are split.
+
+// Denoising factor (level of blur, higher means more blur)
+#ifdef LUMA_raw
+#define S 2.0522687499802097
+#else
+#define S 2.5168955531436197
+#endif
+
+/* Adaptive sharpening
+ *
+ * Performs an unsharp mask by subtracting the spatial kernel's blur from the 
+ * NLM blur. For sharpen+denoise the sharpening is limited to edge areas and 
+ * denoising is done everywhere else.
+ *
+ * Use V=4 to visualize which areas are sharpened (black means sharpen).
+ *
+ * AS:
+ * 	 - 0: disable
+ * 	 - 1: sharpen+denoise
+ * 	 - 2: sharpen only
+ * ASF: Higher numbers make a sharper image
+ * ASA: Anti-ringing, higher numbers increase strength
+ * ASP: Power, lower numbers increase sharpening on lower frequency detail
+ */
+#ifdef LUMA_raw
+#define AS 0
+#define ASF 0.1625
+#define ASA 5.0
+#define ASP 0.5
+#else
+#define AS 0
+#define ASF 0.1625
+#define ASA 5.0
+#define ASP 0.5
+#endif
+
+/* Starting weight
+ *
+ * Also known as the center weight. This represents the weight of the 
+ * pixel-of-interest. Lower numbers may help handle heavy noise & ringing.
+ *
+ * EPSILON should be used instead of zero to avoid divide-by-zero errors.
+ */
+#ifdef LUMA_raw
+#define SW 1.3011446081346498
+#else
+#define SW 1.2219854377433914
+#endif
+
+/* Weight discard
+ *
+ * Reduces weights that fall below a fraction of the average weight. This culls 
+ * the most dissimilar samples from the blur, which can yield a better result, 
+ * especially around edges.
+ * 
+ * WD:
+ * 	 - 2: Mean. Better quality, but slower and requires GLSL 4.0 or later
+ * 	 - 1: Moving cumulative average. Fast but inaccurate, blurs directionally.
+ * 	 - 0: Disable
+ *
+ * WDT: Threshold coefficient, higher numbers discard more
+ * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes
+ * WDS (not for WDK=is_zero): Higher numbers are more eager to reduce weights
+ */
+#ifdef LUMA_raw
+#define WD 2
+#define WDT 0.11671341022864548
+#define WDP 5.381278367349288
+#define WDS 1.0
+#else
+#define WD 0
+#define WDT 0.002713346103131793
+#define WDP 5.832936323930807
+#define WDS 1.0
+#endif
+
+/* Extremes preserve
+ *
+ * Reduce denoising in very bright/dark areas.
+ *
+ * Disabled by default now. If you want to reenable this, set EP=3/ in 
+ * Makefile.nlm and rebuild.
+ *
+ * The downscaling factor of the EP shader stage affects what is considered a 
+ * bright/dark area.
+ *
+ * This is incompatible with RGB. If you have RGB hooks enabled then you will 
+ * have to delete the EP shader stage or specify EP=0 through shader_cfg.
+ *
+ * EP: 1 to enable, 0 to disable
+ * DP: EP strength on dark areas, 0 to fully denoise
+ * BP: EP strength on bright areas, 0 to fully denoise
+ */
+#ifdef LUMA_raw
+#define EP 0
+#define BP 0.75
+#define DP 0.25
+#else
+#define EP 0
+#define BP 0.0
+#define DP 0.0
+#endif
+
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+
+/* textureGather applicable configurations:
+ *
+ * - PS={0,3,7,8}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}
+ * - PS={0,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2}
+ * - PS=6:RI=0:RFI=0
+ *   - Currently the only scalable variant
+ *
+ * Options which always disable textureGather:
+ * 	 - NG
+ * 	 - SAMPLE
+ * 	 - PD
+ *
+ * Running without textureGather may be much slower.
+ */
+
+/* Patch & research sizes
+ *
+ * P should be an odd number. Higher values are slower and not always better.
+ *
+ * R should be an odd number greater than or equal to 3. Higher values are 
+ * generally better, but slower, blurrier, and gives diminishing returns.
+ */
+#ifdef LUMA_raw
+#define P 3
+#define R 5
+#else
+#define P 3
+#define R 5
+#endif
+
+/* Patch and research shapes
+ *
+ * Different shapes have different speed and quality characteristics. Every 
+ * shape (besides square) is smaller than square.
+ *
+ * PS applies to patches, RS applies to research zones.
+ *
+ * 0: square (symmetrical)
+ * 1: horizontal line (asymmetric)
+ * 2: vertical line (asymmetric)
+ * 3: diamond (symmetrical)
+ * 4: triangle (asymmetric, pointing upward)
+ * 5: truncated triangle (asymmetric on two axes, last row halved)
+ * 6: even sized square (asymmetric on two axes)
+ * 7: plus (symmetrical)
+ * 8: plus X (symmetrical)
+ */
+#ifdef LUMA_raw
+#define RS 3
+#define PS 3
+#else
+#define RS 3
+#define PS 3
+#endif
+
+/* Robust filtering
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
+ *
+ * Computes weights on a guide, which could be a downscaled image or the output 
+ * of another shader, and applies the weights to the original image
+ */
+#define _INJ_RF_LUMA 1
+#define RF 1
+
+/* Rotational/reflectional invariance
+ *
+ * Number of rotations/reflections to try for each patch comparison. Can be 
+ * slow, but improves feature preservation. More rotations/reflections gives 
+ * diminishing returns. The most similar rotation/reflection will be used.
+ *
+ * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
+ * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
+ *
+ * Consider setting SAMPLE=1 if setting RI to a setting that would require 
+ * sampling between pixels.
+ *
+ * RI: Rotational invariance
+ * RFI (0 to 2): Reflectional invariance
+ */
+#ifdef LUMA_raw
+#define RI 0
+#define RFI 2
+#else
+#define RI 0
+#define RFI 0
+#endif
+
+/* Temporal denoising
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
+ *
+ * Caveats:
+ * 	 - Slower:
+ * 	 	 - Each frame needs to be researched (more samples & more math)
+ * 	 	 - Gather optimizations only apply to the current frame
+ * 	 - Requires vo=gpu-next
+ * 	 - Luma-only (this is a bug)
+ * 	 - Buggy
+ *
+ * May cause motion blur and may struggle more with noise that persists across 
+ * multiple frames (e.g., from compression or duplicate frames), but can work 
+ * very well on high quality video.
+ *
+ * Motion estimation (ME) should improve quality without impacting speed.
+ *
+ * T: number of frames used
+ * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg
+ * MEF: estimate factor, compensates for ME being one frame behind
+ * TRF: compare against the denoised frames
+ */
+#ifdef LUMA_raw
+#define T 0
+#define ME 1
+#define MEF 2
+#define TRF 0
+#else
+#define T 0
+#define ME 0
+#define MEF 2
+#define TRF 0
+#endif
+
+/* Spatial kernel
+ *
+ * Increasing the spatial denoising factor (SS) reduces the weight of further 
+ * pixels.
+ *
+ * Spatial distortion instructs the spatial kernel to view that axis as 
+ * closer/further, for instance SD=(1,1,0.5) would make the temporal axis 
+ * appear closer and increase blur between frames.
+ *
+ * The intra-patch variants are supposed to help with larger patch sizes.
+ *
+ * SST: enables spatial kernel if R>=SST, 0 fully disables
+ * SS: spatial sigma
+ * SD: spatial distortion (X, Y, time)
+ * PSS: intra-patch spatial sigma
+ * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables
+ * PSD: intra-patch spatial distortion (X, Y)
+ */
+#ifdef LUMA_raw
+#define SST 1
+#define SS 0.5296176863733414
+#define SD vec3(1,1,1)
+#define PST 0
+#define PSS 0.0
+#define PSD vec2(1,1)
+#else
+#define SST 1
+#define SS 0.26295970436981203
+#define SD vec3(1,1,1)
+#define PST 0
+#define PSS 0.0
+#define PSD vec2(1,1)
+#endif
+
+/* Kernels
+ *
+ * SK: spatial kernel
+ * RK: range kernel (takes patch differences)
+ * PSK: intra-patch spatial kernel
+ * WDK: weight discard kernel
+ * WD1TK (WD=1 only): weight discard tolerance kernel
+ *
+ * List of available kernels:
+ *
+ * bicubic
+ * cos
+ * gaussian
+ * lanczos
+ * quadratic_ (unclamped)
+ * sinc
+ * sinc_ (unclamped)
+ * sinc3
+ * sphinx
+ * sphinx_ (unclamped)
+ * triangle_ (unclamped)
+ * triangle
+ */
+#ifdef LUMA_raw
+#define SK gaussian
+#define RK gaussian
+#define PSK gaussian
+#define WDK is_zero
+#define WD1TK gaussian
+#else
+#define SK gaussian
+#define RK gaussian
+#define PSK gaussian
+#define WDK is_zero
+#define WD1TK gaussian
+#endif
+
+/* Sampling method
+ *
+ * In most cases this shouldn't make any difference, only set to bilinear if 
+ * it's necessary to sample between pixels (e.g., RI=2).
+ *
+ * 0: nearest neighbor
+ * 1: bilinear
+ */
+#ifdef LUMA_raw
+#define SAMPLE 0
+#else
+#define SAMPLE 0
+#endif
+
+/* Research scaling factor
+ *
+ * Higher numbers sample more sparsely as the distance from the POI grows.
+ */
+#ifdef LUMA_raw
+#define RSF 0.0
+#else
+#define RSF 0.0
+#endif
+
+// Scaling factor (should match WIDTH/HEIGHT)
+#ifdef LUMA_raw
+#define SF 1
+#else
+#define SF 1
+#endif
+
+// Use the guide image as the input image
+#ifdef LUMA_raw
+#define GUIDE_INPUT 0
+#else
+#define GUIDE_INPUT 0
+#endif
+
+/* Visualization
+ *
+ * 0: off
+ * 1: absolute difference between input/output to the power of 0.25
+ * 2: difference between input/output centered on 0.5
+ * 3: post-WD weight map
+ * 4: pre-WD weight map
+ * 5: unsharp mask
+ * 6: EP
+ */
+#ifdef LUMA_raw
+#define V 0
+#else
+#define V 0
+#endif
+
+// Blur factor (0.0 returns the input image, 1.0 returns the output image)
+#ifdef LUMA_raw
+#define BF 1.0
+#else
+#define BF 1.0
+#endif
+
+// Force disable textureGather
+#ifdef LUMA_raw
+#define NG 0
+#else
+#define NG 0
+#endif
+
+// Patch donut (probably useless)
+#ifdef LUMA_raw
+#define PD 0
+#else
+#define PD 0
+#endif
+
+// Duplicate 1st weight (for luma-guided-chroma)
+#ifdef LUMA_raw
+#define D1W 0
+#else
+#define D1W 0
+#endif
+
+// Skip patch comparison
+#ifdef LUMA_raw
+#define SKIP_PATCH 0
+#else
+#define SKIP_PATCH 0
+#endif
+
+// Shader code
+
+#define EPSILON 1.2e-38
+#define M_PI 3.14159265358979323846
+#define POW2(x) ((x)*(x))
+#define POW3(x) ((x)*(x)*(x))
+#define bicubic_(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0))))
+#define bicubic(x) bicubic_(clamp((x), 0.0, 2.0))
+#define gaussian(x) exp(-1 * POW2(x))
+#define quadratic_(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5))
+#define quadratic(x) quadratic_(clamp((x), 0.0, 1.5))
+#define sinc_(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI))
+#define sinc(x) sinc_(clamp((x), 0.0, 1.0))
+#define sinc3(x) sinc_(clamp((x), 0.0, 3.0))
+#define lanczos(x) (sinc3(x) * sinc(x))
+#define sphinx_(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI))
+#define sphinx(x) sphinx_(clamp((x), 0.0, 1.4302966531242027))
+#define triangle_(x) (1 - (x))
+#define triangle(x) triangle_(clamp((x), 0.0, 1.0))
+#define is_zero(x) int(x == 0)
+
+// XXX could maybe be better optimized on LGC
+#if defined(LUMA_raw)
+#define val float
+#define val_swizz(v) (v.x)
+#define unval(v) vec4(v.x, 0, 0, poi_.a)
+#define val_packed val
+#define val_pack(v) (v)
+#define val_unpack(v) (v)
+#elif defined(CHROMA_raw)
+#define val vec2
+#define val_swizz(v) (v.xy)
+#define unval(v) vec4(v.x, v.y, 0, poi_.a)
+#define val_packed uint
+#define val_pack(v) packUnorm2x16(v)
+#define val_unpack(v) unpackUnorm2x16(v)
+#else
+#define val vec3
+#define val_swizz(v) (v.xyz)
+#define unval(v) vec4(v.x, v.y, v.z, poi_.a)
+#define val_packed val
+#define val_pack(v) (v)
+#define val_unpack(v) (v)
+#endif
+
+#if PS == 6
+const int hp = P/2; 
+#else
+const float hp = int(P/2) - 0.5*(1-(P%2));  // sample between pixels for even patch sizes
+#endif
+
+#if RS == 6
+const int hr = R/2; 
+#else
+const float hr = int(R/2) - 0.5*(1-(R%2));  // sample between pixels for even research sizes
+#endif
+
+// patch/research shapes
+// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R)
+// dots (.) represent samples (pixels) and X represents the pixel-of-interest
+
+// Z    .....
+// Z    .....
+// Z    ..X..
+// Z    .....
+// Z    .....
+#define S_SQUARE(z,hz,incr) for (z.x = -hz;  z.x <= hz;  z.x++) for (z.y = -hz;  z.y <= hz;  incr)
+
+// (in this instance Z=4)
+// Z    ....
+// Z    ....
+// Z    ..X.
+// Z    ....
+#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz;  z.x < hz;  z.x++) for (z.y = -hz;  z.y < hz;  incr)
+
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
+#define S_TRIANGLE(z,hz,incr) for (z.y = -hz;  z.y <= 0;  z.y++) for (z.x = -abs(abs(z.y) - hz);  z.x <= abs(abs(z.y) - hz);  incr)
+
+// Z-4    .
+// Z-2   ...
+// hz+1 ..X
+#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz;  z.y <= 0;  z.y++) for (z.x = -abs(abs(z.y) - hz);  z.x <= abs(abs(z.y) - hz)*int(z.y!=0);  incr)
+#define S_TRIANGLE_A(hz,Z) int(hz*hz+Z)
+
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
+// Z-2   ...
+// Z-4    .
+#define S_DIAMOND(z,hz,incr) for (z.x = -hz;  z.x <= hz;  z.x++) for (z.y = -abs(abs(z.x) - hz);  z.y <= abs(abs(z.x) - hz);  incr)
+#define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z)
+
+//
+// Z    ..X..
+//
+#define S_HORIZONTAL(z,hz,incr) for (z.y = 0;  z.y <= 0;  z.y++) for (z.x = -hz;  z.x <= hz;  incr)
+
+// 90 degree rotation of S_HORIZONTAL
+#define S_VERTICAL(z,hz,incr) for (z.x = 0;  z.x <= 0;  z.x++) for (z.y = -hz;  z.y <= hz;  incr)
+
+// 1      .
+// 1      . 
+// Z    ..X..
+// 1      . 
+// 1      .
+#define S_PLUS(z,hz,incr) for (z.x = -hz;  z.x <= hz;  z.x++) for (z.y = -hz * int(z.x == 0);  z.y <= hz * int(z.x == 0);  incr)
+#define S_PLUS_A(hz,Z) (Z*2 - 1)
+
+// 3    . . .
+// 3     ...
+// Z    ..X..
+// 3     ...
+// 3    . . .
+#define S_PLUS_X(z,hz,incr) for (z.x = -hz;  z.x <= hz;  z.x++) for (z.y = -abs(z.x) + -hz * int(z.x == 0);  z.y <= abs(z.x) + hz * int(z.x == 0);  incr)
+#define S_PLUS_X_A(hz,Z) (Z*4 - 3)
+
+// 1x1 square
+#define S_1X1(z) for (z = vec3(0);  z.x <= 0;  z.x++)
+
+#define T1 (T+1)
+#define FOR_FRAME(r) for (r.z = 0;  r.z < T1;  r.z++)
+
+#ifdef LUMA_raw
+#define _INJ_RF_ _INJ_RF_LUMA
+#else
+#define _INJ_RF_ RF
+#endif
+
+// donut increment, increments without landing on (0,0,0)
+// much faster than a continue statement
+#define DINCR(z,c,a) ((z.c += a),(z.c += int(z == vec3(0))))
+
+#define R_AREA(a) (a * T1 - 1)
+
+// research shapes
+// XXX would be nice to have the option of temporally-varying research sizes
+#if R == 0 || R == 1
+#define FOR_RESEARCH(r) S_1X1(r)
+const int r_area = R_AREA(1); 
+#elif RS == 8
+#define FOR_RESEARCH(r) S_PLUS_X(r,hr,DINCR(r,y,max(1,abs(r.x))))
+const int r_area = R_AREA(S_PLUS_X_A(hr,R)); 
+#elif RS == 7
+#define FOR_RESEARCH(r) S_PLUS(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(S_PLUS_A(hr,R)); 
+#elif RS == 6
+#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(R*R); 
+#elif RS == 5
+#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,DINCR(r,x,1))
+const int r_area = R_AREA(S_TRIANGLE_A(hr,hr)); 
+#elif RS == 4
+#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,DINCR(r,x,1))
+const int r_area = R_AREA(S_TRIANGLE_A(hr,R)); 
+#elif RS == 3
+#define FOR_RESEARCH(r) S_DIAMOND(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(S_DIAMOND_A(hr,R)); 
+#elif RS == 2
+#define FOR_RESEARCH(r) S_VERTICAL(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(R); 
+#elif RS == 1
+#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,DINCR(r,x,1))
+const int r_area = R_AREA(R); 
+#elif RS == 0
+#define FOR_RESEARCH(r) S_SQUARE(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(R*R); 
+#endif
+
+#define RI1 (RI+1)
+#define RFI1 (RFI+1)
+
+#if RI
+#define FOR_ROTATION for (float ri = 0;  ri < 360;  ri+=360.0/RI1)
+#else
+#define FOR_ROTATION
+#endif
+
+#if RFI
+#define FOR_REFLECTION for (int rfi = 0;  rfi < RFI1;  rfi++)
+#else
+#define FOR_REFLECTION
+#endif
+
+#if PD
+#define PINCR DINCR
+#else
+#define PINCR(z,c,a) (z.c += a)
+#endif
+
+#define P_AREA(a) (a - PD)
+
+// patch shapes
+#if P == 0 || P == 1
+#define FOR_PATCH(p) S_1X1(p)
+const int p_area = P_AREA(1); 
+#elif PS == 8
+#define FOR_PATCH(p) S_PLUS_X(p,hp,PINCR(p,y,max(1,abs(p.x))))
+const int p_area = P_AREA(S_PLUS_X_A(hp,P)); 
+#elif PS == 7
+#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(S_PLUS_A(hp,P)); 
+#elif PS == 6
+#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(P*P); 
+#elif PS == 5
+#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x,1))
+const int p_area = P_AREA(S_TRIANGLE_A(hp,hp)); 
+#elif PS == 4
+#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x,1))
+const int p_area = P_AREA(S_TRIANGLE_A(hp,P)); 
+#elif PS == 3
+#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(S_DIAMOND_A(hp,P)); 
+#elif PS == 2
+#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(P); 
+#elif PS == 1
+#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x,1))
+const int p_area = P_AREA(P); 
+#elif PS == 0
+#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(P*P); 
+#endif
+
+const float r_scale = 1.0/r_area; 
+const float p_scale = 1.0/p_area; 
+
+#if SAMPLE == 0
+#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * (vec2(off) + 0.5 - fract((pos) * (size))))
+#else
+#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * vec2(off))
+#endif
+
+#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off)
+
+#if _INJ_RF_ && defined(LUMA_raw)
+#define load2_(off) sample(_INJ_RF_LUMA_tex, _INJ_RF_LUMA_pos, _INJ_RF_LUMA_size, _INJ_RF_LUMA_pt, off)
+#define gather_offs(off, off_arr) (_INJ_RF_LUMA_mul * vec4(textureGatherOffsets(_INJ_RF_LUMA_raw, _INJ_RF_LUMA_pos + vec2(off) * _INJ_RF_LUMA_pt, off_arr)))
+#define gather(off) _INJ_RF_LUMA_gather(_INJ_RF_LUMA_pos + (off) * _INJ_RF_LUMA_pt, 0)
+#elif _INJ_RF_ && D1W
+#define load2_(off) sample(_INJ_RF_tex, _INJ_RF_pos, _INJ_RF_size, _INJ_RF_pt, off)
+#define gather_offs(off, off_arr) (_INJ_RF_mul * vec4(textureGatherOffsets(_INJ_RF_raw, _INJ_RF_pos + vec2(off) * _INJ_RF_pt, off_arr)))
+#define gather(off) _INJ_RF_gather(_INJ_RF_pos + (off) * _INJ_RF_pt, 0)
+#elif _INJ_RF_
+#define load2_(off) sample(_INJ_RF_tex, _INJ_RF_pos, _INJ_RF_size, _INJ_RF_pt, off)
+#else
+#define load2_(off) load_(off)
+#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr)))
+#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0)
+#endif
+
+#if T
+val load(vec3 off)
+{
+	 switch (min(int(off.z), frame)) {
+	 case 0: return val_swizz(load_(off)); 
+
+	 }
+}
+val load2(vec3 off)
+{
+	 return off.z == 0 ? val_swizz(load2_(off)) : load(off); 
+}
+#else
+#define load(off) val_swizz(load_(off))
+#define load2(off) val_swizz(load2_(off))
+#endif
+
+val poi2 = load2(vec3(0));  // guide pixel-of-interest
+#if GUIDE_INPUT
+#define poi poi2
+#else
+vec4 poi_ = load_(vec3(0)); 
+val poi = val_swizz(poi_);  // pixel-of-interest
+#endif
+
+#if RI // rotation
+vec2 rot(vec2 p, float d)  // rotates p about the origin by d degrees
+{
+	 return vec2(
+	 	 p.x * cos(radians(d)) - p.y * sin(radians(d)),
+	 	 p.x * sin(radians(d)) + p.y * cos(radians(d))  // y' = x*sin + y*cos, so d = 0 yields p unchanged
+	 ); 
+}
+#else
+#define rot(p, d) (p)
+#endif
+
+#if RFI // reflection
+vec2 ref(vec2 p, int d)  // reflects p: d == 1 negates y, d == 2 negates x, anything else leaves p as is
+{
+	 switch (d) {
+	 case 1: return p * vec2(1, -1); 
+	 case 2: return p * vec2(-1, 1); 
+	 }
+	 return p;  // d == 0, and a defined result for out-of-range d instead of falling off the end
+}
+#else
+#define ref(p, d) (p)
+#endif
+
+#if SST && R >= SST
+float spatial_r(vec3 v)  // spatial-kernel weight for research offset v (x, y, frame)
+{
+	 vec2 to_texel_center = 0.5 - fract(HOOKED_pos*HOOKED_size);  // from the current position to the centre of its texel
+	 return SK(length(vec3(v.xy + to_texel_center, v.z)*SD)*SS); 
+}
+#else
+#define spatial_r(v) (1)
+#endif
+
+#if PST && P >= PST
+#define spatial_p(v) PSK(length(v*PSD)*PSS)
+#else
+#define spatial_p(v) (1)
+#endif
+
+val range(val pdiff_sq)  // maps a squared patch difference to a per-channel range-kernel weight
+{
+	 const float strength = max(S, 0.0) * 0.013;  // negative S is clamped to 0
+	 const float inv_strength_sq = 1.0/(strength*strength); 
+	 val dist = sqrt(pdiff_sq * inv_strength_sq);  // patch distance normalised by the strength, fed to RK
+#if defined(LUMA_raw)
+	 return RK(dist); 
+#elif defined(CHROMA_raw)
+	 return vec2(RK(dist.x), RK(dist.y)); 
+#else
+	 return vec3(RK(dist.x), RK(dist.y), RK(dist.z)); 
+#endif
+}
 
-vec4 hook()
+val patch_comparison(vec3 r, vec3 r2)
 {
-	 return HOOKED_texOff(0); 
+	 vec3 p; 
+	 val min_rot = val(p_area); 
+
+	 FOR_ROTATION FOR_REFLECTION {
+	 	 val pdiff_sq = val(0); 
+	 	 FOR_PATCH(p) {
+	 	 	 vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z); 
+	 	 	 val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF); 
+	 	 	 diff_sq *= diff_sq; 
+	 	 	 diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy); 
+	 	 	 pdiff_sq += diff_sq; 
+	 	 }
+	 	 min_rot = min(min_rot, pdiff_sq); 
+	 }
+
+	 return min_rot * p_scale; 
 }
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!BIND _INJ_PREI
-//!WIDTH HOOKED.w
-//!HEIGHT HOOKED.h
-//!DESC Guided filter (I)
-//!SAVE _INJ_I
+#define NO_GATHER (PD == 0 && NG == 0 && SAMPLE == 0) // never textureGather if any of these conditions are false
+#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3 || RI == 7)
 
-vec4 hook()
+#if (defined(LUMA_gather) || D1W) && ((PS == 0 || ((PS == 3 || PS == 7) && RI != 7) || PS == 8) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER
+// 3x3 diamond/plus patch_comparison_gather
+// XXX extend to support arbitrary sizes (probably requires code generation)
+// XXX support PSS
+const ivec2 offsets_adj[4] = { ivec2(0,-1), ivec2(1,0), ivec2(0,1), ivec2(-1,0) }; 
+const ivec2 offsets_adj_sf[4] = { ivec2(0,-1) * SF, ivec2(1,0) * SF, ivec2(0,1) * SF, ivec2(-1,0) * SF }; 
+vec4 poi_patch_adj = gather_offs(0, offsets_adj); 
+#if PS == 0 || PS == 8
+const ivec2 offsets_diag[4] = { ivec2(-1,-1), ivec2(1,-1), ivec2(1,1), ivec2(-1,1) }; 
+const ivec2 offsets_diag_sf[4] = { ivec2(-1,-1) * SF, ivec2(1,-1) * SF, ivec2(1,1) * SF, ivec2(-1,1) * SF }; 
+vec4 poi_patch_diag = gather_offs(0, offsets_diag); 
+#endif
+float patch_comparison_gather(vec3 r, vec3 r2)
 {
-return _INJ_PREI_texOff(0);
-}
-
+	 float min_rot = p_area - 1; 
+	 vec4 transformer_adj = gather_offs(r, offsets_adj_sf); 
+#if PS == 0 || PS == 8
+	 vec4 transformer_diag = gather_offs(r, offsets_diag_sf); 
+#endif
+	 FOR_ROTATION {
+	 	 FOR_REFLECTION {
+#if RFI
+	 	 	 /* xxy
+	 	 	  * w y
+	 	 	  * wzz
+	 	 	  */
+	 	 	 switch(rfi) {
+	 	 	 case 1:
+	 	 	 	 transformer_adj = transformer_adj.zyxw; 
+#if PS == 0 || PS == 8
+	 	 	 	 transformer_diag = transformer_diag.zyxw; 
+#endif
+	 	 	 	 break; 
+	 	 	 case 2:
+	 	 	 	 transformer_adj = transformer_adj.xwzy; 
+#if PS == 0 || PS == 8
+	 	 	 	 transformer_diag = transformer_diag.xwzy; 
+#endif
+	 	 	 	 break; 
+	 	 	 }
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (P)
-//!BIND HOOKED
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_P
+	 	 	 vec4 diff = poi_patch_adj - transformer_adj;  // per-tap error of the four edge-adjacent samples
+	 	 	 float diff_sq = dot(diff * diff, vec4(1)); 
+#if PS == 0 || PS == 8
+	 	 	 diff_sq += dot(POW2(poi_patch_diag - transformer_diag), vec4(1));  // diagonals squared separately: summing before squaring lets opposite-signed errors cancel
+#endif
+	 	 	 min_rot = min(diff_sq, min_rot);  // keep the best-matching rotation/reflection
 
-vec4 hook()
+// un-reflect
+#if RFI
+	 	 	 switch(rfi) {
+	 	 	 case 1:
+	 	 	 	 transformer_adj = transformer_adj.zyxw; 
+#if PS == 0 || PS == 8
+	 	 	 	 transformer_diag = transformer_diag.zyxw; 
+#endif
+	 	 	 	 break; 
+	 	 	 case 2:
+	 	 	 	 transformer_adj = transformer_adj.xwzy; 
+#if PS == 0 || PS == 8
+	 	 	 	 transformer_diag = transformer_diag.xwzy; 
+#endif
+	 	 	 	 break; 
+	 	 	 }
+#endif
+	 	 } // FOR_REFLECTION
+#if RI == 7
+	 	 transformer_adj = transformer_adj.wxyz; 
+	 	 // swap adjacents for diagonals
+	 	 transformer_adj += transformer_diag; 
+	 	 transformer_diag = transformer_adj - transformer_diag; 
+	 	 transformer_adj -= transformer_diag; 
+#elif RI == 3
+	 	 transformer_adj = transformer_adj.wxyz; 
+#elif RI == 1
+	 	 transformer_adj = transformer_adj.zwxy; 
+#endif
+#if RI == 3 && (PS == 0 || PS == 8)
+	 	 transformer_diag = transformer_diag.wxyz; 
+#elif RI == 1 && (PS == 0 || PS == 8)
+	 	 transformer_diag = transformer_diag.zwxy; 
+#endif
+	 } // FOR_ROTATION
+	 float center_diff = poi2.x - load2(r).x; 
+	 return (center_diff * center_diff + min_rot) * p_scale; 
+}
+#elif (defined(LUMA_gather) || D1W) && PS == 4 && P == 3 && RI == 0 && RFI == 0 && NO_GATHER
+const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,0), ivec2(1,0) }; 
+const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,0) * SF, ivec2(1,0) * SF }; 
+vec4 poi_patch = gather_offs(0, offsets); 
+float patch_comparison_gather(vec3 r, vec3 r2)
 {
-	 return HOOKED_texOff(0); 
+	 vec4 pdiff = poi_patch - gather_offs(r, offsets_sf); 
+	 return dot(pdiff * pdiff, vec4(1)) * p_scale; 
 }
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (MEANI)
-//!BIND _INJ_I
-//!WIDTH _INJ_I.w 1.5 /
-//!HEIGHT _INJ_I.h 1.5 /
-//!SAVE _INJ_MEANI
-
-vec4 hook()
+#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER
+// tiled even square patch_comparison_gather
+// XXX extend to support odd square?
+float patch_comparison_gather(vec3 r, vec3 r2)
 {
-return _INJ_I_texOff(0);
+	 vec2 tile; 
+	 float min_rot = p_area; 
+
+	 /* gather order:
+	  * w z
+	  * x y
+	  */
+	 float pdiff_sq = 0; 
+	 for (tile.x = -hp;  tile.x < hp;  tile.x+=2) for (tile.y = -hp;  tile.y < hp;  tile.y+=2) {
+	 	 vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy); 
+	 	 diff_sq *= diff_sq; 
+	 	 diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)),
+	 	 	                                  spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0))); 
+	 	 pdiff_sq += dot(diff_sq, vec4(1)); 
+	 }
+	 min_rot = min(min_rot, pdiff_sq); 
+
+	 return min_rot * p_scale; 
 }
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (MEANP)
-//!BIND _INJ_P
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_MEANP
+#else
+#define patch_comparison_gather patch_comparison
+#endif
 
 vec4 hook()
 {
-return _INJ_P_texOff(0);
-}
+	 val total_weight = val(0); 
+	 val sum = val(0); 
+	 val result = val(0); 
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (_INJ_I_SQ)
-//!BIND _INJ_I
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_I_SQ
+	 vec3 r = vec3(0); 
+	 vec3 p = vec3(0); 
+	 vec3 me = vec3(0); 
 
-vec4 hook()
-{
-return _INJ_I_texOff(0) * _INJ_I_texOff(0);
-}
+#if T && ME == 1 // temporal & motion estimation
+	 vec3 me_tmp = vec3(0); 
+	 float maxweight = 0; 
+#elif T && ME == 2 // temporal & motion estimation
+	 vec3 me_sum = vec3(0); 
+	 float me_weight = 0; 
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (_INJ_IXP)
-//!BIND _INJ_I
-//!BIND _INJ_P
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_IXP
+#if AS
+	 val total_weight_s = val(0); 
+	 val sum_s = val(0); 
+#endif
 
-vec4 hook()
-{
-return _INJ_I_texOff(0) * _INJ_P_texOff(0);
-}
+#if WD == 2 // weight discard (mean)
+	 int r_index = 0; 
+	 val_packed all_weights[r_area]; 
+	 val_packed all_pixels[r_area]; 
+#elif WD == 1 // weight discard (moving cumulative average)
+	 int r_iter = 1; 
+	 val wd_total_weight = val(0); 
+	 val wd_sum = val(0); 
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (CORRI)
-//!BIND _INJ_I_SQ
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_CORRI
+	 FOR_FRAME(r) {
+	 // XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
+#if T && ME == 1 // temporal & motion estimation max weight
+	 if (r.z > 0) {
+	 	 me += me_tmp * MEF; 
+	 	 me_tmp = vec3(0); 
+	 	 maxweight = 0; 
+	 }
+#elif T && ME == 2 // temporal & motion estimation weighted average
+	 if (r.z > 0) {
+	 	 me += round(me_sum / me_weight * MEF); 
+	 	 me_sum = vec3(0); 
+	 	 me_weight = 0; 
+	 }
+#endif
+	 FOR_RESEARCH(r) {
+	 	 // r coords with appropriate transformations applied
+	 	 vec3 tr = vec3(r.xy + floor(r.xy * RSF), r.z); 
+	 	 float spatial_weight = spatial_r(tr); 
+	 	 tr.xy += me.xy; 
 
-vec4 hook()
-{
-return _INJ_I_SQ_texOff(0);
-}
+	 	 val px = load(tr); 
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (CORRP)
-//!BIND _INJ_IXP
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_CORRP
+#if SKIP_PATCH
+	 	 val weight = val(1); 
+#else
+	 	 val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(tr, vec3(0))) : patch_comparison(tr, vec3(0)); 
+	 	 val weight = range(pdiff_sq); 
+#endif
 
-vec4 hook()
-{
-return _INJ_IXP_texOff(0);
-}
+#if T && ME == 1 // temporal & motion estimation max weight
+	 	 me_tmp = vec3(tr.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x)); 
+	 	 maxweight = max(maxweight, weight.x); 
+#elif T && ME == 2 // temporal & motion estimation weighted average
+	 	 me_sum += vec3(tr.xy,0) * weight.x; 
+	 	 me_weight += weight.x; 
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (A)
-//!BIND _INJ_MEANI
-//!BIND _INJ_MEANP
-//!BIND _INJ_CORRI
-//!BIND _INJ_CORRP
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_A
+#if D1W
+	 	 weight = val(weight.x); 
+#endif
 
-#define E 0.0013
+	 	 weight *= spatial_weight; 
 
-vec4 hook()
-{
-vec4 var = _INJ_CORRI_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANI_texOff(0);
-vec4 cov = _INJ_CORRP_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANP_texOff(0);
-	 return cov / (var + E); 
-}
+#if AS
+	 	 spatial_weight *= int(r.z == 0);  // ignore temporal
+	 	 sum_s += px * spatial_weight; 
+	 	 total_weight_s += spatial_weight; 
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (B)
-//!BIND _INJ_A
-//!BIND _INJ_MEANI
-//!BIND _INJ_MEANP
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_B
+#if WD == 2 // weight discard (mean)
+	 	 all_weights[r_index] = val_pack(weight); 
+	 	 all_pixels[r_index] = val_pack(px); 
+	 	 r_index++; 
+#elif WD == 1 // weight discard (moving cumulative average)
+	 	 val wd_scale = val(1.0/r_iter); 
+	 	 val below_threshold = WDS * abs(min(val(0.0), weight - (total_weight * wd_scale * WDT * WD1TK(sqrt(wd_scale*WDP))))); 
+#if defined(LUMA_raw)
+	 	 val wdkf = WDK(below_threshold);  // weight-discard kernel factor, applied per channel
+#elif defined(CHROMA_raw)
+	 	 val wdkf = vec2(WDK(below_threshold.x), WDK(below_threshold.y)); 
+#else
+	 	 val wdkf = vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z));  // third channel reads .z (was .y), matching the WD == 2 path
+#endif
+	 	 wd_sum += px * weight * wdkf; 
+	 	 wd_total_weight += weight * wdkf; 
+	 	 r_iter++; 
+#endif
 
-vec4 hook()
-{
-return _INJ_MEANP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANI_texOff(0);
-}
+	 	 sum += px * weight; 
+	 	 total_weight += weight; 
+	 } // FOR_RESEARCH
+	 } // FOR_FRAME
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (MEANA)
-//!BIND _INJ_A
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_MEANA
+	 val avg_weight = total_weight * r_scale; 
+	 val old_avg_weight = avg_weight; 
 
-vec4 hook()
-{
-return _INJ_A_texOff(0);
-}
+#if WD == 2 // weight discard (mean)
+	 total_weight = val(0); 
+	 sum = val(0); 
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (MEANB)
-//!BIND _INJ_B
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_MEANB
+	 for (int i = 0;  i < r_area;  i++) {
+	 	 val weight = val_unpack(all_weights[i]); 
+	 	 val px = val_unpack(all_pixels[i]); 
 
-vec4 hook()
-{
-return _INJ_B_texOff(0);
-}
+	 	 val below_threshold = WDS * abs(min(val(0.0), weight - (avg_weight * WDT))); 
+#if defined(LUMA_raw)
+	 	 weight *= WDK(below_threshold); 
+#elif defined(CHROMA_raw)
+	 	 weight *= vec2(WDK(below_threshold.x), WDK(below_threshold.y)); 
+#else
+	 	 weight *= vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); 
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter
-//!BIND HOOKED
-//!BIND _INJ_MEANA
-//!BIND _INJ_MEANB
-//!SAVE RF_LUMA
+	 	 sum += px * weight; 
+	 	 total_weight += weight; 
+	 }
+#elif WD == 1 // weight discard (moving cumulative average)
+	 total_weight = wd_total_weight; 
+	 sum = wd_sum; 
+#endif
+#if WD // weight discard
+	 avg_weight = total_weight * r_scale; 
+#endif
 
-vec4 hook()
-{
-return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0);
+	 total_weight += SW * spatial_r(vec3(0)); 
+	 sum += poi * SW * spatial_r(vec3(0)); 
+	 result = val(sum / total_weight); 
+
+	 // store frames for temporal
+#if T > 1
+
+#endif
+#if T && TRF
+	 imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result)); 
+#elif T
+	 imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2)); 
+#endif
+
+#if AS == 1 // sharpen+denoise
+#define AS_base result
+#elif AS == 2 // sharpen only
+#define AS_base poi
+#endif
+#if AS
+	 val usm = result - sum_s/total_weight_s; 
+	 usm = exp(log(abs(usm))*ASP) * sign(usm);  // avoiding pow() since it's buggy on nvidia
+	 usm *= gaussian(abs((AS_base + usm - 0.5) / 1.5) * ASA); 
+	 usm *= ASF; 
+	 result = AS_base + usm; 
+#endif
+
+#if EP // extremes preserve
+	 float luminance = EP_texOff(0).x; 
+	 // EPSILON is needed since pow(0,0) is undefined
+	 float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP)); 
+	 result = mix(poi, result, ep_weight); 
+#else
+	 float ep_weight = 0; 
+#endif
+
+#if V == 1
+	 result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0); 
+#elif V == 2
+	 result = (poi - result) * 0.5 + 0.5; 
+#elif V == 3 // post-WD weight map
+	 result = avg_weight; 
+#elif V == 4 // pre-WD edge map
+	 result = old_avg_weight; 
+#elif V == 5
+	 result = 0.5 + usm; 
+#elif V == 6
+	 result = val(1 - ep_weight); 
+#endif
+
+// XXX visualize chroma for these
+#if defined(CHROMA_raw) && (V == 3 || V == 4 || V == 6)
+	 return vec4(0.5); 
+#endif
+
+	 return unval(mix(poi, result, BF)); 
 }
 
-// End of source code injected from guided.glsl 
+// End of source code injected from ../nlmeans.glsl 
 
 //!HOOK LUMA
 //!HOOK CHROMA
@@ -328,26 +2276,12 @@ vec4 hook()
 	return RF_LUMA_texOff(0);
 }
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!BIND LUMA
-//!WIDTH LUMA.w 3 /
-//!HEIGHT LUMA.h 3 /
-//!DESC Non-local means (EP)
-//!SAVE EP
-
-vec4 hook()
-{
-	return LUMA_texOff(0);
-}
-
 //!HOOK LUMA
 //!HOOK CHROMA
 //!BIND HOOKED
 //!BIND RF_LUMA
 //!BIND RF
-//!BIND EP
-//!DESC Non-local means (nlmeans_hqx.glsl)
+//!DESC Non-local means (nlmeans.glsl)
 
 // User variables
 
@@ -358,47 +2292,35 @@ vec4 hook()
 #ifdef LUMA_raw
 #define S 2.25
 #else
-#define S 5.0
+#define S 2.5168955531436197
 #endif
 
 /* Adaptive sharpening
  *
- * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
- * weight map to restrict the sharpening to edges.
- *
- * If you just want to increase/decrease sharpness then you want to change ASF.
+ * Performs an unsharp mask by subtracting the spatial kernel's blur from the 
+ * NLM blur. For sharpen+denoise the sharpening is limited to edge areas and 
+ * denoising is done everywhere else.
  *
  * Use V=4 to visualize which areas are sharpened (black means sharpen).
  *
  * AS:
- * 	- 0 to disable
- * 	- 1 to sharpen+denoise
- * 	- 2 to sharpen only
+ * 	- 0: disable
+ * 	- 1: sharpen+denoise
+ * 	- 2: sharpen only
  * ASF: Higher numbers make a sharper image
- * ASP: Higher numbers use more of the sharp image
- * ASW:
- * 	- 0 to use pre-WD weights
- * 	- 1 to use post-WD weights (ASP should be ~2x to compensate)
- * ASK: Weight kernel:
- * 	- 0 for power. This is the old method.
- * 	- 1 for sigmoid. This is generally recommended.
- * 	- 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image)
- * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map
+ * ASA: Anti-ringing, higher numbers increase strength
+ * ASP: Power, lower numbers increase sharpening on lower frequency detail
  */
 #ifdef LUMA_raw
 #define AS 0
-#define ASF 3.0
-#define ASP 1
-#define ASW 0
-#define ASK 1
-#define ASC 0.0
+#define ASF 0.1625
+#define ASA 5.0
+#define ASP 0.5
 #else
 #define AS 0
-#define ASF 3.0
-#define ASP 1.0
-#define ASW 0
-#define ASK 1
-#define ASC 0.0
+#define ASF 0.1625
+#define ASA 5.0
+#define ASP 0.5
 #endif
 
 /* Starting weight
@@ -409,52 +2331,57 @@ vec4 hook()
  * EPSILON should be used instead of zero to avoid divide-by-zero errors.
  */
 #ifdef LUMA_raw
-#define SW 1.0
+#define SW 1.3011446081346498
 #else
-#define SW 0.5
+#define SW 1.2219854377433914
 #endif
 
 /* Weight discard
  *
- * Discard weights that fall below a fraction of the average weight. This culls 
- * the most dissimilar samples from the blur, yielding a much more pleasant 
- * result, especially around edges.
+ * Reduces weights that fall below a fraction of the average weight. This culls 
+ * the most dissimilar samples from the blur, which can yield a better result, 
+ * especially around edges.
  * 
  * WD:
- * 	- 2: True average. Better quality, but slower and requires GLSL 4.0 or later
- * 	- 1: Moving cumulative average. Inaccurate, tends to blur directionally.
+ * 	- 2: Mean. Better quality, but slower and requires GLSL 4.0 or later
+ * 	- 1: Moving cumulative average. Fast but inaccurate, blurs directionally.
  * 	- 0: Disable
  *
  * WDT: Threshold coefficient, higher numbers discard more
  * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes
+ * WDS (not for WDK=is_zero): Higher numbers are more eager to reduce weights
  */
 #ifdef LUMA_raw
 #define WD 2
-#define WDT 0.5
-#define WDP 6.0
+#define WDT 0.11671341022864548
+#define WDP 5.381278367349288
+#define WDS 1.0
 #else
-#define WD 2
-#define WDT 0.75
-#define WDP 6.0
+#define WD 0
+#define WDT 0.002713346103131793
+#define WDP 5.832936323930807
+#define WDS 1.0
 #endif
 
 /* Extremes preserve
  *
- * Reduces denoising around very bright/dark areas.
+ * Reduce denoising in very bright/dark areas.
+ *
+ * Disabled by default now. If you want to reenable this, set EP=3/ in 
+ * Makefile.nlm and rebuild.
  *
  * The downscaling factor of the EP shader stage affects what is considered a 
- * bright/dark area. The default of 3 should be fine, it's not recommended to 
- * change this.
+ * bright/dark area.
  *
  * This is incompatible with RGB. If you have RGB hooks enabled then you will 
  * have to delete the EP shader stage or specify EP=0 through shader_cfg.
  *
  * EP: 1 to enable, 0 to disable
- * DP: EP strength on dark patches, 0 to fully denoise
- * BP: EP strength on bright patches, 0 to fully denoise
+ * DP: EP strength on dark areas, 0 to fully denoise
+ * BP: EP strength on bright areas, 0 to fully denoise
  */
 #ifdef LUMA_raw
-#define EP 1
+#define EP 0
 #define BP 0.75
 #define DP 0.25
 #else
@@ -469,12 +2396,26 @@ vec4 hook()
 /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
 /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
 
+/* textureGather applicable configurations:
+ *
+ * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}
+ * - PS={0,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2}
+ * - PS=6:RI=0:RFI=0
+ *   - Currently the only scalable variant
+ *
+ * Options which always disable textureGather:
+ * 	- NG
+ * 	- SAMPLE
+ * 	- PD
+ *
+ * Running without textureGather may be much slower.
+ */
+
 /* Patch & research sizes
  *
- * Patch size should be an odd number greater than or equal to 3. Higher values 
- * are slower and not always better.
+ * P should be an odd number. Higher values are slower and not always better.
  *
- * Research size be an odd number greater than or equal to 3. Higher values are 
+ * R should be an odd number greater than or equal to 3. Higher values are 
  * generally better, but slower, blurrier, and gives diminishing returns.
  */
 #ifdef LUMA_raw
@@ -492,8 +2433,6 @@ vec4 hook()
  *
  * PS applies applies to patches, RS applies to research zones.
  *
- * Be wary of gather optimizations (see the Regarding Speed comment at the top)
- *
  * 0: square (symmetrical)
  * 1: horizontal line (asymmetric)
  * 2: vertical line (asymmetric)
@@ -502,6 +2441,7 @@ vec4 hook()
  * 5: truncated triangle (asymmetric on two axis, last row halved)
  * 6: even sized square (asymmetric on two axis)
  * 7: plus (symmetrical)
+ * 8: plus X (symmetrical)
  */
 #ifdef LUMA_raw
 #define RS 3
@@ -516,8 +2456,8 @@ vec4 hook()
  * This setting is dependent on code generation from shader_cfg, so this 
  * setting can only be enabled via shader_cfg.
  *
- * Compares the pixel-of-interest against a guide, which could be a downscaled 
- * image or the output of another shader
+ * Computes weights on a guide, which could be a downscaled image or the output 
+ * of another shader, and applies the weights to the original image
  */
 #define RF_LUMA 1
 #define RF 1
@@ -531,6 +2471,9 @@ vec4 hook()
  * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
  * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
  *
+ * Consider setting SAMPLE=1 when RI is set to a value that would require 
+ * sampling between pixels (e.g., RI=2).
+ *
  * RI: Rotational invariance
  * RFI (0 to 2): Reflectional invariance
  */
@@ -598,14 +2541,14 @@ vec4 hook()
  */
 #ifdef LUMA_raw
 #define SST 1
-#define SS 0.25
+#define SS 0.5296176863733414
 #define SD vec3(1,1,1)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #else
 #define SST 1
-#define SS 0.25
+#define SS 0.26295970436981203
 #define SD vec3(1,1,1)
 #define PST 0
 #define PSS 0.0
@@ -617,6 +2560,8 @@ vec4 hook()
  * SK: spatial kernel
  * RK: range kernel (takes patch differences)
  * PSK: intra-patch spatial kernel
+ * WDK: weight discard kernel
+ * WD1TK (WD=1 only): weight discard tolerance kernel
  *
  * List of available kernels:
  *
@@ -624,18 +2569,51 @@ vec4 hook()
  * cos
  * gaussian
  * lanczos
- * quadratic
+ * quadratic_ (unclamped)
  * sinc
+ * sinc_ (unclamped)
+ * sinc3
  * sphinx
+ * sphinx_ (unclamped)
+ * triangle_ (unclamped)
+ * triangle
  */
 #ifdef LUMA_raw
 #define SK gaussian
 #define RK gaussian
 #define PSK gaussian
+#define WDK is_zero
+#define WD1TK gaussian
 #else
 #define SK gaussian
 #define RK gaussian
 #define PSK gaussian
+#define WDK is_zero
+#define WD1TK gaussian
+#endif
+
+/* Sampling method
+ *
+ * In most cases this shouldn't make any difference, only set to bilinear if 
+ * it's necessary to sample between pixels (e.g., RI=2).
+ *
+ * 0: nearest neighbor
+ * 1: bilinear
+ */
+#ifdef LUMA_raw
+#define SAMPLE 0
+#else
+#define SAMPLE 0
+#endif
+
+/* Research scaling factor
+ *
+ * Higher numbers sample more sparsely as the distance from the POI grows.
+ */
+#ifdef LUMA_raw
+#define RSF 0.0
+#else
+#define RSF 0.0
 #endif
 
 // Scaling factor (should match WIDTH/HEIGHT)
@@ -645,13 +2623,22 @@ vec4 hook()
 #define SF 1
 #endif
 
+// Use the guide image as the input image
+#ifdef LUMA_raw
+#define GUIDE_INPUT 0
+#else
+#define GUIDE_INPUT 0
+#endif
+
 /* Visualization
  *
  * 0: off
  * 1: absolute difference between input/output to the power of 0.25
  * 2: difference between input/output centered on 0.5
- * 3: avg_weight
- * 4: edge map (based on the relevant AS settings)
+ * 3: post-WD weight map
+ * 4: pre-WD weight map
+ * 5: unsharp mask
+ * 6: EP
  */
 #ifdef LUMA_raw
 #define V 0
@@ -696,37 +2683,44 @@ vec4 hook()
 
 // Shader code
 
-#define EPSILON 0.00000000001
+#define EPSILON 1.2e-38
 #define M_PI 3.14159265358979323846
 #define POW2(x) ((x)*(x))
 #define POW3(x) ((x)*(x)*(x))
-#define bicubic(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0))))
+#define bicubic_(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0))))
+#define bicubic(x) bicubic_(clamp((x), 0.0, 2.0))
 #define gaussian(x) exp(-1 * POW2(x))
-#define lanczos(x) POW2(sinc(x))
-#define quadratic(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5))
-#define sinc(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI))
-#define sphinx(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI))
+#define quadratic_(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5))
+#define quadratic(x) quadratic_(clamp((x), 0.0, 1.5))
+#define sinc_(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI))
+#define sinc(x) sinc_(clamp((x), 0.0, 1.0))
+#define sinc3(x) sinc_(clamp((x), 0.0, 3.0))
+#define lanczos(x) (sinc3(x) * sinc(x))
+#define sphinx_(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI))
+#define sphinx(x) sphinx_(clamp((x), 0.0, 1.4302966531242027))
+#define triangle_(x) (1 - (x))
+#define triangle(x) triangle_(clamp((x), 0.0, 1.0))
+#define is_zero(x) int((x) == 0) // 1 when x equals zero, otherwise 0; argument parenthesized like the other kernels
 
 // XXX could maybe be better optimized on LGC
-// XXX return original alpha component instead of 1.0
 #if defined(LUMA_raw)
 #define val float
 #define val_swizz(v) (v.x)
-#define unval(v) vec4(v.x, 0, 0, 1.0)
+#define unval(v) vec4(v.x, 0, 0, poi_.a)
 #define val_packed val
 #define val_pack(v) (v)
 #define val_unpack(v) (v)
 #elif defined(CHROMA_raw)
 #define val vec2
 #define val_swizz(v) (v.xy)
-#define unval(v) vec4(v.x, v.y, 0, 1.0)
+#define unval(v) vec4(v.x, v.y, 0, poi_.a)
 #define val_packed uint
 #define val_pack(v) packUnorm2x16(v)
 #define val_unpack(v) unpackUnorm2x16(v)
 #else
 #define val vec3
 #define val_swizz(v) (v.xyz)
-#define unval(v) vec4(v.x, v.y, v.z, 1.0)
+#define unval(v) vec4(v.x, v.y, v.z, poi_.a)
 #define val_packed val
 #define val_pack(v) (v)
 #define val_unpack(v) (v)
@@ -744,10 +2738,6 @@ const int hr = R/2;
 const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes
 #endif
 
-// donut increment, increments without landing on (0,0,0)
-// much faster than a continue statement
-#define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0))))
-
 // patch/research shapes
 // each shape is depicted in a comment, where Z=5 (Z corresponds to P or R)
 // dots (.) represent samples (pixels) and X represents the pixel-of-interest
@@ -788,7 +2778,7 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res
 //
 // Z    ..X..
 //
-#define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++)
+#define S_HORIZONTAL(z,hz,incr) for (z.y = 0; z.y <= 0; z.y++) for (z.x = -hz; z.x <= hz; incr)
 
 // 90 degree rotation of S_HORIZONTAL
 #define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
@@ -801,19 +2791,13 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res
 #define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr)
 #define S_PLUS_A(hz,Z) (Z*2 - 1)
 
-// XXX implement S_PLUS w/ an X overlayed:
 // 3    . . .
 // 3     ...
 // Z    ..X..
 // 3     ...
 // 3    . . .
-
-// XXX implement an X shape:
-// 2    .   .
-// 2     . .
-// 1      X  
-// 2     . .
-// 2    .   .
+#define S_PLUS_X(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(z.x) + -hz * int(z.x == 0); z.y <= abs(z.x) + hz * int(z.x == 0); incr)
+#define S_PLUS_X_A(hz,Z) (Z*4 - 3)
 
 // 1x1 square
 #define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
@@ -827,43 +2811,43 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res
 #define RF_ RF
 #endif
 
-// Skip comparing the pixel-of-interest against itself, unless RF is enabled
-#if RF_
-#define RINCR(z,c) (z.c++)
-#else
-#define RINCR DINCR
-#endif
+// donut increment, increments without landing on (0,0,0)
+// much faster than a continue statement
+#define DINCR(z,c,a) ((z.c += a),(z.c += int(z == vec3(0))))
 
-#define R_AREA(a) (a * T1 + RF_-1)
+#define R_AREA(a) (a * T1 - 1)
 
 // research shapes
 // XXX would be nice to have the option of temporally-varying research sizes
 #if R == 0 || R == 1
 #define FOR_RESEARCH(r) S_1X1(r)
 const int r_area = R_AREA(1);
+#elif RS == 8
+#define FOR_RESEARCH(r) S_PLUS_X(r,hr,DINCR(r,y,max(1,abs(r.x))))
+const int r_area = R_AREA(S_PLUS_X_A(hr,R));
 #elif RS == 7
-#define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y))
+#define FOR_RESEARCH(r) S_PLUS(r,hr,DINCR(r,y,1))
 const int r_area = R_AREA(S_PLUS_A(hr,R));
 #elif RS == 6
-#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y))
+#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,DINCR(r,y,1))
 const int r_area = R_AREA(R*R);
 #elif RS == 5
-#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x))
+#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,DINCR(r,x,1))
 const int r_area = R_AREA(S_TRIANGLE_A(hr,hr));
 #elif RS == 4
-#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x))
+#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,DINCR(r,x,1))
 const int r_area = R_AREA(S_TRIANGLE_A(hr,R));
 #elif RS == 3
-#define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y))
+#define FOR_RESEARCH(r) S_DIAMOND(r,hr,DINCR(r,y,1))
 const int r_area = R_AREA(S_DIAMOND_A(hr,R));
 #elif RS == 2
-#define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y))
+#define FOR_RESEARCH(r) S_VERTICAL(r,hr,DINCR(r,y,1))
 const int r_area = R_AREA(R);
 #elif RS == 1
-#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x))
+#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,DINCR(r,x,1))
 const int r_area = R_AREA(R);
 #elif RS == 0
-#define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y))
+#define FOR_RESEARCH(r) S_SQUARE(r,hr,DINCR(r,y,1))
 const int r_area = R_AREA(R*R);
 #endif
 
@@ -885,7 +2869,7 @@ const int r_area = R_AREA(R*R);
 #if PD
 #define PINCR DINCR
 #else
-#define PINCR(z,c) (z.c++)
+#define PINCR(z,c,a) (z.c += a)
 #endif
 
 #define P_AREA(a) (a - PD)
@@ -894,36 +2878,44 @@ const int r_area = R_AREA(R*R);
 #if P == 0 || P == 1
 #define FOR_PATCH(p) S_1X1(p)
 const int p_area = P_AREA(1);
+#elif PS == 8
+#define FOR_PATCH(p) S_PLUS_X(p,hp,PINCR(p,y,max(1,abs(p.x))))
+const int p_area = P_AREA(S_PLUS_X_A(hp,P));
 #elif PS == 7
-#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y))
+#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y,1))
 const int p_area = P_AREA(S_PLUS_A(hp,P));
 #elif PS == 6
-#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y))
+#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y,1))
 const int p_area = P_AREA(P*P);
 #elif PS == 5
-#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x))
+#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x,1))
 const int p_area = P_AREA(S_TRIANGLE_A(hp,hp));
 #elif PS == 4
-#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x))
+#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x,1))
 const int p_area = P_AREA(S_TRIANGLE_A(hp,P));
 #elif PS == 3
-#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y))
+#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y,1))
 const int p_area = P_AREA(S_DIAMOND_A(hp,P));
 #elif PS == 2
-#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y))
+#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y,1))
 const int p_area = P_AREA(P);
 #elif PS == 1
-#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x))
+#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x,1))
 const int p_area = P_AREA(P);
 #elif PS == 0
-#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y))
+#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y,1))
 const int p_area = P_AREA(P*P);
 #endif
 
 const float r_scale = 1.0/r_area;
 const float p_scale = 1.0/p_area;
 
-#define sample(tex, pos, size, pt, off) tex(pos + pt * (vec2(off) + 0.5 - fract(pos*size)))
+#if SAMPLE == 0
+#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * (vec2(off) + 0.5 - fract((pos) * (size))))
+#else
+#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * vec2(off))
+#endif
+
 #define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off)
 
 #if RF_ && defined(LUMA_raw)
@@ -959,8 +2951,13 @@ val load2(vec3 off)
 #define load2(off) val_swizz(load2_(off))
 #endif
 
-val poi = load(vec3(0)); // pixel-of-interest
 val poi2 = load2(vec3(0)); // guide pixel-of-interest
+#if GUIDE_INPUT
+#define poi poi2
+#else
+vec4 poi_ = load_(vec3(0));
+val poi = val_swizz(poi_); // pixel-of-interest
+#endif
 
 #if RI // rotation
 vec2 rot(vec2 p, float d)
@@ -1005,7 +3002,7 @@ float spatial_r(vec3 v)
 
 val range(val pdiff_sq)
 {
-	const float h = S*0.013;
+	const float h = max(S, 0.0) * 0.013;
 	const float pdiff_scale = 1.0/(h*h);
 	pdiff_sq = sqrt(pdiff_sq * pdiff_scale);
 #if defined(LUMA_raw)
@@ -1015,10 +3012,6 @@ val range(val pdiff_sq)
 #else
 	return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z));
 #endif
-	//return exp(-pdiff_sq * pdiff_scale);
-
-	// weight function from the NLM paper, it's not very good
-	//return exp(-max(pdiff_sq - 2*S*S, 0.0) * pdiff_scale);
 }
 
 val patch_comparison(vec3 r, vec3 r2)
@@ -1041,42 +3034,104 @@ val patch_comparison(vec3 r, vec3 r2)
 	return min_rot * p_scale;
 }
 
-#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
-#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
+#define NO_GATHER (PD == 0 && NG == 0 && SAMPLE == 0) // never textureGather if any of these conditions are false
+#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3 || RI == 7)
 
-#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER
+#if (defined(LUMA_gather) || D1W) && ((PS == 0 || ((PS == 3 || PS == 7) && RI != 7) || PS == 8) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER
 // 3x3 diamond/plus patch_comparison_gather
 // XXX extend to support arbitrary sizes (probably requires code generation)
-// XXX extend to support 3x3 square
 // XXX support PSS
-const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
-const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
-vec4 poi_patch = gather_offs(0, offsets);
+const ivec2 offsets_adj[4] = { ivec2(0,-1), ivec2(1,0), ivec2(0,1), ivec2(-1,0) };
+const ivec2 offsets_adj_sf[4] = { ivec2(0,-1) * SF, ivec2(1,0) * SF, ivec2(0,1) * SF, ivec2(-1,0) * SF };
+vec4 poi_patch_adj = gather_offs(0, offsets_adj);
+#if PS == 0 || PS == 8
+const ivec2 offsets_diag[4] = { ivec2(-1,-1), ivec2(1,-1), ivec2(1,1), ivec2(-1,1) };
+const ivec2 offsets_diag_sf[4] = { ivec2(-1,-1) * SF, ivec2(1,-1) * SF, ivec2(1,1) * SF, ivec2(-1,1) * SF };
+vec4 poi_patch_diag = gather_offs(0, offsets_diag);
+#endif
 float patch_comparison_gather(vec3 r, vec3 r2)
 {
 	float min_rot = p_area - 1;
-	vec4 transformer = gather_offs(r, offsets_sf);
+	vec4 transformer_adj = gather_offs(r, offsets_adj_sf);
+#if PS == 0 || PS == 8
+	vec4 transformer_diag = gather_offs(r, offsets_diag_sf);
+#endif
 	FOR_ROTATION {
 		FOR_REFLECTION {
-			float diff_sq = dot((poi_patch - transformer) * (poi_patch - transformer), vec4(1));
+#if RFI
+			/* component layout, assuming gather_offs() returns samples in offsets[] order:
+			 *   diag.x  adj.x  diag.y
+			 *   adj.w    POI   adj.y
+			 *   diag.w  adj.z  diag.z */
+			switch(rfi) {
+			case 1: // mirror top<->bottom (y -> -y)
+				transformer_adj = transformer_adj.zyxw;
+#if PS == 0 || PS == 8
+				transformer_diag = transformer_diag.wzyx;
+#endif
+				break;
+			case 2: // mirror left<->right (x -> -x)
+				transformer_adj = transformer_adj.xwzy;
+#if PS == 0 || PS == 8
+				transformer_diag = transformer_diag.yxwz;
+#endif
+				break;
+			}
+#endif
+
+			// sum of squared differences: square per sample so opposite-signed adjacent/diagonal errors cannot cancel
+			float diff_sq = dot(POW2(poi_patch_adj - transformer_adj), vec4(1));
+#if PS == 0 || PS == 8
+			diff_sq += dot(POW2(poi_patch_diag - transformer_diag), vec4(1));
+#endif
+			min_rot = min(diff_sq, min_rot);
+
+// un-reflect (each mirror swizzle is its own inverse)
 #if RFI
 			switch(rfi) {
-			case 0: transformer = transformer.zyxw; break;
-			case 1: transformer = transformer.zwxy; break; // undoes last mirror, performs another mirror
-			case 2: transformer = transformer.zyxw; break; // undoes last mirror
+			case 1:
+				transformer_adj = transformer_adj.zyxw;
+#if PS == 0 || PS == 8
+				transformer_diag = transformer_diag.wzyx;
+#endif
+				break;
+			case 2:
+				transformer_adj = transformer_adj.xwzy;
+#if PS == 0 || PS == 8
+				transformer_diag = transformer_diag.yxwz;
+#endif
+				break;
 			}
 #endif
-		}
-#if RI == 3
-		transformer = transformer.wxyz;
+		} // FOR_REFLECTION
+#if RI == 7
+		transformer_adj = transformer_adj.wxyz;
+		// swap adjacents for diagonals through a temporary (an add/subtract swap rounds in floating point)
+		vec4 transformer_swap = transformer_adj;
+		transformer_adj = transformer_diag;
+		transformer_diag = transformer_swap;
+#elif RI == 3
+		transformer_adj = transformer_adj.wxyz;
 #elif RI == 1
-		transformer = transformer.zwxy;
+		transformer_adj = transformer_adj.zwxy;
 #endif
-	}
-	float center_diff_sq = poi2.x - load2(r).x;
-	center_diff_sq *= center_diff_sq;
-	return (min_rot + center_diff_sq) * p_scale;
+#if RI == 3 && (PS == 0 || PS == 8)
+		transformer_diag = transformer_diag.wxyz;
+#elif RI == 1 && (PS == 0 || PS == 8)
+		transformer_diag = transformer_diag.zwxy;
+#endif
+	} // FOR_ROTATION
+	float center_diff = poi2.x - load2(r).x;
+	return (center_diff * center_diff + min_rot) * p_scale;
+}
+#elif (defined(LUMA_gather) || D1W) && PS == 4 && P == 3 && RI == 0 && RFI == 0 && NO_GATHER
+const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,0), ivec2(1,0) };
+const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,0) * SF, ivec2(1,0) * SF };
+vec4 poi_patch = gather_offs(0, offsets);
+float patch_comparison_gather(vec3 r, vec3 r2)
+{
+	vec4 pdiff = poi_patch - gather_offs(r, offsets_sf);
+	return dot(pdiff * pdiff, vec4(1)) * p_scale;
 }
 #elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER
 // tiled even square patch_comparison_gather
@@ -1124,18 +3179,23 @@ vec4 hook()
 	float me_weight = 0;
 #endif
 
-#if WD == 2 // weight discard
+#if AS
+	val total_weight_s = val(0);
+	val sum_s = val(0);
+#endif
+
+#if WD == 2 // weight discard (mean)
 	int r_index = 0;
 	val_packed all_weights[r_area];
 	val_packed all_pixels[r_area];
-#elif WD == 1 // weight discard
-	val no_weights = val(0);
-	val discard_total_weight = val(0);
-	val discard_sum = val(0);
+#elif WD == 1 // weight discard (moving cumulative average)
+	int r_iter = 1;
+	val wd_total_weight = val(0);
+	val wd_sum = val(0);
 #endif
 
 	FOR_FRAME(r) {
-	// XXX ME is always a frame behind, should have to option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
+	// XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
 #if T && ME == 1 // temporal & motion estimation max weight
 	if (r.z > 0) {
 		me += me_tmp * MEF;
@@ -1149,19 +3209,26 @@ vec4 hook()
 		me_weight = 0;
 	}
 #endif
-	FOR_RESEARCH(r) { // main NLM logic
+	FOR_RESEARCH(r) {
+		// r coords with appropriate transformations applied
+		vec3 tr = vec3(r.xy + floor(r.xy * RSF), r.z);
+		float spatial_weight = spatial_r(tr);
+		tr.xy += me.xy;
+
+		val px = load(tr);
+
 #if SKIP_PATCH
 		val weight = val(1);
 #else
-		val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(r+me, vec3(0))) : patch_comparison(r+me, vec3(0));
+		val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(tr, vec3(0))) : patch_comparison(tr, vec3(0));
 		val weight = range(pdiff_sq);
 #endif
 
 #if T && ME == 1 // temporal & motion estimation max weight
-		me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x));
+		me_tmp = vec3(tr.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x));
 		maxweight = max(maxweight, weight.x);
 #elif T && ME == 2 // temporal & motion estimation weighted average
-		me_sum += vec3(r.xy,0) * weight.x;
+		me_sum += vec3(tr.xy,0) * weight.x;
 		me_weight += weight.x;
 #endif
 
@@ -1169,21 +3236,34 @@ vec4 hook()
 		weight = val(weight.x);
 #endif
 
-		weight *= spatial_r(r);
+		weight *= spatial_weight;
 
-#if WD == 2 // weight discard
+#if AS
+		spatial_weight *= int(r.z == 0); // ignore temporal
+		sum_s += px * spatial_weight;
+		total_weight_s += spatial_weight;
+#endif
+
+#if WD == 2 // weight discard (mean)
 		all_weights[r_index] = val_pack(weight);
-		all_pixels[r_index] = val_pack(load(r+me));
+		all_pixels[r_index] = val_pack(px);
 		r_index++;
-#elif WD == 1 // weight discard
-		val wd_scale = 1.0/max(no_weights, 1);
-		val keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
-		discard_sum += load(r+me) * weight * (1 - keeps);
-		discard_total_weight += weight * (1 - keeps);
-		no_weights += keeps;
+#elif WD == 1 // weight discard (moving cumulative average)
+		val wd_scale = val(1.0/r_iter);
+		val below_threshold = WDS * abs(min(val(0.0), weight - (total_weight * wd_scale * WDT * WD1TK(sqrt(wd_scale*WDP)))));
+#if defined(LUMA_raw)
+		val wdkf = WDK(below_threshold); // val is float here, a single kernel factor
+#elif defined(CHROMA_raw)
+		val wdkf = vec2(WDK(below_threshold.x), WDK(below_threshold.y));
+#else
+		val wdkf = vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); // one factor per channel
+#endif
+		wd_sum += px * weight * wdkf;
+		wd_total_weight += weight * wdkf;
+		r_iter++;
 #endif
 
-		sum += load(r+me) * weight;
+		sum += px * weight;
 		total_weight += weight;
 	} // FOR_RESEARCH
 	} // FOR_FRAME
@@ -1191,37 +3271,37 @@ vec4 hook()
 	val avg_weight = total_weight * r_scale;
 	val old_avg_weight = avg_weight;
 
-#if WD == 2 // true average
+#if WD == 2 // weight discard (mean)
 	total_weight = val(0);
 	sum = val(0);
-	val no_weights = val(0);
 
 	for (int i = 0; i < r_area; i++) {
-		val w = val_unpack(all_weights[i]);
+		val weight = val_unpack(all_weights[i]);
 		val px = val_unpack(all_pixels[i]);
-		val keeps = step(avg_weight*WDT, w);
 
-		w *= keeps;
-		sum += px * w;
-		total_weight += w;
-		no_weights += keeps;
+		val below_threshold = WDS * abs(min(val(0.0), weight - (avg_weight * WDT)));
+#if defined(LUMA_raw)
+		weight *= WDK(below_threshold);
+#elif defined(CHROMA_raw)
+		weight *= vec2(WDK(below_threshold.x), WDK(below_threshold.y));
+#else
+		weight *= vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z));
+#endif
+
+		sum += px * weight;
+		total_weight += weight;
 	}
-#elif WD == 1 // moving cumulative average
-	total_weight -= discard_total_weight;
-	sum -= discard_sum;
+#elif WD == 1 // weight discard (moving cumulative average)
+	total_weight = wd_total_weight;
+	sum = wd_sum;
 #endif
 #if WD // weight discard
-	avg_weight = total_weight / no_weights;
+	avg_weight = total_weight * r_scale;
 #endif
 
 	total_weight += SW * spatial_r(vec3(0));
 	sum += poi * SW * spatial_r(vec3(0));
-
-#if V == 3 // weight map
-	result = val(avg_weight);
-#else // mean
 	result = val(sum / total_weight);
-#endif
 
 	// store frames for temporal
 #if T > 1
@@ -1233,27 +3313,17 @@ vec4 hook()
 	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2));
 #endif
 
-#if ASW == 0 // pre-WD weights
-#define AS_weight old_avg_weight
-#elif ASW == 1 // post-WD weights
-#define AS_weight avg_weight
-#endif
-
-#if ASK == 0
-	val sharpening_strength = pow(AS_weight, val(ASP));
-#elif ASK == 1
-	val sharpening_strength = mix(
-			pow(smoothstep(0.0, 1.0, AS_weight), val(ASP)),
-			AS_weight, ASC);
-	// XXX normalize the result to account for a negative ASC?
-#elif ASK == 2
-	val sharpening_strength = val(ASP);
-#endif
-
 #if AS == 1 // sharpen+denoise
-	val sharpened = result + (poi - result) * ASF;
+#define AS_base result
 #elif AS == 2 // sharpen only
-	val sharpened = poi + (poi - result) * ASF;
+#define AS_base poi
+#endif
+#if AS
+	val usm = result - sum_s/total_weight_s;
+	usm = exp(log(abs(usm))*ASP) * sign(usm); // avoiding pow() since it's buggy on nvidia
+	usm *= gaussian(abs((AS_base + usm - 0.5) / 1.5) * ASA);
+	usm *= ASF;
+	result = AS_base + usm;
 #endif
 
 #if EP // extremes preserve
@@ -1261,26 +3331,27 @@ vec4 hook()
 	// EPSILON is needed since pow(0,0) is undefined
 	float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP));
 	result = mix(poi, result, ep_weight);
-#endif
-
-#if AS == 1 // sharpen+denoise
-	result = mix(sharpened, result, sharpening_strength);
-#elif AS == 2 // sharpen only
-	result = mix(sharpened, poi, sharpening_strength);
-#endif
-
-#if V == 4 // edge map
-	result = sharpening_strength;
-#endif
-
-#if (V == 3 || V == 4) && defined(CHROMA_raw) // drop chroma for these visualizations
-	return vec4(0.5);
+#else
+	float ep_weight = 0;
 #endif
 
 #if V == 1
 	result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0);
 #elif V == 2
 	result = (poi - result) * 0.5 + 0.5;
+#elif V == 3 // post-WD weight map
+	result = avg_weight;
+#elif V == 4 // pre-WD weight map
+	result = old_avg_weight;
+#elif V == 5 && AS // unsharp mask; usm is only declared when AS is enabled
+	result = 0.5 + usm;
+#elif V == 6
+	result = val(1 - ep_weight);
+#endif
+
+// XXX visualize chroma for these
+#if defined(CHROMA_raw) && (V == 3 || V == 4 || V == 6)
+	return vec4(0.5);
 #endif
 
 	return unval(mix(poi, result, BF));
diff --git a/portable_config/shaders/nlmeans_lgc.glsl b/portable_config/shaders/nlmeans_lgc.glsl
deleted file mode 100644
index 384d3a88..00000000
--- a/portable_config/shaders/nlmeans_lgc.glsl
+++ /dev/null
@@ -1,1043 +0,0 @@
-/* vi: ft=c
- *
- * Based on vf_nlmeans.c from FFmpeg.
- *
- * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
- * Copyright (c) 2016 Clément Bœsch <u pkh me>
- *
- * This program is free software: you can redistribute it and/or modify it 
- * under the terms of the GNU Lesser General Public License as published by 
- * the Free Software Foundation, either version 2.1 of the License, or (at 
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT 
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
- * for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License 
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
- */
-
-// Description: nlmeans_lgc.glsl: Experimental luma-guided chroma denoising, kinda similar to KrigBilateral
-
-/* The recommended usage of this shader and its variant profiles is to add them 
- * to input.conf and then dispatch the appropriate shader via a keybind during 
- * media playback. Here is an example input.conf entry:
- *
- * F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl"; show-text "Non-local means (LUMA only)"
- *
- * These shaders can also be enabled by default in mpv.conf, for example:
- *
- * glsl-shaders='~~/shaders/nlmeans.glsl'
- *
- * Both of the examples above assume the shaders are located in a subdirectory 
- * named "shaders" within mpv's config directory. Refer to the mpv 
- * documentation for more details.
- *
- * This shader is highly configurable via user variables below. Although the 
- * default settings should offer good quality at a reasonable speed, you are 
- * encouraged to tweak them to your preferences. Be mindful that certain 
- * settings may greatly affect speed.
- *
- * Denoising is most useful for noisy content. If there is no perceptible 
- * noise, you probably won't see a positive difference.
- *
- * The default settings are generally tuned for low noise and high detail 
- * preservation. The "medium" and "heavy" profiles are tuned for higher levels 
- * of noise.
- *
- * The denoiser will not work properly if the content has been upscaled 
- * beforehand (whether it was done by you or not). In such cases, consider 
- * issuing a command to downscale in the mpv console (backtick ` key):
- *
- * vf toggle scale=-2:720
- *
- * ...replacing 720 with whatever resolution seems appropriate. Rerun the 
- * command to undo the downscale. It may take some trial-and-error to find the 
- * proper resolution.
- */
-
-/* Regarding speed
- *
- * Speed may vary wildly for different vo and gpu-api settings. Generally 
- * vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this 
- * may be different for your system.
- *
- * If your GPU doesn't support textureGather, or if you are on a version of mpv 
- * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ profile
- *
- * If you plan on tinkering with NLM's settings, read below:
- *
- * textureGather only applies to luma and limited to the these configurations:
- *
- * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}
- *   - Default, very fast, rotations and reflections should be free
- *   - If this is unusually slow then try changing gpu-api and vo
- *   - If it's still slow, try setting RI/RFI to 0.
- *
- * - PS=6:RI={0,1,3}:RFI={0,1,2}
- *   - Currently the only scalable variant
- *   - Patch shape is asymmetric on two axis
- *   - Rotations should have very little speed impact
- *   - Reflections may have a significant speed impact
- *
- * Options which always disable textureGather:
- * 	- PD
- * 	- NG
- */
-
-//!HOOK CHROMA
-//!BIND LUMA
-//!WIDTH LUMA.w
-//!HEIGHT LUMA.h
-//!DESC Non-local means (RF, share)
-//!SAVE RF
-
-vec4 hook()
-{
-	return LUMA_texOff(0);
-}
-
-//!HOOK CHROMA
-//!BIND HOOKED
-//!BIND RF
-//!DESC Non-local means (nlmeans_lgc.glsl)
-
-// User variables
-
-// It is generally preferable to denoise luma and chroma differently, so the 
-// user variables for luma and chroma are split.
-
-// Denoising factor (level of blur, higher means more blur)
-#ifdef LUMA_raw
-#define S 11.66
-#else
-#define S 11.66
-#endif
-
-/* Adaptive sharpening
- *
- * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
- * weight map to restrict the sharpening to edges.
- *
- * If you just want to increase/decrease sharpness then you want to change ASF.
- *
- * Use V=4 to visualize which areas are sharpened (black means sharpen).
- *
- * AS:
- * 	- 0 to disable
- * 	- 1 to sharpen+denoise
- * 	- 2 to sharpen only
- * ASF: Higher numbers make a sharper image
- * ASP: Higher numbers use more of the sharp image
- * ASW:
- * 	- 0 to use pre-WD weights
- * 	- 1 to use post-WD weights (ASP should be ~2x to compensate)
- * ASK: Weight kernel:
- * 	- 0 for power. This is the old method.
- * 	- 1 for sigmoid. This is generally recommended.
- * 	- 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image)
- * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map
- */
-#ifdef LUMA_raw
-#define AS 0
-#define ASF 3.0
-#define ASP 1.0
-#define ASW 0
-#define ASK 1
-#define ASC 0.0
-#else
-#define AS 0
-#define ASF 3.0
-#define ASP 1.0
-#define ASW 0
-#define ASK 1
-#define ASC 0.0
-#endif
-
-/* Starting weight
- *
- * Also known as the center weight. This represents the weight of the 
- * pixel-of-interest. Lower numbers may help handle heavy noise & ringing.
- *
- * EPSILON should be used instead of zero to avoid divide-by-zero errors.
- */
-#ifdef LUMA_raw
-#define SW 0.75
-#else
-#define SW 0.75
-#endif
-
-/* Weight discard
- *
- * Discard weights that fall below a fraction of the average weight. This culls 
- * the most dissimilar samples from the blur, yielding a much more pleasant 
- * result, especially around edges.
- * 
- * WD:
- * 	- 2: True average. Better quality, but slower and requires GLSL 4.0 or later
- * 	- 1: Moving cumulative average. Inaccurate, tends to blur directionally.
- * 	- 0: Disable
- *
- * WDT: Threshold coefficient, higher numbers discard more
- * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes
- */
-#ifdef LUMA_raw
-#define WD 0
-#define WDT 0.5
-#define WDP 6.0
-#else
-#define WD 0
-#define WDT 0.75
-#define WDP 6.0
-#endif
-
-/* Extremes preserve
- *
- * Reduces denoising around very bright/dark areas.
- *
- * The downscaling factor of the EP shader stage affects what is considered a 
- * bright/dark area. The default of 3 should be fine, it's not recommended to 
- * change this.
- *
- * This is incompatible with RGB. If you have RGB hooks enabled then you will 
- * have to delete the EP shader stage or specify EP=0 through shader_cfg.
- *
- * EP: 1 to enable, 0 to disable
- * DP: EP strength on dark patches, 0 to fully denoise
- * BP: EP strength on bright patches, 0 to fully denoise
- */
-#ifdef LUMA_raw
-#define EP 0
-#define BP 0.75
-#define DP 0.25
-#else
-#define EP 0
-#define BP 0.0
-#define DP 0.0
-#endif
-
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-
-/* Patch & research sizes
- *
- * Patch size should be an odd number greater than or equal to 3. Higher values 
- * are slower and not always better.
- *
- * Research size be an odd number greater than or equal to 3. Higher values are 
- * generally better, but slower, blurrier, and gives diminishing returns.
- */
-#ifdef LUMA_raw
-#define P 3
-#define R 5
-#else
-#define P 3
-#define R 5
-#endif
-
-/* Patch and research shapes
- *
- * Different shapes have different speed and quality characteristics. Every 
- * shape (besides square) is smaller than square.
- *
- * PS applies applies to patches, RS applies to research zones.
- *
- * Be wary of gather optimizations (see the Regarding Speed comment at the top)
- *
- * 0: square (symmetrical)
- * 1: horizontal line (asymmetric)
- * 2: vertical line (asymmetric)
- * 3: diamond (symmetrical)
- * 4: triangle (asymmetric, pointing upward)
- * 5: truncated triangle (asymmetric on two axis, last row halved)
- * 6: even sized square (asymmetric on two axis)
- * 7: plus (symmetrical)
- */
-#ifdef LUMA_raw
-#define RS 3
-#define PS 3
-#else
-#define RS 3
-#define PS 3
-#endif
-
-/* Robust filtering
- *
- * This setting is dependent on code generation from shader_cfg, so this 
- * setting can only be enabled via shader_cfg.
- *
- * Compares the pixel-of-interest against a guide, which could be a downscaled 
- * image or the output of another shader
- */
-#define RF_LUMA 0
-#define RF 1
-
-/* Rotational/reflectional invariance
- *
- * Number of rotations/reflections to try for each patch comparison. Can be 
- * slow, but improves feature preservation. More rotations/reflections gives 
- * diminishing returns. The most similar rotation/reflection will be used.
- *
- * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
- * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
- *
- * RI: Rotational invariance
- * RFI (0 to 2): Reflectional invariance
- */
-#ifdef LUMA_raw
-#define RI 3
-#define RFI 2
-#else
-#define RI 3
-#define RFI 2
-#endif
-
-/* Temporal denoising
- *
- * This setting is dependent on code generation from shader_cfg, so this 
- * setting can only be enabled via shader_cfg.
- *
- * Caveats:
- * 	- Slower:
- * 		- Each frame needs to be researched (more samples & more math)
- * 		- Gather optimizations only apply to the current frame
- * 	- Requires vo=gpu-next
- * 	- Luma-only (this is a bug)
- * 	- Buggy
- *
- * May cause motion blur and may struggle more with noise that persists across 
- * multiple frames (e.g., from compression or duplicate frames), but can work 
- * very well on high quality video.
- *
- * Motion estimation (ME) should improve quality without impacting speed.
- *
- * T: number of frames used
- * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg
- * MEF: estimate factor, compensates for ME being one frame behind
- * TRF: compare against the denoised frames
- */
-#ifdef LUMA_raw
-#define T 0
-#define ME 1
-#define MEF 2
-#define TRF 0
-#else
-#define T 0
-#define ME 0
-#define MEF 2
-#define TRF 0
-#endif
-
-/* Spatial kernel
- *
- * Increasing the spatial denoising factor (SS) reduces the weight of further 
- * pixels.
- *
- * Spatial distortion instructs the spatial kernel to view that axis as 
- * closer/further, for instance SD=(1,1,0.5) would make the temporal axis 
- * appear closer and increase blur between frames.
- *
- * The intra-patch variants are supposed to help with larger patch sizes.
- *
- * SST: enables spatial kernel if R>=PST, 0 fully disables
- * SS: spatial sigma
- * SD: spatial distortion (X, Y, time)
- * PSS: intra-patch spatial sigma
- * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables
- * PSD: intra-patch spatial distortion (X, Y)
- */
-#ifdef LUMA_raw
-#define SST 1
-#define SS 0.25
-#define SD vec3(1,1,1)
-#define PST 0
-#define PSS 0.0
-#define PSD vec2(1,1)
-#else
-#define SST 1
-#define SS 0.25
-#define SD vec3(1,1,1)
-#define PST 0
-#define PSS 0.0
-#define PSD vec2(1,1)
-#endif
-
-/* Kernels
- *
- * SK: spatial kernel
- * RK: range kernel (takes patch differences)
- * PSK: intra-patch spatial kernel
- *
- * List of available kernels:
- *
- * bicubic
- * cos
- * gaussian
- * lanczos
- * quadratic
- * sinc
- * sphinx
- */
-#ifdef LUMA_raw
-#define SK gaussian
-#define RK gaussian
-#define PSK gaussian
-#else
-#define SK gaussian
-#define RK gaussian
-#define PSK gaussian
-#endif
-
-// Scaling factor (should match WIDTH/HEIGHT)
-#ifdef LUMA_raw
-#define SF 1
-#else
-#define SF 1
-#endif
-
-/* Visualization
- *
- * 0: off
- * 1: absolute difference between input/output to the power of 0.25
- * 2: difference between input/output centered on 0.5
- * 3: avg_weight
- * 4: edge map (based on the relevant AS settings)
- */
-#ifdef LUMA_raw
-#define V 0
-#else
-#define V 0
-#endif
-
-// Blur factor (0.0 returns the input image, 1.0 returns the output image)
-#ifdef LUMA_raw
-#define BF 1.0
-#else
-#define BF 1.0
-#endif
-
-// Force disable textureGather
-#ifdef LUMA_raw
-#define NG 0
-#else
-#define NG 0
-#endif
-
-// Patch donut (probably useless)
-#ifdef LUMA_raw
-#define PD 0
-#else
-#define PD 0
-#endif
-
-// Duplicate 1st weight (for luma-guided-chroma)
-#ifdef LUMA_raw
-#define D1W 1
-#else
-#define D1W 1
-#endif
-
-// Skip patch comparison
-#ifdef LUMA_raw
-#define SKIP_PATCH 0
-#else
-#define SKIP_PATCH 0
-#endif
-
-// Shader code
-
-#define EPSILON 0.00000000001
-#define M_PI 3.14159265358979323846
-#define POW2(x) ((x)*(x))
-#define POW3(x) ((x)*(x)*(x))
-#define bicubic(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0))))
-#define gaussian(x) exp(-1 * POW2(x))
-#define lanczos(x) POW2(sinc(x))
-#define quadratic(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5))
-#define sinc(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI))
-#define sphinx(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI))
-
-// XXX could maybe be better optimized on LGC
-// XXX return original alpha component instead of 1.0
-#if defined(LUMA_raw)
-#define val float
-#define val_swizz(v) (v.x)
-#define unval(v) vec4(v.x, 0, 0, 1.0)
-#define val_packed val
-#define val_pack(v) (v)
-#define val_unpack(v) (v)
-#elif defined(CHROMA_raw)
-#define val vec2
-#define val_swizz(v) (v.xy)
-#define unval(v) vec4(v.x, v.y, 0, 1.0)
-#define val_packed uint
-#define val_pack(v) packUnorm2x16(v)
-#define val_unpack(v) unpackUnorm2x16(v)
-#else
-#define val vec3
-#define val_swizz(v) (v.xyz)
-#define unval(v) vec4(v.x, v.y, v.z, 1.0)
-#define val_packed val
-#define val_pack(v) (v)
-#define val_unpack(v) (v)
-#endif
-
-#if PS == 6
-const int hp = P/2;
-#else
-const float hp = int(P/2) - 0.5*(1-(P%2)); // sample between pixels for even patch sizes
-#endif
-
-#if RS == 6
-const int hr = R/2;
-#else
-const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes
-#endif
-
-// donut increment, increments without landing on (0,0,0)
-// much faster than a continue statement
-#define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0))))
-
-// patch/research shapes
-// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R)
-// dots (.) represent samples (pixels) and X represents the pixel-of-interest
-
-// Z    .....
-// Z    .....
-// Z    ..X..
-// Z    .....
-// Z    .....
-#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr)
-
-// (in this instance Z=4)
-// Z    ....
-// Z    ....
-// Z    ..X.
-// Z    ....
-#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr)
-
-// Z-4    .
-// Z-2   ...
-// Z    ..X..
-#define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr)
-
-// Z-4    .
-// Z-2   ...
-// hz+1 ..X
-#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr)
-#define S_TRIANGLE_A(hz,Z) int(hz*hz+Z)
-
-// Z-4    .
-// Z-2   ...
-// Z    ..X..
-// Z-2   ...
-// Z-4    .
-#define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr)
-#define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z)
-
-//
-// Z    ..X..
-//
-#define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++)
-
-// 90 degree rotation of S_HORIZONTAL
-#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
-
-// 1      .
-// 1      . 
-// Z    ..X..
-// 1      . 
-// 1      .
-#define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr)
-#define S_PLUS_A(hz,Z) (Z*2 - 1)
-
-// XXX implement S_PLUS w/ an X overlayed:
-// 3    . . .
-// 3     ...
-// Z    ..X..
-// 3     ...
-// 3    . . .
-
-// XXX implement an X shape:
-// 2    .   .
-// 2     . .
-// 1      X  
-// 2     . .
-// 2    .   .
-
-// 1x1 square
-#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
-
-#define T1 (T+1)
-#define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++)
-
-#ifdef LUMA_raw
-#define RF_ RF_LUMA
-#else
-#define RF_ RF
-#endif
-
-// Skip comparing the pixel-of-interest against itself, unless RF is enabled
-#if RF_
-#define RINCR(z,c) (z.c++)
-#else
-#define RINCR DINCR
-#endif
-
-#define R_AREA(a) (a * T1 + RF_-1)
-
-// research shapes
-// XXX would be nice to have the option of temporally-varying research sizes
-#if R == 0 || R == 1
-#define FOR_RESEARCH(r) S_1X1(r)
-const int r_area = R_AREA(1);
-#elif RS == 7
-#define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y))
-const int r_area = R_AREA(S_PLUS_A(hr,R));
-#elif RS == 6
-#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R*R);
-#elif RS == 5
-#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x))
-const int r_area = R_AREA(S_TRIANGLE_A(hr,hr));
-#elif RS == 4
-#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x))
-const int r_area = R_AREA(S_TRIANGLE_A(hr,R));
-#elif RS == 3
-#define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y))
-const int r_area = R_AREA(S_DIAMOND_A(hr,R));
-#elif RS == 2
-#define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R);
-#elif RS == 1
-#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x))
-const int r_area = R_AREA(R);
-#elif RS == 0
-#define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R*R);
-#endif
-
-#define RI1 (RI+1)
-#define RFI1 (RFI+1)
-
-#if RI
-#define FOR_ROTATION for (float ri = 0; ri < 360; ri+=360.0/RI1)
-#else
-#define FOR_ROTATION
-#endif
-
-#if RFI
-#define FOR_REFLECTION for (int rfi = 0; rfi < RFI1; rfi++)
-#else
-#define FOR_REFLECTION
-#endif
-
-#if PD
-#define PINCR DINCR
-#else
-#define PINCR(z,c) (z.c++)
-#endif
-
-#define P_AREA(a) (a - PD)
-
-// patch shapes
-#if P == 0 || P == 1
-#define FOR_PATCH(p) S_1X1(p)
-const int p_area = P_AREA(1);
-#elif PS == 7
-#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y))
-const int p_area = P_AREA(S_PLUS_A(hp,P));
-#elif PS == 6
-#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P*P);
-#elif PS == 5
-#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x))
-const int p_area = P_AREA(S_TRIANGLE_A(hp,hp));
-#elif PS == 4
-#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x))
-const int p_area = P_AREA(S_TRIANGLE_A(hp,P));
-#elif PS == 3
-#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y))
-const int p_area = P_AREA(S_DIAMOND_A(hp,P));
-#elif PS == 2
-#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P);
-#elif PS == 1
-#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x))
-const int p_area = P_AREA(P);
-#elif PS == 0
-#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P*P);
-#endif
-
-const float r_scale = 1.0/r_area;
-const float p_scale = 1.0/p_area;
-
-#define sample(tex, pos, size, pt, off) tex(pos + pt * (vec2(off) + 0.5 - fract(pos*size)))
-#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off)
-
-#if RF_ && defined(LUMA_raw)
-#define load2_(off) sample(RF_LUMA_tex, RF_LUMA_pos, RF_LUMA_size, RF_LUMA_pt, off)
-#define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr)))
-#define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0)
-#elif RF_ && D1W
-#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
-#define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr)))
-#define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0)
-#elif RF_
-#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
-#else
-#define load2_(off) load_(off)
-#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr)))
-#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0)
-#endif
-
-#if T
-val load(vec3 off)
-{
-	switch (min(int(off.z), frame)) {
-	case 0: return val_swizz(load_(off));
-
-	}
-}
-val load2(vec3 off)
-{
-	return off.z == 0 ? val_swizz(load2_(off)) : load(off);
-}
-#else
-#define load(off) val_swizz(load_(off))
-#define load2(off) val_swizz(load2_(off))
-#endif
-
-val poi = load(vec3(0)); // pixel-of-interest
-val poi2 = load2(vec3(0)); // guide pixel-of-interest
-
-#if RI // rotation
-vec2 rot(vec2 p, float d)
-{
-	return vec2(
-		p.x * cos(radians(d)) - p.y * sin(radians(d)),
-		p.y * sin(radians(d)) + p.x * cos(radians(d))
-	);
-}
-#else
-#define rot(p, d) (p)
-#endif
-
-#if RFI // reflection
-vec2 ref(vec2 p, int d)
-{
-	switch (d) {
-	case 0: return p;
-	case 1: return p * vec2(1, -1);
-	case 2: return p * vec2(-1, 1);
-	}
-}
-#else
-#define ref(p, d) (p)
-#endif
-
-#if SST && R >= SST
-float spatial_r(vec3 v)
-{
-	v.xy += 0.5 - fract(HOOKED_pos*HOOKED_size);
-	return SK(length(v*SD)*SS);
-}
-#else
-#define spatial_r(v) (1)
-#endif
-
-#if PST && P >= PST
-#define spatial_p(v) PSK(length(v*PSD)*PSS)
-#else
-#define spatial_p(v) (1)
-#endif
-
-val range(val pdiff_sq)
-{
-	const float h = S*0.013;
-	const float pdiff_scale = 1.0/(h*h);
-	pdiff_sq = sqrt(pdiff_sq * pdiff_scale);
-#if defined(LUMA_raw)
-	return RK(pdiff_sq);
-#elif defined(CHROMA_raw)
-	return vec2(RK(pdiff_sq.x), RK(pdiff_sq.y));
-#else
-	return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z));
-#endif
-	//return exp(-pdiff_sq * pdiff_scale);
-
-	// weight function from the NLM paper, it's not very good
-	//return exp(-max(pdiff_sq - 2*S*S, 0.0) * pdiff_scale);
-}
-
-val patch_comparison(vec3 r, vec3 r2)
-{
-	vec3 p;
-	val min_rot = val(p_area);
-
-	FOR_ROTATION FOR_REFLECTION {
-		val pdiff_sq = val(0);
-		FOR_PATCH(p) {
-			vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z);
-			val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF);
-			diff_sq *= diff_sq;
-			diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy);
-			pdiff_sq += diff_sq;
-		}
-		min_rot = min(min_rot, pdiff_sq);
-	}
-
-	return min_rot * p_scale;
-}
-
-#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
-#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
-
-#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER
-// 3x3 diamond/plus patch_comparison_gather
-// XXX extend to support arbitrary sizes (probably requires code generation)
-// XXX extend to support 3x3 square
-// XXX support PSS
-const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
-const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
-vec4 poi_patch = gather_offs(0, offsets);
-float patch_comparison_gather(vec3 r, vec3 r2)
-{
-	float min_rot = p_area - 1;
-	vec4 transformer = gather_offs(r, offsets_sf);
-	FOR_ROTATION {
-		FOR_REFLECTION {
-			float diff_sq = dot((poi_patch - transformer) * (poi_patch - transformer), vec4(1));
-			min_rot = min(diff_sq, min_rot);
-#if RFI
-			switch(rfi) {
-			case 0: transformer = transformer.zyxw; break;
-			case 1: transformer = transformer.zwxy; break; // undoes last mirror, performs another mirror
-			case 2: transformer = transformer.zyxw; break; // undoes last mirror
-			}
-#endif
-		}
-#if RI == 3
-		transformer = transformer.wxyz;
-#elif RI == 1
-		transformer = transformer.zwxy;
-#endif
-	}
-	float center_diff_sq = poi2.x - load2(r).x;
-	center_diff_sq *= center_diff_sq;
-	return (min_rot + center_diff_sq) * p_scale;
-}
-#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER
-// tiled even square patch_comparison_gather
-// XXX extend to support odd square?
-float patch_comparison_gather(vec3 r, vec3 r2)
-{
-	vec2 tile;
-	float min_rot = p_area;
-
-	/* gather order:
-	 * w z
-	 * x y
-	 */
-	float pdiff_sq = 0;
-	for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) {
-		vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy);
-		diff_sq *= diff_sq;
-		diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)),
-			                                 spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0)));
-		pdiff_sq += dot(diff_sq, vec4(1));
-	}
-	min_rot = min(min_rot, pdiff_sq);
-
-	return min_rot * p_scale;
-}
-#else
-#define patch_comparison_gather patch_comparison
-#endif
-
-vec4 hook()
-{
-	val total_weight = val(0);
-	val sum = val(0);
-	val result = val(0);
-
-	vec3 r = vec3(0);
-	vec3 p = vec3(0);
-	vec3 me = vec3(0);
-
-#if T && ME == 1 // temporal & motion estimation
-	vec3 me_tmp = vec3(0);
-	float maxweight = 0;
-#elif T && ME == 2 // temporal & motion estimation
-	vec3 me_sum = vec3(0);
-	float me_weight = 0;
-#endif
-
-#if WD == 2 // weight discard
-	int r_index = 0;
-	val_packed all_weights[r_area];
-	val_packed all_pixels[r_area];
-#elif WD == 1 // weight discard
-	val no_weights = val(0);
-	val discard_total_weight = val(0);
-	val discard_sum = val(0);
-#endif
-
-	FOR_FRAME(r) {
-	// XXX ME is always a frame behind, should have to option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
-#if T && ME == 1 // temporal & motion estimation max weight
-	if (r.z > 0) {
-		me += me_tmp * MEF;
-		me_tmp = vec3(0);
-		maxweight = 0;
-	}
-#elif T && ME == 2 // temporal & motion estimation weighted average
-	if (r.z > 0) {
-		me += round(me_sum / me_weight * MEF);
-		me_sum = vec3(0);
-		me_weight = 0;
-	}
-#endif
-	FOR_RESEARCH(r) { // main NLM logic
-#if SKIP_PATCH
-		val weight = val(1);
-#else
-		val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(r+me, vec3(0))) : patch_comparison(r+me, vec3(0));
-		val weight = range(pdiff_sq);
-#endif
-
-#if T && ME == 1 // temporal & motion estimation max weight
-		me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x));
-		maxweight = max(maxweight, weight.x);
-#elif T && ME == 2 // temporal & motion estimation weighted average
-		me_sum += vec3(r.xy,0) * weight.x;
-		me_weight += weight.x;
-#endif
-
-#if D1W
-		weight = val(weight.x);
-#endif
-
-		weight *= spatial_r(r);
-
-#if WD == 2 // weight discard
-		all_weights[r_index] = val_pack(weight);
-		all_pixels[r_index] = val_pack(load(r+me));
-		r_index++;
-#elif WD == 1 // weight discard
-		val wd_scale = 1.0/max(no_weights, 1);
-		val keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
-		discard_sum += load(r+me) * weight * (1 - keeps);
-		discard_total_weight += weight * (1 - keeps);
-		no_weights += keeps;
-#endif
-
-		sum += load(r+me) * weight;
-		total_weight += weight;
-	} // FOR_RESEARCH
-	} // FOR_FRAME
-
-	val avg_weight = total_weight * r_scale;
-	val old_avg_weight = avg_weight;
-
-#if WD == 2 // true average
-	total_weight = val(0);
-	sum = val(0);
-	val no_weights = val(0);
-
-	for (int i = 0; i < r_area; i++) {
-		val w = val_unpack(all_weights[i]);
-		val px = val_unpack(all_pixels[i]);
-		val keeps = step(avg_weight*WDT, w);
-
-		w *= keeps;
-		sum += px * w;
-		total_weight += w;
-		no_weights += keeps;
-	}
-#elif WD == 1 // moving cumulative average
-	total_weight -= discard_total_weight;
-	sum -= discard_sum;
-#endif
-#if WD // weight discard
-	avg_weight = total_weight / no_weights;
-#endif
-
-	total_weight += SW * spatial_r(vec3(0));
-	sum += poi * SW * spatial_r(vec3(0));
-
-#if V == 3 // weight map
-	result = val(avg_weight);
-#else // mean
-	result = val(sum / total_weight);
-#endif
-
-	// store frames for temporal
-#if T > 1
-
-#endif
-#if T && TRF
-	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result));
-#elif T
-	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2));
-#endif
-
-#if ASW == 0 // pre-WD weights
-#define AS_weight old_avg_weight
-#elif ASW == 1 // post-WD weights
-#define AS_weight avg_weight
-#endif
-
-#if ASK == 0
-	val sharpening_strength = pow(AS_weight, val(ASP));
-#elif ASK == 1
-	val sharpening_strength = mix(
-			pow(smoothstep(0.0, 1.0, AS_weight), val(ASP)),
-			AS_weight, ASC);
-	// XXX normalize the result to account for a negative ASC?
-#elif ASK == 2
-	val sharpening_strength = val(ASP);
-#endif
-
-#if AS == 1 // sharpen+denoise
-	val sharpened = result + (poi - result) * ASF;
-#elif AS == 2 // sharpen only
-	val sharpened = poi + (poi - result) * ASF;
-#endif
-
-#if EP // extremes preserve
-	float luminance = EP_texOff(0).x;
-	// EPSILON is needed since pow(0,0) is undefined
-	float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP));
-	result = mix(poi, result, ep_weight);
-#endif
-
-#if AS == 1 // sharpen+denoise
-	result = mix(sharpened, result, sharpening_strength);
-#elif AS == 2 // sharpen only
-	result = mix(sharpened, poi, sharpening_strength);
-#endif
-
-#if V == 4 // edge map
-	result = sharpening_strength;
-#endif
-
-#if (V == 3 || V == 4) && defined(CHROMA_raw) // drop chroma for these visualizations
-	return vec4(0.5);
-#endif
-
-#if V == 1
-	result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0);
-#elif V == 2
-	result = (poi - result) * 0.5 + 0.5;
-#endif
-
-	return unval(mix(poi, result, BF));
-}
-
diff --git a/portable_config/shaders/nlmeans_lq.glsl b/portable_config/shaders/nlmeans_lq.glsl
deleted file mode 100644
index 80eaf745..00000000
--- a/portable_config/shaders/nlmeans_lq.glsl
+++ /dev/null
@@ -1,1086 +0,0 @@
-/* vi: ft=c
- *
- * Based on vf_nlmeans.c from FFmpeg.
- *
- * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
- * Copyright (c) 2016 Clément Bœsch <u pkh me>
- *
- * This program is free software: you can redistribute it and/or modify it 
- * under the terms of the GNU Lesser General Public License as published by 
- * the Free Software Foundation, either version 2.1 of the License, or (at 
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT 
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
- * for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License 
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
- */
-
-// Description: nlmeans_lq.glsl: Faster, but lower quality.
-
-/* The recommended usage of this shader and its variant profiles is to add them 
- * to input.conf and then dispatch the appropriate shader via a keybind during 
- * media playback. Here is an example input.conf entry:
- *
- * F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl"; show-text "Non-local means (LUMA only)"
- *
- * These shaders can also be enabled by default in mpv.conf, for example:
- *
- * glsl-shaders='~~/shaders/nlmeans.glsl'
- *
- * Both of the examples above assume the shaders are located in a subdirectory 
- * named "shaders" within mpv's config directory. Refer to the mpv 
- * documentation for more details.
- *
- * This shader is highly configurable via user variables below. Although the 
- * default settings should offer good quality at a reasonable speed, you are 
- * encouraged to tweak them to your preferences. Be mindful that certain 
- * settings may greatly affect speed.
- *
- * Denoising is most useful for noisy content. If there is no perceptible 
- * noise, you probably won't see a positive difference.
- *
- * The default settings are generally tuned for low noise and high detail 
- * preservation. The "medium" and "heavy" profiles are tuned for higher levels 
- * of noise.
- *
- * The denoiser will not work properly if the content has been upscaled 
- * beforehand (whether it was done by you or not). In such cases, consider 
- * issuing a command to downscale in the mpv console (backtick ` key):
- *
- * vf toggle scale=-2:720
- *
- * ...replacing 720 with whatever resolution seems appropriate. Rerun the 
- * command to undo the downscale. It may take some trial-and-error to find the 
- * proper resolution.
- */
-
-/* Regarding speed
- *
- * Speed may vary wildly for different vo and gpu-api settings. Generally 
- * vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this 
- * may be different for your system.
- *
- * If your GPU doesn't support textureGather, or if you are on a version of mpv 
- * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ profile
- *
- * If you plan on tinkering with NLM's settings, read below:
- *
- * textureGather only applies to luma and limited to the these configurations:
- *
- * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}
- *   - Default, very fast, rotations and reflections should be free
- *   - If this is unusually slow then try changing gpu-api and vo
- *   - If it's still slow, try setting RI/RFI to 0.
- *
- * - PS=6:RI={0,1,3}:RFI={0,1,2}
- *   - Currently the only scalable variant
- *   - Patch shape is asymmetric on two axis
- *   - Rotations should have very little speed impact
- *   - Reflections may have a significant speed impact
- *
- * Options which always disable textureGather:
- * 	- PD
- * 	- NG
- */
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!BIND HOOKED
-//!WIDTH HOOKED.w 1.25 /
-//!HEIGHT HOOKED.h 1.25 /
-//!DESC Non-local means (PRERF)
-//!SAVE PRERF_LUMA
-
-vec4 hook()
-{
-	return HOOKED_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!BIND PRERF_LUMA
-//!WIDTH HOOKED.w
-//!HEIGHT HOOKED.h
-//!DESC Non-local means (RF)
-//!SAVE RF_LUMA
-
-vec4 hook()
-{
-	return PRERF_LUMA_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!BIND RF_LUMA
-//!WIDTH RF_LUMA.w
-//!HEIGHT RF_LUMA.h
-//!DESC Non-local means (RF, share)
-//!SAVE RF
-
-vec4 hook()
-{
-	return RF_LUMA_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!BIND LUMA
-//!WIDTH LUMA.w 3 /
-//!HEIGHT LUMA.h 3 /
-//!DESC Non-local means (EP)
-//!SAVE EP
-
-vec4 hook()
-{
-	return LUMA_texOff(0);
-}
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!BIND HOOKED
-//!BIND RF_LUMA
-//!BIND RF
-//!BIND EP
-//!DESC Non-local means (nlmeans_lq.glsl)
-
-// User variables
-
-// It is generally preferable to denoise luma and chroma differently, so the 
-// user variables for luma and chroma are split.
-
-// Denoising factor (level of blur, higher means more blur)
-#ifdef LUMA_raw
-#define S 1.25
-#else
-#define S 5.0
-#endif
-
-/* Adaptive sharpening
- *
- * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
- * weight map to restrict the sharpening to edges.
- *
- * If you just want to increase/decrease sharpness then you want to change ASF.
- *
- * Use V=4 to visualize which areas are sharpened (black means sharpen).
- *
- * AS:
- * 	- 0 to disable
- * 	- 1 to sharpen+denoise
- * 	- 2 to sharpen only
- * ASF: Higher numbers make a sharper image
- * ASP: Higher numbers use more of the sharp image
- * ASW:
- * 	- 0 to use pre-WD weights
- * 	- 1 to use post-WD weights (ASP should be ~2x to compensate)
- * ASK: Weight kernel:
- * 	- 0 for power. This is the old method.
- * 	- 1 for sigmoid. This is generally recommended.
- * 	- 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image)
- * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map
- */
-#ifdef LUMA_raw
-#define AS 0
-#define ASF 3.0
-#define ASP 1.0
-#define ASW 0
-#define ASK 1
-#define ASC 0.0
-#else
-#define AS 0
-#define ASF 3.0
-#define ASP 1.0
-#define ASW 0
-#define ASK 1
-#define ASC 0.0
-#endif
-
-/* Starting weight
- *
- * Also known as the center weight. This represents the weight of the 
- * pixel-of-interest. Lower numbers may help handle heavy noise & ringing.
- *
- * EPSILON should be used instead of zero to avoid divide-by-zero errors.
- */
-#ifdef LUMA_raw
-#define SW 1.0
-#else
-#define SW 0.5
-#endif
-
-/* Weight discard
- *
- * Discard weights that fall below a fraction of the average weight. This culls 
- * the most dissimilar samples from the blur, yielding a much more pleasant 
- * result, especially around edges.
- * 
- * WD:
- * 	- 2: True average. Better quality, but slower and requires GLSL 4.0 or later
- * 	- 1: Moving cumulative average. Inaccurate, tends to blur directionally.
- * 	- 0: Disable
- *
- * WDT: Threshold coefficient, higher numbers discard more
- * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes
- */
-#ifdef LUMA_raw
-#define WD 1
-#define WDT 0.5
-#define WDP 6.0
-#else
-#define WD 1
-#define WDT 0.75
-#define WDP 6.0
-#endif
-
-/* Extremes preserve
- *
- * Reduces denoising around very bright/dark areas.
- *
- * The downscaling factor of the EP shader stage affects what is considered a 
- * bright/dark area. The default of 3 should be fine, it's not recommended to 
- * change this.
- *
- * This is incompatible with RGB. If you have RGB hooks enabled then you will 
- * have to delete the EP shader stage or specify EP=0 through shader_cfg.
- *
- * EP: 1 to enable, 0 to disable
- * DP: EP strength on dark patches, 0 to fully denoise
- * BP: EP strength on bright patches, 0 to fully denoise
- */
-#ifdef LUMA_raw
-#define EP 1
-#define BP 0.75
-#define DP 0.25
-#else
-#define EP 0
-#define BP 0.0
-#define DP 0.0
-#endif
-
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
-
-/* Patch & research sizes
- *
- * Patch size should be an odd number greater than or equal to 3. Higher values 
- * are slower and not always better.
- *
- * Research size be an odd number greater than or equal to 3. Higher values are 
- * generally better, but slower, blurrier, and gives diminishing returns.
- */
-#ifdef LUMA_raw
-#define P 3
-#define R 3
-#else
-#define P 3
-#define R 5
-#endif
-
-/* Patch and research shapes
- *
- * Different shapes have different speed and quality characteristics. Every 
- * shape (besides square) is smaller than square.
- *
- * PS applies applies to patches, RS applies to research zones.
- *
- * Be wary of gather optimizations (see the Regarding Speed comment at the top)
- *
- * 0: square (symmetrical)
- * 1: horizontal line (asymmetric)
- * 2: vertical line (asymmetric)
- * 3: diamond (symmetrical)
- * 4: triangle (asymmetric, pointing upward)
- * 5: truncated triangle (asymmetric on two axis, last row halved)
- * 6: even sized square (asymmetric on two axis)
- * 7: plus (symmetrical)
- */
-#ifdef LUMA_raw
-#define RS 3
-#define PS 3
-#else
-#define RS 3
-#define PS 3
-#endif
-
-/* Robust filtering
- *
- * This setting is dependent on code generation from shader_cfg, so this 
- * setting can only be enabled via shader_cfg.
- *
- * Compares the pixel-of-interest against a guide, which could be a downscaled 
- * image or the output of another shader
- */
-#define RF_LUMA 1
-#define RF 1
-
-/* Rotational/reflectional invariance
- *
- * Number of rotations/reflections to try for each patch comparison. Can be 
- * slow, but improves feature preservation. More rotations/reflections gives 
- * diminishing returns. The most similar rotation/reflection will be used.
- *
- * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
- * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
- *
- * RI: Rotational invariance
- * RFI (0 to 2): Reflectional invariance
- */
-#ifdef LUMA_raw
-#define RI 0
-#define RFI 0
-#else
-#define RI 0
-#define RFI 0
-#endif
-
-/* Temporal denoising
- *
- * This setting is dependent on code generation from shader_cfg, so this 
- * setting can only be enabled via shader_cfg.
- *
- * Caveats:
- * 	- Slower:
- * 		- Each frame needs to be researched (more samples & more math)
- * 		- Gather optimizations only apply to the current frame
- * 	- Requires vo=gpu-next
- * 	- Luma-only (this is a bug)
- * 	- Buggy
- *
- * May cause motion blur and may struggle more with noise that persists across 
- * multiple frames (e.g., from compression or duplicate frames), but can work 
- * very well on high quality video.
- *
- * Motion estimation (ME) should improve quality without impacting speed.
- *
- * T: number of frames used
- * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg
- * MEF: estimate factor, compensates for ME being one frame behind
- * TRF: compare against the denoised frames
- */
-#ifdef LUMA_raw
-#define T 0
-#define ME 1
-#define MEF 2
-#define TRF 0
-#else
-#define T 0
-#define ME 0
-#define MEF 2
-#define TRF 0
-#endif
-
-/* Spatial kernel
- *
- * Increasing the spatial denoising factor (SS) reduces the weight of further 
- * pixels.
- *
- * Spatial distortion instructs the spatial kernel to view that axis as 
- * closer/further, for instance SD=(1,1,0.5) would make the temporal axis 
- * appear closer and increase blur between frames.
- *
- * The intra-patch variants are supposed to help with larger patch sizes.
- *
- * SST: enables spatial kernel if R>=PST, 0 fully disables
- * SS: spatial sigma
- * SD: spatial distortion (X, Y, time)
- * PSS: intra-patch spatial sigma
- * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables
- * PSD: intra-patch spatial distortion (X, Y)
- */
-#ifdef LUMA_raw
-#define SST 1
-#define SS 0.25
-#define SD vec3(1,1,1)
-#define PST 0
-#define PSS 0.0
-#define PSD vec2(1,1)
-#else
-#define SST 1
-#define SS 0.25
-#define SD vec3(1,1,1)
-#define PST 0
-#define PSS 0.0
-#define PSD vec2(1,1)
-#endif
-
-/* Kernels
- *
- * SK: spatial kernel
- * RK: range kernel (takes patch differences)
- * PSK: intra-patch spatial kernel
- *
- * List of available kernels:
- *
- * bicubic
- * cos
- * gaussian
- * lanczos
- * quadratic
- * sinc
- * sphinx
- */
-#ifdef LUMA_raw
-#define SK gaussian
-#define RK gaussian
-#define PSK gaussian
-#else
-#define SK gaussian
-#define RK gaussian
-#define PSK gaussian
-#endif
-
-// Scaling factor (should match WIDTH/HEIGHT)
-#ifdef LUMA_raw
-#define SF 1
-#else
-#define SF 1
-#endif
-
-/* Visualization
- *
- * 0: off
- * 1: absolute difference between input/output to the power of 0.25
- * 2: difference between input/output centered on 0.5
- * 3: avg_weight
- * 4: edge map (based on the relevant AS settings)
- */
-#ifdef LUMA_raw
-#define V 0
-#else
-#define V 0
-#endif
-
-// Blur factor (0.0 returns the input image, 1.0 returns the output image)
-#ifdef LUMA_raw
-#define BF 1.0
-#else
-#define BF 1.0
-#endif
-
-// Force disable textureGather
-#ifdef LUMA_raw
-#define NG 0
-#else
-#define NG 0
-#endif
-
-// Patch donut (probably useless)
-#ifdef LUMA_raw
-#define PD 0
-#else
-#define PD 0
-#endif
-
-// Duplicate 1st weight (for luma-guided-chroma)
-#ifdef LUMA_raw
-#define D1W 0
-#else
-#define D1W 0
-#endif
-
-// Skip patch comparison
-#ifdef LUMA_raw
-#define SKIP_PATCH 0
-#else
-#define SKIP_PATCH 0
-#endif
-
-// Shader code
-
-#define EPSILON 0.00000000001
-#define M_PI 3.14159265358979323846
-#define POW2(x) ((x)*(x))
-#define POW3(x) ((x)*(x)*(x))
-#define bicubic(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0))))
-#define gaussian(x) exp(-1 * POW2(x))
-#define lanczos(x) POW2(sinc(x))
-#define quadratic(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5))
-#define sinc(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI))
-#define sphinx(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI))
-
-// XXX could maybe be better optimized on LGC
-// XXX return original alpha component instead of 1.0
-#if defined(LUMA_raw)
-#define val float
-#define val_swizz(v) (v.x)
-#define unval(v) vec4(v.x, 0, 0, 1.0)
-#define val_packed val
-#define val_pack(v) (v)
-#define val_unpack(v) (v)
-#elif defined(CHROMA_raw)
-#define val vec2
-#define val_swizz(v) (v.xy)
-#define unval(v) vec4(v.x, v.y, 0, 1.0)
-#define val_packed uint
-#define val_pack(v) packUnorm2x16(v)
-#define val_unpack(v) unpackUnorm2x16(v)
-#else
-#define val vec3
-#define val_swizz(v) (v.xyz)
-#define unval(v) vec4(v.x, v.y, v.z, 1.0)
-#define val_packed val
-#define val_pack(v) (v)
-#define val_unpack(v) (v)
-#endif
-
-#if PS == 6
-const int hp = P/2;
-#else
-const float hp = int(P/2) - 0.5*(1-(P%2)); // sample between pixels for even patch sizes
-#endif
-
-#if RS == 6
-const int hr = R/2;
-#else
-const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes
-#endif
-
-// donut increment, increments without landing on (0,0,0)
-// much faster than a continue statement
-#define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0))))
-
-// patch/research shapes
-// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R)
-// dots (.) represent samples (pixels) and X represents the pixel-of-interest
-
-// Z    .....
-// Z    .....
-// Z    ..X..
-// Z    .....
-// Z    .....
-#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr)
-
-// (in this instance Z=4)
-// Z    ....
-// Z    ....
-// Z    ..X.
-// Z    ....
-#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr)
-
-// Z-4    .
-// Z-2   ...
-// Z    ..X..
-#define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr)
-
-// Z-4    .
-// Z-2   ...
-// hz+1 ..X
-#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr)
-#define S_TRIANGLE_A(hz,Z) int(hz*hz+Z)
-
-// Z-4    .
-// Z-2   ...
-// Z    ..X..
-// Z-2   ...
-// Z-4    .
-#define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr)
-#define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z)
-
-//
-// Z    ..X..
-//
-#define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++)
-
-// 90 degree rotation of S_HORIZONTAL
-#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
-
-// 1      .
-// 1      . 
-// Z    ..X..
-// 1      . 
-// 1      .
-#define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr)
-#define S_PLUS_A(hz,Z) (Z*2 - 1)
-
-// XXX implement S_PLUS w/ an X overlayed:
-// 3    . . .
-// 3     ...
-// Z    ..X..
-// 3     ...
-// 3    . . .
-
-// XXX implement an X shape:
-// 2    .   .
-// 2     . .
-// 1      X  
-// 2     . .
-// 2    .   .
-
-// 1x1 square
-#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
-
-#define T1 (T+1)
-#define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++)
-
-#ifdef LUMA_raw
-#define RF_ RF_LUMA
-#else
-#define RF_ RF
-#endif
-
-// Skip comparing the pixel-of-interest against itself, unless RF is enabled
-#if RF_
-#define RINCR(z,c) (z.c++)
-#else
-#define RINCR DINCR
-#endif
-
-#define R_AREA(a) (a * T1 + RF_-1)
-
-// research shapes
-// XXX would be nice to have the option of temporally-varying research sizes
-#if R == 0 || R == 1
-#define FOR_RESEARCH(r) S_1X1(r)
-const int r_area = R_AREA(1);
-#elif RS == 7
-#define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y))
-const int r_area = R_AREA(S_PLUS_A(hr,R));
-#elif RS == 6
-#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R*R);
-#elif RS == 5
-#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x))
-const int r_area = R_AREA(S_TRIANGLE_A(hr,hr));
-#elif RS == 4
-#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x))
-const int r_area = R_AREA(S_TRIANGLE_A(hr,R));
-#elif RS == 3
-#define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y))
-const int r_area = R_AREA(S_DIAMOND_A(hr,R));
-#elif RS == 2
-#define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R);
-#elif RS == 1
-#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x))
-const int r_area = R_AREA(R);
-#elif RS == 0
-#define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y))
-const int r_area = R_AREA(R*R);
-#endif
-
-#define RI1 (RI+1)
-#define RFI1 (RFI+1)
-
-#if RI
-#define FOR_ROTATION for (float ri = 0; ri < 360; ri+=360.0/RI1)
-#else
-#define FOR_ROTATION
-#endif
-
-#if RFI
-#define FOR_REFLECTION for (int rfi = 0; rfi < RFI1; rfi++)
-#else
-#define FOR_REFLECTION
-#endif
-
-#if PD
-#define PINCR DINCR
-#else
-#define PINCR(z,c) (z.c++)
-#endif
-
-#define P_AREA(a) (a - PD)
-
-// patch shapes
-#if P == 0 || P == 1
-#define FOR_PATCH(p) S_1X1(p)
-const int p_area = P_AREA(1);
-#elif PS == 7
-#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y))
-const int p_area = P_AREA(S_PLUS_A(hp,P));
-#elif PS == 6
-#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P*P);
-#elif PS == 5
-#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x))
-const int p_area = P_AREA(S_TRIANGLE_A(hp,hp));
-#elif PS == 4
-#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x))
-const int p_area = P_AREA(S_TRIANGLE_A(hp,P));
-#elif PS == 3
-#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y))
-const int p_area = P_AREA(S_DIAMOND_A(hp,P));
-#elif PS == 2
-#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P);
-#elif PS == 1
-#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x))
-const int p_area = P_AREA(P);
-#elif PS == 0
-#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y))
-const int p_area = P_AREA(P*P);
-#endif
-
-const float r_scale = 1.0/r_area;
-const float p_scale = 1.0/p_area;
-
-#define sample(tex, pos, size, pt, off) tex(pos + pt * (vec2(off) + 0.5 - fract(pos*size)))
-#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off)
-
-#if RF_ && defined(LUMA_raw)
-#define load2_(off) sample(RF_LUMA_tex, RF_LUMA_pos, RF_LUMA_size, RF_LUMA_pt, off)
-#define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr)))
-#define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0)
-#elif RF_ && D1W
-#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
-#define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr)))
-#define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0)
-#elif RF_
-#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
-#else
-#define load2_(off) load_(off)
-#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr)))
-#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0)
-#endif
-
-#if T
-val load(vec3 off)
-{
-	switch (min(int(off.z), frame)) {
-	case 0: return val_swizz(load_(off));
-
-	}
-}
-val load2(vec3 off)
-{
-	return off.z == 0 ? val_swizz(load2_(off)) : load(off);
-}
-#else
-#define load(off) val_swizz(load_(off))
-#define load2(off) val_swizz(load2_(off))
-#endif
-
-val poi = load(vec3(0)); // pixel-of-interest
-val poi2 = load2(vec3(0)); // guide pixel-of-interest
-
-#if RI // rotation
-vec2 rot(vec2 p, float d)
-{
-	return vec2(
-		p.x * cos(radians(d)) - p.y * sin(radians(d)),
-		p.y * sin(radians(d)) + p.x * cos(radians(d))
-	);
-}
-#else
-#define rot(p, d) (p)
-#endif
-
-#if RFI // reflection
-vec2 ref(vec2 p, int d)
-{
-	switch (d) {
-	case 0: return p;
-	case 1: return p * vec2(1, -1);
-	case 2: return p * vec2(-1, 1);
-	}
-}
-#else
-#define ref(p, d) (p)
-#endif
-
-#if SST && R >= SST
-float spatial_r(vec3 v)
-{
-	v.xy += 0.5 - fract(HOOKED_pos*HOOKED_size);
-	return SK(length(v*SD)*SS);
-}
-#else
-#define spatial_r(v) (1)
-#endif
-
-#if PST && P >= PST
-#define spatial_p(v) PSK(length(v*PSD)*PSS)
-#else
-#define spatial_p(v) (1)
-#endif
-
-val range(val pdiff_sq)
-{
-	const float h = S*0.013;
-	const float pdiff_scale = 1.0/(h*h);
-	pdiff_sq = sqrt(pdiff_sq * pdiff_scale);
-#if defined(LUMA_raw)
-	return RK(pdiff_sq);
-#elif defined(CHROMA_raw)
-	return vec2(RK(pdiff_sq.x), RK(pdiff_sq.y));
-#else
-	return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z));
-#endif
-	//return exp(-pdiff_sq * pdiff_scale);
-
-	// weight function from the NLM paper, it's not very good
-	//return exp(-max(pdiff_sq - 2*S*S, 0.0) * pdiff_scale);
-}
-
-val patch_comparison(vec3 r, vec3 r2)
-{
-	vec3 p;
-	val min_rot = val(p_area);
-
-	FOR_ROTATION FOR_REFLECTION {
-		val pdiff_sq = val(0);
-		FOR_PATCH(p) {
-			vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z);
-			val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF);
-			diff_sq *= diff_sq;
-			diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy);
-			pdiff_sq += diff_sq;
-		}
-		min_rot = min(min_rot, pdiff_sq);
-	}
-
-	return min_rot * p_scale;
-}
-
-#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
-#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
-
-#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER
-// 3x3 diamond/plus patch_comparison_gather
-// XXX extend to support arbitrary sizes (probably requires code generation)
-// XXX extend to support 3x3 square
-// XXX support PSS
-const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
-const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
-vec4 poi_patch = gather_offs(0, offsets);
-float patch_comparison_gather(vec3 r, vec3 r2)
-{
-	float min_rot = p_area - 1;
-	vec4 transformer = gather_offs(r, offsets_sf);
-	FOR_ROTATION {
-		FOR_REFLECTION {
-			float diff_sq = dot((poi_patch - transformer) * (poi_patch - transformer), vec4(1));
-			min_rot = min(diff_sq, min_rot);
-#if RFI
-			switch(rfi) {
-			case 0: transformer = transformer.zyxw; break;
-			case 1: transformer = transformer.zwxy; break; // undoes last mirror, performs another mirror
-			case 2: transformer = transformer.zyxw; break; // undoes last mirror
-			}
-#endif
-		}
-#if RI == 3
-		transformer = transformer.wxyz;
-#elif RI == 1
-		transformer = transformer.zwxy;
-#endif
-	}
-	float center_diff_sq = poi2.x - load2(r).x;
-	center_diff_sq *= center_diff_sq;
-	return (min_rot + center_diff_sq) * p_scale;
-}
-#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER
-// tiled even square patch_comparison_gather
-// XXX extend to support odd square?
-float patch_comparison_gather(vec3 r, vec3 r2)
-{
-	vec2 tile;
-	float min_rot = p_area;
-
-	/* gather order:
-	 * w z
-	 * x y
-	 */
-	float pdiff_sq = 0;
-	for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) {
-		vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy);
-		diff_sq *= diff_sq;
-		diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)),
-			                                 spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0)));
-		pdiff_sq += dot(diff_sq, vec4(1));
-	}
-	min_rot = min(min_rot, pdiff_sq);
-
-	return min_rot * p_scale;
-}
-#else
-#define patch_comparison_gather patch_comparison
-#endif
-
-vec4 hook()
-{
-	val total_weight = val(0);
-	val sum = val(0);
-	val result = val(0);
-
-	vec3 r = vec3(0);
-	vec3 p = vec3(0);
-	vec3 me = vec3(0);
-
-#if T && ME == 1 // temporal & motion estimation
-	vec3 me_tmp = vec3(0);
-	float maxweight = 0;
-#elif T && ME == 2 // temporal & motion estimation
-	vec3 me_sum = vec3(0);
-	float me_weight = 0;
-#endif
-
-#if WD == 2 // weight discard
-	int r_index = 0;
-	val_packed all_weights[r_area];
-	val_packed all_pixels[r_area];
-#elif WD == 1 // weight discard
-	val no_weights = val(0);
-	val discard_total_weight = val(0);
-	val discard_sum = val(0);
-#endif
-
-	FOR_FRAME(r) {
-	// XXX ME is always a frame behind, should have to option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
-#if T && ME == 1 // temporal & motion estimation max weight
-	if (r.z > 0) {
-		me += me_tmp * MEF;
-		me_tmp = vec3(0);
-		maxweight = 0;
-	}
-#elif T && ME == 2 // temporal & motion estimation weighted average
-	if (r.z > 0) {
-		me += round(me_sum / me_weight * MEF);
-		me_sum = vec3(0);
-		me_weight = 0;
-	}
-#endif
-	FOR_RESEARCH(r) { // main NLM logic
-#if SKIP_PATCH
-		val weight = val(1);
-#else
-		val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(r+me, vec3(0))) : patch_comparison(r+me, vec3(0));
-		val weight = range(pdiff_sq);
-#endif
-
-#if T && ME == 1 // temporal & motion estimation max weight
-		me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x));
-		maxweight = max(maxweight, weight.x);
-#elif T && ME == 2 // temporal & motion estimation weighted average
-		me_sum += vec3(r.xy,0) * weight.x;
-		me_weight += weight.x;
-#endif
-
-#if D1W
-		weight = val(weight.x);
-#endif
-
-		weight *= spatial_r(r);
-
-#if WD == 2 // weight discard
-		all_weights[r_index] = val_pack(weight);
-		all_pixels[r_index] = val_pack(load(r+me));
-		r_index++;
-#elif WD == 1 // weight discard
-		val wd_scale = 1.0/max(no_weights, 1);
-		val keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
-		discard_sum += load(r+me) * weight * (1 - keeps);
-		discard_total_weight += weight * (1 - keeps);
-		no_weights += keeps;
-#endif
-
-		sum += load(r+me) * weight;
-		total_weight += weight;
-	} // FOR_RESEARCH
-	} // FOR_FRAME
-
-	val avg_weight = total_weight * r_scale;
-	val old_avg_weight = avg_weight;
-
-#if WD == 2 // true average
-	total_weight = val(0);
-	sum = val(0);
-	val no_weights = val(0);
-
-	for (int i = 0; i < r_area; i++) {
-		val w = val_unpack(all_weights[i]);
-		val px = val_unpack(all_pixels[i]);
-		val keeps = step(avg_weight*WDT, w);
-
-		w *= keeps;
-		sum += px * w;
-		total_weight += w;
-		no_weights += keeps;
-	}
-#elif WD == 1 // moving cumulative average
-	total_weight -= discard_total_weight;
-	sum -= discard_sum;
-#endif
-#if WD // weight discard
-	avg_weight = total_weight / no_weights;
-#endif
-
-	total_weight += SW * spatial_r(vec3(0));
-	sum += poi * SW * spatial_r(vec3(0));
-
-#if V == 3 // weight map
-	result = val(avg_weight);
-#else // mean
-	result = val(sum / total_weight);
-#endif
-
-	// store frames for temporal
-#if T > 1
-
-#endif
-#if T && TRF
-	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result));
-#elif T
-	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2));
-#endif
-
-#if ASW == 0 // pre-WD weights
-#define AS_weight old_avg_weight
-#elif ASW == 1 // post-WD weights
-#define AS_weight avg_weight
-#endif
-
-#if ASK == 0
-	val sharpening_strength = pow(AS_weight, val(ASP));
-#elif ASK == 1
-	val sharpening_strength = mix(
-			pow(smoothstep(0.0, 1.0, AS_weight), val(ASP)),
-			AS_weight, ASC);
-	// XXX normalize the result to account for a negative ASC?
-#elif ASK == 2
-	val sharpening_strength = val(ASP);
-#endif
-
-#if AS == 1 // sharpen+denoise
-	val sharpened = result + (poi - result) * ASF;
-#elif AS == 2 // sharpen only
-	val sharpened = poi + (poi - result) * ASF;
-#endif
-
-#if EP // extremes preserve
-	float luminance = EP_texOff(0).x;
-	// EPSILON is needed since pow(0,0) is undefined
-	float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP));
-	result = mix(poi, result, ep_weight);
-#endif
-
-#if AS == 1 // sharpen+denoise
-	result = mix(sharpened, result, sharpening_strength);
-#elif AS == 2 // sharpen only
-	result = mix(sharpened, poi, sharpening_strength);
-#endif
-
-#if V == 4 // edge map
-	result = sharpening_strength;
-#endif
-
-#if (V == 3 || V == 4) && defined(CHROMA_raw) // drop chroma for these visualizations
-	return vec4(0.5);
-#endif
-
-#if V == 1
-	result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0);
-#elif V == 2
-	result = (poi - result) * 0.5 + 0.5;
-#endif
-
-	return unval(mix(poi, result, BF));
-}
-
diff --git a/portable_config/shaders/nlmeans_temporal.glsl b/portable_config/shaders/nlmeans_temporal.glsl
index a3bf340d..c3d16f66 100644
--- a/portable_config/shaders/nlmeans_temporal.glsl
+++ b/portable_config/shaders/nlmeans_temporal.glsl
@@ -21,299 +21,1121 @@
 
 // Description: nlmeans_temporal.glsl: Very experimental and buggy, limited to vo=gpu-next.
 
-/* The recommended usage of this shader and its variant profiles is to add them 
- * to input.conf and then dispatch the appropriate shader via a keybind during 
- * media playback. Here is an example input.conf entry:
+/* This shader is highly configurable via user variables below. Although the 
+ * default settings should offer good quality at a reasonable speed, you are 
+ * encouraged to tweak them to your preferences.
+ */
+
+// The following is shader code injected from ../LQ/nlmeans.glsl
+/* vi: ft=c
  *
- * F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl"; show-text "Non-local means (LUMA only)"
+ * Based on vf_nlmeans.c from FFmpeg.
  *
- * These shaders can also be enabled by default in mpv.conf, for example:
+ * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
+ * Copyright (c) 2016 Clément Bœsch <u pkh me>
  *
- * glsl-shaders='~~/shaders/nlmeans.glsl'
+ * This program is free software: you can redistribute it and/or modify it 
+ * under the terms of the GNU Lesser General Public License as published by 
+ * the Free Software Foundation, either version 2.1 of the License, or (at 
+ * your option) any later version.
  *
- * Both of the examples above assume the shaders are located in a subdirectory 
- * named "shaders" within mpv's config directory. Refer to the mpv 
- * documentation for more details.
+ * This program is distributed in the hope that it will be useful, but WITHOUT 
+ * ANY WARRANTY;  without even the implied warranty of MERCHANTABILITY or 
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
+ * for more details.
  *
- * This shader is highly configurable via user variables below. Although the 
+ * You should have received a copy of the GNU Lesser General Public License 
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+// Description: nlmeans.glsl: Faster, but lower quality.
+
+/* This shader is highly configurable via user variables below. Although the 
  * default settings should offer good quality at a reasonable speed, you are 
- * encouraged to tweak them to your preferences. Be mindful that certain 
- * settings may greatly affect speed.
+ * encouraged to tweak them to your preferences.
+ */
+
+//!HOOK LUMA
+//!HOOK CHROMA
+//!BIND HOOKED
+//!DESC Non-local means (nlmeans.glsl)
+//!SAVE RF_LUMA
+
+// User variables
+
+// It is generally preferable to denoise luma and chroma differently, so the 
+// user variables for luma and chroma are split.
+
+// Denoising factor (level of blur, higher means more blur)
+#ifdef LUMA_raw
+#define S 3.5968056672833097
+#else
+#define S 5.191526541606411
+#endif
+
+/* Adaptive sharpening
+ *
+ * Performs an unsharp mask by subtracting the spatial kernel's blur from the 
+ * NLM blur. For sharpen+denoise the sharpening is limited to edge areas and 
+ * denoising is done everywhere else.
+ *
+ * Use V=4 to visualize which areas are sharpened (black means sharpen).
  *
- * Denoising is most useful for noisy content. If there is no perceptible 
- * noise, you probably won't see a positive difference.
+ * AS:
+ * 	 - 0: disable
+ * 	 - 1: sharpen+denoise
+ * 	 - 2: sharpen only
+ * ASF: Higher numbers make a sharper image
+ * ASA: Anti-ringing, higher numbers increase strength
+ * ASP: Power, lower numbers increase sharpening on lower frequency detail
+ */
+#ifdef LUMA_raw
+#define AS 0
+#define ASF 0.1625
+#define ASA 5.0
+#define ASP 0.5
+#else
+#define AS 0
+#define ASF 0.1625
+#define ASA 5.0
+#define ASP 0.5
+#endif
+
+/* Starting weight
  *
- * The default settings are generally tuned for low noise and high detail 
- * preservation. The "medium" and "heavy" profiles are tuned for higher levels 
- * of noise.
+ * Also known as the center weight. This represents the weight of the 
+ * pixel-of-interest. Lower numbers may help handle heavy noise & ringing.
  *
- * The denoiser will not work properly if the content has been upscaled 
- * beforehand (whether it was done by you or not). In such cases, consider 
- * issuing a command to downscale in the mpv console (backtick ` key):
+ * EPSILON should be used instead of zero to avoid divide-by-zero errors.
+ */
+#ifdef LUMA_raw
+#define SW 0.7392620481427672
+#else
+#define SW 0.6448288408806067
+#endif
+
+/* Weight discard
  *
- * vf toggle scale=-2:720
+ * Reduces weights that fall below a fraction of the average weight. This culls 
+ * the most dissimilar samples from the blur, which can yield a better result, 
+ * especially around edges.
+ * 
+ * WD:
+ * 	 - 2: Mean. Better quality, but slower and requires GLSL 4.0 or later
+ * 	 - 1: Moving cumulative average. Fast but inaccurate, blurs directionally.
+ * 	 - 0: Disable
  *
- * ...replacing 720 with whatever resolution seems appropriate. Rerun the 
- * command to undo the downscale. It may take some trial-and-error to find the 
- * proper resolution.
+ * WDT: Threshold coefficient, higher numbers discard more
+ * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes
+ * WDS (not for WDK=is_zero): Higher numbers are more eager to reduce weights
  */
+#ifdef LUMA_raw
+#define WD 1
+#define WDT 0.580415381682815
+#define WDP 5.381278367349288
+#define WDS 1.0
+#else
+#define WD 1
+#define WDT 0.913447511792627
+#define WDP 5.832936323930807
+#define WDS 1.0
+#endif
 
-/* Regarding speed
+/* Extremes preserve
  *
- * Speed may vary wildly for different vo and gpu-api settings. Generally 
- * vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this 
- * may be different for your system.
+ * Reduce denoising in very bright/dark areas.
  *
- * If your GPU doesn't support textureGather, or if you are on a version of mpv 
- * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ profile
+ * Disabled by default now. If you want to reenable this, set EP=3/ in 
+ * Makefile.nlm and rebuild.
  *
- * If you plan on tinkering with NLM's settings, read below:
+ * The downscaling factor of the EP shader stage affects what is considered a 
+ * bright/dark area.
  *
- * textureGather only applies to luma and limited to the these configurations:
+ * This is incompatible with RGB. If you have RGB hooks enabled then you will 
+ * have to delete the EP shader stage or specify EP=0 through shader_cfg.
  *
- * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}
- *   - Default, very fast, rotations and reflections should be free
- *   - If this is unusually slow then try changing gpu-api and vo
- *   - If it's still slow, try setting RI/RFI to 0.
+ * EP: 1 to enable, 0 to disable
+ * DP: EP strength on dark areas, 0 to fully denoise
+ * BP: EP strength on bright areas, 0 to fully denoise
+ */
+#ifdef LUMA_raw
+#define EP 0
+#define BP 0.75
+#define DP 0.25
+#else
+#define EP 0
+#define BP 0.0
+#define DP 0.0
+#endif
+
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
+
+/* textureGather applicable configurations:
  *
- * - PS=6:RI={0,1,3}:RFI={0,1,2}
+ * - PS={0,3,7,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2}
+ * - PS={0,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2}
+ * - PS=6:RI=0:RFI=0
  *   - Currently the only scalable variant
- *   - Patch shape is asymmetric on two axis
- *   - Rotations should have very little speed impact
- *   - Reflections may have a significant speed impact
  *
  * Options which always disable textureGather:
- * 	- PD
- * 	- NG
+ * 	 - NG
+ * 	 - SAMPLE
+ * 	 - PD
+ *
+ * Running without textureGather may be much slower.
  */
 
-// The following is shader code injected from guided.glsl
-/* vi: ft=c
+/* Patch & research sizes
  *
- * Copyright (c) 2022 an3223 <ethanr2048@gmail.com>
+ * P should be an odd number. Higher values are slower and not always better.
  *
- * This program is free software: you can redistribute it and/or modify it 
- * under the terms of the GNU Lesser General Public License as published by 
- * the Free Software Foundation, either version 2.1 of the License, or (at 
- * your option) any later version.
+ * R should be an odd number greater than or equal to 3. Higher values are 
+ * generally better, but slower, blurrier, and gives diminishing returns.
+ */
+#ifdef LUMA_raw
+#define P 3
+#define R 5
+#else
+#define P 3
+#define R 5
+#endif
+
+/* Patch and research shapes
  *
- * This program is distributed in the hope that it will be useful, but WITHOUT 
- * ANY WARRANTY;  without even the implied warranty of MERCHANTABILITY or 
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 
- * for more details.
+ * Different shapes have different speed and quality characteristics. Every 
+ * shape (besides square) is smaller than square.
  *
- * You should have received a copy of the GNU Lesser General Public License 
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ * PS applies to patches, RS applies to research zones.
+ *
+ * 0: square (symmetrical)
+ * 1: horizontal line (asymmetric)
+ * 2: vertical line (asymmetric)
+ * 3: diamond (symmetrical)
+ * 4: triangle (asymmetric, pointing upward)
+ * 5: truncated triangle (asymmetric on two axes, last row halved)
+ * 6: even sized square (asymmetric on two axes)
+ * 7: plus (symmetrical)
+ * 8: plus X (symmetrical)
  */
+#ifdef LUMA_raw
+#define RS 3
+#define PS 4
+#else
+#define RS 3
+#define PS 3
+#endif
 
-// Description: guided.glsl: Guided by the downscaled image
+/* Robust filtering
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
+ *
+ * Computes weights on a guide, which could be a downscaled image or the output 
+ * of another shader, and applies the weights to the original image
+ */
+#define RF_LUMA 0
+#define RF 0
 
-/* The radius can be adjusted with the MEANI stage's downscaling factor. 
- * Higher numbers give a bigger radius.
+/* Rotational/reflectional invariance
+ *
+ * Number of rotations/reflections to try for each patch comparison. Can be 
+ * slow, but improves feature preservation. More rotations/reflections gives 
+ * diminishing returns. The most similar rotation/reflection will be used.
  *
- * The E variable can be found in the A stage.
+ * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
+ * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
  *
- * The subsampling (fast guided filter) can be adjusted with the I stage's 
- * downscaling factor. Higher numbers are faster.
+ * Consider setting SAMPLE=1 if setting RI to a setting that would require 
+ * sampling between pixels.
  *
- * The guide's subsampling can be adjusted with the PREI stage's downscaling 
- * factor. Higher numbers downscale more.
+ * RI: Rotational invariance
+ * RFI (0 to 2): Reflectional invariance
  */
+#ifdef LUMA_raw
+#define RI 0
+#define RFI 0
+#else
+#define RI 0
+#define RFI 0
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!BIND HOOKED
-//!WIDTH HOOKED.w 1.25 /
-//!HEIGHT HOOKED.h 1.25 /
-//!DESC Guided filter (PREI)
-//!SAVE _INJ_PREI
+/* Temporal denoising
+ *
+ * This setting is dependent on code generation from shader_cfg, so this 
+ * setting can only be enabled via shader_cfg.
+ *
+ * Caveats:
+ * 	 - Slower:
+ * 	 	 - Each frame needs to be researched (more samples & more math)
+ * 	 	 - Gather optimizations only apply to the current frame
+ * 	 - Requires vo=gpu-next
+ * 	 - Luma-only (this is a bug)
+ * 	 - Buggy
+ *
+ * May cause motion blur and may struggle more with noise that persists across 
+ * multiple frames (e.g., from compression or duplicate frames), but can work 
+ * very well on high quality video.
+ *
+ * Motion estimation (ME) should improve quality without impacting speed.
+ *
+ * T: number of frames used
+ * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg
+ * MEF: estimate factor, compensates for ME being one frame behind
+ * TRF: compare against the denoised frames
+ */
+#ifdef LUMA_raw
+#define T 0
+#define ME 1
+#define MEF 2
+#define TRF 0
+#else
+#define T 0
+#define ME 0
+#define MEF 2
+#define TRF 0
+#endif
+
+/* Spatial kernel
+ *
+ * Increasing the spatial denoising factor (SS) reduces the weight of further 
+ * pixels.
+ *
+ * Spatial distortion instructs the spatial kernel to view that axis as 
+ * closer/further, for instance SD=(1,1,0.5) would make the temporal axis 
+ * appear closer and increase blur between frames.
+ *
+ * The intra-patch variants are supposed to help with larger patch sizes.
+ *
+ * SST: enables spatial kernel if R>=SST, 0 fully disables
+ * SS: spatial sigma
+ * SD: spatial distortion (X, Y, time)
+ * PSS: intra-patch spatial sigma
+ * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables
+ * PSD: intra-patch spatial distortion (X, Y)
+ */
+#ifdef LUMA_raw
+#define SST 1
+#define SS 0.49764743714339127
+#define SD vec3(1,1,1)
+#define PST 0
+#define PSS 0.0
+#define PSD vec2(1,1)
+#else
+#define SST 1
+#define SS 0.32091162692066677
+#define SD vec3(1,1,1)
+#define PST 0
+#define PSS 0.0
+#define PSD vec2(1,1)
+#endif
+
+/* Kernels
+ *
+ * SK: spatial kernel
+ * RK: range kernel (takes patch differences)
+ * PSK: intra-patch spatial kernel
+ * WDK: weight discard kernel
+ * WD1TK (WD=1 only): weight discard tolerance kernel
+ *
+ * List of available kernels:
+ *
+ * bicubic
+ * cos
+ * gaussian
+ * lanczos
+ * quadratic_ (unclamped)
+ * sinc
+ * sinc_ (unclamped)
+ * sinc3
+ * sphinx
+ * sphinx_ (unclamped)
+ * triangle_ (unclamped)
+ * triangle
+ */
+#ifdef LUMA_raw
+#define SK gaussian
+#define RK gaussian
+#define PSK gaussian
+#define WDK is_zero
+#define WD1TK gaussian
+#else
+#define SK gaussian
+#define RK gaussian
+#define PSK gaussian
+#define WDK is_zero
+#define WD1TK gaussian
+#endif
+
+/* Sampling method
+ *
+ * In most cases this shouldn't make any difference, only set to bilinear if 
+ * it's necessary to sample between pixels (e.g., RI=2).
+ *
+ * 0: nearest neighbor
+ * 1: bilinear
+ */
+#ifdef LUMA_raw
+#define SAMPLE 0
+#else
+#define SAMPLE 0
+#endif
+
+/* Research scaling factor
+ *
+ * Higher numbers sample more sparsely as the distance from the POI grows.
+ */
+#ifdef LUMA_raw
+#define RSF 0.0
+#else
+#define RSF 0.0
+#endif
+
+// Scaling factor (should match WIDTH/HEIGHT)
+#ifdef LUMA_raw
+#define SF 1
+#else
+#define SF 1
+#endif
+
+// Use the guide image as the input image
+#ifdef LUMA_raw
+#define GUIDE_INPUT 0
+#else
+#define GUIDE_INPUT 0
+#endif
+
+/* Visualization
+ *
+ * 0: off
+ * 1: absolute difference between input/output to the power of 0.25
+ * 2: difference between input/output centered on 0.5
+ * 3: post-WD weight map
+ * 4: pre-WD weight map
+ * 5: unsharp mask
+ * 6: EP
+ */
+#ifdef LUMA_raw
+#define V 0
+#else
+#define V 0
+#endif
+
+// Blur factor (0.0 returns the input image, 1.0 returns the output image)
+#ifdef LUMA_raw
+#define BF 1.0
+#else
+#define BF 1.0
+#endif
+
+// Force disable textureGather
+#ifdef LUMA_raw
+#define NG 0
+#else
+#define NG 0
+#endif
+
+// Patch donut (probably useless)
+#ifdef LUMA_raw
+#define PD 0
+#else
+#define PD 0
+#endif
+
+// Duplicate 1st weight (for luma-guided-chroma)
+#ifdef LUMA_raw
+#define D1W 0
+#else
+#define D1W 0
+#endif
+
+// Skip patch comparison
+#ifdef LUMA_raw
+#define SKIP_PATCH 0
+#else
+#define SKIP_PATCH 0
+#endif
+
+// Shader code
+
+#define EPSILON 1.2e-38
+#define M_PI 3.14159265358979323846
+#define POW2(x) ((x)*(x))
+#define POW3(x) ((x)*(x)*(x))
+#define bicubic_(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0))))
+#define bicubic(x) bicubic_(clamp((x), 0.0, 2.0))
+#define gaussian(x) exp(-1 * POW2(x))
+#define quadratic_(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5))
+#define quadratic(x) quadratic_(clamp((x), 0.0, 1.5))
+#define sinc_(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI))
+#define sinc(x) sinc_(clamp((x), 0.0, 1.0))
+#define sinc3(x) sinc_(clamp((x), 0.0, 3.0))
+#define lanczos(x) (sinc3(x) * sinc(x))
+#define sphinx_(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI))
+#define sphinx(x) sphinx_(clamp((x), 0.0, 1.4302966531242027))
+#define triangle_(x) (1 - (x))
+#define triangle(x) triangle_(clamp((x), 0.0, 1.0))
+#define is_zero(x) int(x == 0)
+
+// XXX could maybe be better optimized on LGC
+#if defined(LUMA_raw)
+#define val float
+#define val_swizz(v) (v.x)
+#define unval(v) vec4(v.x, 0, 0, poi_.a)
+#define val_packed val
+#define val_pack(v) (v)
+#define val_unpack(v) (v)
+#elif defined(CHROMA_raw)
+#define val vec2
+#define val_swizz(v) (v.xy)
+#define unval(v) vec4(v.x, v.y, 0, poi_.a)
+#define val_packed uint
+#define val_pack(v) packUnorm2x16(v)
+#define val_unpack(v) unpackUnorm2x16(v)
+#else
+#define val vec3
+#define val_swizz(v) (v.xyz)
+#define unval(v) vec4(v.x, v.y, v.z, poi_.a)
+#define val_packed val
+#define val_pack(v) (v)
+#define val_unpack(v) (v)
+#endif
+
+#if PS == 6
+const int hp = P/2; 
+#else
+const float hp = int(P/2) - 0.5*(1-(P%2));  // sample between pixels for even patch sizes
+#endif
+
+#if RS == 6
+const int hr = R/2; 
+#else
+const float hr = int(R/2) - 0.5*(1-(R%2));  // sample between pixels for even research sizes
+#endif
+
+// patch/research shapes
+// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R)
+// dots (.) represent samples (pixels) and X represents the pixel-of-interest
+
+// Z    .....
+// Z    .....
+// Z    ..X..
+// Z    .....
+// Z    .....
+#define S_SQUARE(z,hz,incr) for (z.x = -hz;  z.x <= hz;  z.x++) for (z.y = -hz;  z.y <= hz;  incr)
+
+// (in this instance Z=4)
+// Z    ....
+// Z    ....
+// Z    ..X.
+// Z    ....
+#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz;  z.x < hz;  z.x++) for (z.y = -hz;  z.y < hz;  incr)
+
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
+#define S_TRIANGLE(z,hz,incr) for (z.y = -hz;  z.y <= 0;  z.y++) for (z.x = -abs(abs(z.y) - hz);  z.x <= abs(abs(z.y) - hz);  incr)
+
+// Z-4    .
+// Z-2   ...
+// hz+1 ..X
+#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz;  z.y <= 0;  z.y++) for (z.x = -abs(abs(z.y) - hz);  z.x <= abs(abs(z.y) - hz)*int(z.y!=0);  incr)
+#define S_TRIANGLE_A(hz,Z) int(hz*hz+Z)
+
+// Z-4    .
+// Z-2   ...
+// Z    ..X..
+// Z-2   ...
+// Z-4    .
+#define S_DIAMOND(z,hz,incr) for (z.x = -hz;  z.x <= hz;  z.x++) for (z.y = -abs(abs(z.x) - hz);  z.y <= abs(abs(z.x) - hz);  incr)
+#define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z)
+
+//
+// Z    ..X..
+//
+#define S_HORIZONTAL(z,hz,incr) for (z.y = 0;  z.y <= 0;  z.y++) for (z.x = -hz;  z.x <= hz;  incr)
+
+// 90 degree rotation of S_HORIZONTAL
+#define S_VERTICAL(z,hz,incr) for (z.x = 0;  z.x <= 0;  z.x++) for (z.y = -hz;  z.y <= hz;  incr)
+
+// 1      .
+// 1      . 
+// Z    ..X..
+// 1      . 
+// 1      .
+#define S_PLUS(z,hz,incr) for (z.x = -hz;  z.x <= hz;  z.x++) for (z.y = -hz * int(z.x == 0);  z.y <= hz * int(z.x == 0);  incr)
+#define S_PLUS_A(hz,Z) (Z*2 - 1)
+
+// 3    . . .
+// 3     ...
+// Z    ..X..
+// 3     ...
+// 3    . . .
+#define S_PLUS_X(z,hz,incr) for (z.x = -hz;  z.x <= hz;  z.x++) for (z.y = -abs(z.x) + -hz * int(z.x == 0);  z.y <= abs(z.x) + hz * int(z.x == 0);  incr)
+#define S_PLUS_X_A(hz,Z) (Z*4 - 3)
+
+// 1x1 square
+#define S_1X1(z) for (z = vec3(0);  z.x <= 0;  z.x++)
+
+#define T1 (T+1)
+#define FOR_FRAME(r) for (r.z = 0;  r.z < T1;  r.z++)
+
+#ifdef LUMA_raw
+#define RF_ RF_LUMA
+#else
+#define RF_ RF
+#endif
+
+// donut increment, increments without landing on (0,0,0)
+// much faster than a continue statement
+#define DINCR(z,c,a) ((z.c += a),(z.c += int(z == vec3(0))))
+
+#define R_AREA(a) (a * T1 - 1)
+
+// research shapes
+// XXX would be nice to have the option of temporally-varying research sizes
+#if R == 0 || R == 1
+#define FOR_RESEARCH(r) S_1X1(r)
+const int r_area = R_AREA(1); 
+#elif RS == 8
+#define FOR_RESEARCH(r) S_PLUS_X(r,hr,DINCR(r,y,max(1,abs(r.x))))
+const int r_area = R_AREA(S_PLUS_X_A(hr,R)); 
+#elif RS == 7
+#define FOR_RESEARCH(r) S_PLUS(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(S_PLUS_A(hr,R)); 
+#elif RS == 6
+#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(R*R); 
+#elif RS == 5
+#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,DINCR(r,x,1))
+const int r_area = R_AREA(S_TRIANGLE_A(hr,hr)); 
+#elif RS == 4
+#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,DINCR(r,x,1))
+const int r_area = R_AREA(S_TRIANGLE_A(hr,R)); 
+#elif RS == 3
+#define FOR_RESEARCH(r) S_DIAMOND(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(S_DIAMOND_A(hr,R)); 
+#elif RS == 2
+#define FOR_RESEARCH(r) S_VERTICAL(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(R); 
+#elif RS == 1
+#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,DINCR(r,x,1))
+const int r_area = R_AREA(R); 
+#elif RS == 0
+#define FOR_RESEARCH(r) S_SQUARE(r,hr,DINCR(r,y,1))
+const int r_area = R_AREA(R*R); 
+#endif
+
+#define RI1 (RI+1)
+#define RFI1 (RFI+1)
+
+#if RI
+#define FOR_ROTATION for (float ri = 0;  ri < 360;  ri+=360.0/RI1)
+#else
+#define FOR_ROTATION
+#endif
+
+#if RFI
+#define FOR_REFLECTION for (int rfi = 0;  rfi < RFI1;  rfi++)
+#else
+#define FOR_REFLECTION
+#endif
+
+#if PD
+#define PINCR DINCR
+#else
+#define PINCR(z,c,a) (z.c += a)
+#endif
+
+#define P_AREA(a) (a - PD)
+
+// patch shapes
+#if P == 0 || P == 1
+#define FOR_PATCH(p) S_1X1(p)
+const int p_area = P_AREA(1); 
+#elif PS == 8
+#define FOR_PATCH(p) S_PLUS_X(p,hp,PINCR(p,y,max(1,abs(p.x))))
+const int p_area = P_AREA(S_PLUS_X_A(hp,P)); 
+#elif PS == 7
+#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(S_PLUS_A(hp,P)); 
+#elif PS == 6
+#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(P*P); 
+#elif PS == 5
+#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x,1))
+const int p_area = P_AREA(S_TRIANGLE_A(hp,hp)); 
+#elif PS == 4
+#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x,1))
+const int p_area = P_AREA(S_TRIANGLE_A(hp,P)); 
+#elif PS == 3
+#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(S_DIAMOND_A(hp,P)); 
+#elif PS == 2
+#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(P); 
+#elif PS == 1
+#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x,1))
+const int p_area = P_AREA(P); 
+#elif PS == 0
+#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y,1))
+const int p_area = P_AREA(P*P); 
+#endif
+
+const float r_scale = 1.0/r_area; 
+const float p_scale = 1.0/p_area; 
+
+#if SAMPLE == 0
+#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * (vec2(off) + 0.5 - fract((pos) * (size))))
+#else
+#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * vec2(off))
+#endif
+
+#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off)
+
+#if RF_ && defined(LUMA_raw)
+#define load2_(off) sample(RF_LUMA_tex, RF_LUMA_pos, RF_LUMA_size, RF_LUMA_pt, off)
+#define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr)))
+#define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0)
+#elif RF_ && D1W
+#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
+#define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr)))
+#define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0)
+#elif RF_
+#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off)
+#else
+#define load2_(off) load_(off)
+#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr)))
+#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0)
+#endif
+
+#if T
+val load(vec3 off)
+{
+	 switch (min(int(off.z), frame)) {
+	 case 0: return val_swizz(load_(off)); 
+
+	 }
+}
+val load2(vec3 off)
+{
+	 return off.z == 0 ? val_swizz(load2_(off)) : load(off); 
+}
+#else
+#define load(off) val_swizz(load_(off))
+#define load2(off) val_swizz(load2_(off))
+#endif
+
+val poi2 = load2(vec3(0));  // guide pixel-of-interest
+#if GUIDE_INPUT
+#define poi poi2
+#else
+vec4 poi_ = load_(vec3(0)); 
+val poi = val_swizz(poi_);  // pixel-of-interest
+#endif
+
+#if RI // rotation
+vec2 rot(vec2 p, float d)
+{
+	 // Rotate p about the origin by d degrees using the standard 2D rotation
+	 // matrix: x' = x*cos(d) - y*sin(d),  y' = x*sin(d) + y*cos(d)
+	 float s = sin(radians(d)), c = cos(radians(d)); 
+	 return vec2(p.x * c - p.y * s, p.x * s + p.y * c); 
+}
+#else
+#define rot(p, d) (p)
+#endif
+
+#if RFI // reflection
+vec2 ref(vec2 p, int d)  // reflect p: d=1 negates y, d=2 negates x, otherwise identity
+{
+	 switch (d) {
+	 case 1: return p * vec2(1, -1); 
+	 case 2: return p * vec2(-1, 1); 
+	 default: return p;  // covers d == 0 and guarantees a return on every path
+	 }
+}
+#else
+#define ref(p, d) (p)
+#endif
+
+#if SST && R >= SST
+float spatial_r(vec3 v)
+{
+	 v.xy += 0.5 - fract(HOOKED_pos*HOOKED_size); 
+	 return SK(length(v*SD)*SS); 
+}
+#else
+#define spatial_r(v) (1)
+#endif
+
+#if PST && P >= PST
+#define spatial_p(v) PSK(length(v*PSD)*PSS)
+#else
+#define spatial_p(v) (1)
+#endif
+
+val range(val pdiff_sq)
+{
+	 const float h = max(S, 0.0) * 0.013; 
+	 const float pdiff_scale = 1.0/(h*h); 
+	 pdiff_sq = sqrt(pdiff_sq * pdiff_scale); 
+#if defined(LUMA_raw)
+	 return RK(pdiff_sq); 
+#elif defined(CHROMA_raw)
+	 return vec2(RK(pdiff_sq.x), RK(pdiff_sq.y)); 
+#else
+	 return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z)); 
+#endif
+}
 
-vec4 hook()
+val patch_comparison(vec3 r, vec3 r2)
 {
-	 return HOOKED_texOff(0); 
+	 vec3 p; 
+	 val min_rot = val(p_area); 
+
+	 FOR_ROTATION FOR_REFLECTION {
+	 	 val pdiff_sq = val(0); 
+	 	 FOR_PATCH(p) {
+	 	 	 vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z); 
+	 	 	 val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF); 
+	 	 	 diff_sq *= diff_sq; 
+	 	 	 diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy); 
+	 	 	 pdiff_sq += diff_sq; 
+	 	 }
+	 	 min_rot = min(min_rot, pdiff_sq); 
+	 }
+
+	 return min_rot * p_scale; 
 }
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!BIND _INJ_PREI
-//!WIDTH HOOKED.w
-//!HEIGHT HOOKED.h
-//!DESC Guided filter (I)
-//!SAVE _INJ_I
+#define NO_GATHER (PD == 0 && NG == 0 && SAMPLE == 0) // never textureGather if any of these conditions are false
+#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3 || RI == 7)
 
-vec4 hook()
+#if (defined(LUMA_gather) || D1W) && ((PS == 0 || ((PS == 3 || PS == 7) && RI != 7) || PS == 8) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER
+// 3x3 diamond/plus patch_comparison_gather
+// XXX extend to support arbitrary sizes (probably requires code generation)
+// XXX support PSS
+const ivec2 offsets_adj[4] = { ivec2(0,-1), ivec2(1,0), ivec2(0,1), ivec2(-1,0) }; 
+const ivec2 offsets_adj_sf[4] = { ivec2(0,-1) * SF, ivec2(1,0) * SF, ivec2(0,1) * SF, ivec2(-1,0) * SF }; 
+vec4 poi_patch_adj = gather_offs(0, offsets_adj); 
+#if PS == 0 || PS == 8
+const ivec2 offsets_diag[4] = { ivec2(-1,-1), ivec2(1,-1), ivec2(1,1), ivec2(-1,1) }; 
+const ivec2 offsets_diag_sf[4] = { ivec2(-1,-1) * SF, ivec2(1,-1) * SF, ivec2(1,1) * SF, ivec2(-1,1) * SF }; 
+vec4 poi_patch_diag = gather_offs(0, offsets_diag); 
+#endif
+float patch_comparison_gather(vec3 r, vec3 r2)
 {
-return _INJ_PREI_texOff(0);
-}
-
+	 float min_rot = p_area - 1; 
+	 vec4 transformer_adj = gather_offs(r, offsets_adj_sf); 
+#if PS == 0 || PS == 8
+	 vec4 transformer_diag = gather_offs(r, offsets_diag_sf); 
+#endif
+	 FOR_ROTATION {
+	 	 FOR_REFLECTION {
+#if RFI
+	 	 	 /* xxy
+	 	 	  * w y
+	 	 	  * wzz
+	 	 	  */
+	 	 	 switch(rfi) {
+	 	 	 case 1:
+	 	 	 	 transformer_adj = transformer_adj.zyxw; 
+#if PS == 0 || PS == 8
+	 	 	 	 transformer_diag = transformer_diag.zyxw; 
+#endif
+	 	 	 	 break; 
+	 	 	 case 2:
+	 	 	 	 transformer_adj = transformer_adj.xwzy; 
+#if PS == 0 || PS == 8
+	 	 	 	 transformer_diag = transformer_diag.xwzy; 
+#endif
+	 	 	 	 break; 
+	 	 	 }
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (P)
-//!BIND HOOKED
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_P
+	 	 	 vec4 diff = poi_patch_adj - transformer_adj; 
+#if PS == 0 || PS == 8
+	 	 	 diff += poi_patch_diag - transformer_diag; 
+#endif
+	 	 	 float diff_sq = dot(diff * diff, vec4(1)); 
+	 	 	 min_rot = min(diff_sq, min_rot); 
 
-vec4 hook()
+// un-reflect
+#if RFI
+	 	 	 switch(rfi) {
+	 	 	 case 1:
+	 	 	 	 transformer_adj = transformer_adj.zyxw; 
+#if PS == 0 || PS == 8
+	 	 	 	 transformer_diag = transformer_diag.zyxw; 
+#endif
+	 	 	 	 break; 
+	 	 	 case 2:
+	 	 	 	 transformer_adj = transformer_adj.xwzy; 
+#if PS == 0 || PS == 8
+	 	 	 	 transformer_diag = transformer_diag.xwzy; 
+#endif
+	 	 	 	 break; 
+	 	 	 }
+#endif
+	 	 } // FOR_REFLECTION
+#if RI == 7
+	 	 transformer_adj = transformer_adj.wxyz; 
+	 	 // swap adjacents for diagonals
+	 	 transformer_adj += transformer_diag; 
+	 	 transformer_diag = transformer_adj - transformer_diag; 
+	 	 transformer_adj -= transformer_diag; 
+#elif RI == 3
+	 	 transformer_adj = transformer_adj.wxyz; 
+#elif RI == 1
+	 	 transformer_adj = transformer_adj.zwxy; 
+#endif
+#if RI == 3 && (PS == 0 || PS == 8)
+	 	 transformer_diag = transformer_diag.wxyz; 
+#elif RI == 1 && (PS == 0 || PS == 8)
+	 	 transformer_diag = transformer_diag.zwxy; 
+#endif
+	 } // FOR_ROTATION
+	 float center_diff = poi2.x - load2(r).x; 
+	 return (center_diff * center_diff + min_rot) * p_scale; 
+}
+#elif (defined(LUMA_gather) || D1W) && PS == 4 && P == 3 && RI == 0 && RFI == 0 && NO_GATHER
+const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,0), ivec2(1,0) }; 
+const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,0) * SF, ivec2(1,0) * SF }; 
+vec4 poi_patch = gather_offs(0, offsets); 
+float patch_comparison_gather(vec3 r, vec3 r2)
 {
-	 return HOOKED_texOff(0); 
+	 vec4 pdiff = poi_patch - gather_offs(r, offsets_sf); 
+	 return dot(pdiff * pdiff, vec4(1)) * p_scale; 
 }
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (MEANI)
-//!BIND _INJ_I
-//!WIDTH _INJ_I.w 1.5 /
-//!HEIGHT _INJ_I.h 1.5 /
-//!SAVE _INJ_MEANI
-
-vec4 hook()
+#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER
+// tiled even square patch_comparison_gather
+// XXX extend to support odd square?
+float patch_comparison_gather(vec3 r, vec3 r2)
 {
-return _INJ_I_texOff(0);
+	 vec2 tile; 
+	 float min_rot = p_area; 
+
+	 /* gather order:
+	  * w z
+	  * x y
+	  */
+	 float pdiff_sq = 0; 
+	 for (tile.x = -hp;  tile.x < hp;  tile.x+=2) for (tile.y = -hp;  tile.y < hp;  tile.y+=2) {
+	 	 vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy); 
+	 	 diff_sq *= diff_sq; 
+	 	 diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)),
+	 	 	                                  spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0))); 
+	 	 pdiff_sq += dot(diff_sq, vec4(1)); 
+	 }
+	 min_rot = min(min_rot, pdiff_sq); 
+
+	 return min_rot * p_scale; 
 }
-
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (MEANP)
-//!BIND _INJ_P
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_MEANP
+#else
+#define patch_comparison_gather patch_comparison
+#endif
 
 vec4 hook()
 {
-return _INJ_P_texOff(0);
-}
+	 val total_weight = val(0); 
+	 val sum = val(0); 
+	 val result = val(0); 
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (_INJ_I_SQ)
-//!BIND _INJ_I
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_I_SQ
+	 vec3 r = vec3(0); 
+	 vec3 p = vec3(0); 
+	 vec3 me = vec3(0); 
 
-vec4 hook()
-{
-return _INJ_I_texOff(0) * _INJ_I_texOff(0);
-}
+#if T && ME == 1 // temporal & motion estimation
+	 vec3 me_tmp = vec3(0); 
+	 float maxweight = 0; 
+#elif T && ME == 2 // temporal & motion estimation
+	 vec3 me_sum = vec3(0); 
+	 float me_weight = 0; 
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (_INJ_IXP)
-//!BIND _INJ_I
-//!BIND _INJ_P
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_IXP
+#if AS
+	 val total_weight_s = val(0); 
+	 val sum_s = val(0); 
+#endif
 
-vec4 hook()
-{
-return _INJ_I_texOff(0) * _INJ_P_texOff(0);
-}
+#if WD == 2 // weight discard (mean)
+	 int r_index = 0; 
+	 val_packed all_weights[r_area]; 
+	 val_packed all_pixels[r_area]; 
+#elif WD == 1 // weight discard (moving cumulative average)
+	 int r_iter = 1; 
+	 val wd_total_weight = val(0); 
+	 val wd_sum = val(0); 
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (CORRI)
-//!BIND _INJ_I_SQ
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_CORRI
+	 FOR_FRAME(r) {
+	 // XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
+#if T && ME == 1 // temporal & motion estimation max weight
+	 if (r.z > 0) {
+	 	 me += me_tmp * MEF; 
+	 	 me_tmp = vec3(0); 
+	 	 maxweight = 0; 
+	 }
+#elif T && ME == 2 // temporal & motion estimation weighted average
+	 if (r.z > 0) {
+	 	 me += round(me_sum / me_weight * MEF); 
+	 	 me_sum = vec3(0); 
+	 	 me_weight = 0; 
+	 }
+#endif
+	 FOR_RESEARCH(r) {
+	 	 // r coords with appropriate transformations applied
+	 	 vec3 tr = vec3(r.xy + floor(r.xy * RSF), r.z); 
+	 	 float spatial_weight = spatial_r(tr); 
+	 	 tr.xy += me.xy; 
 
-vec4 hook()
-{
-return _INJ_I_SQ_texOff(0);
-}
+	 	 val px = load(tr); 
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (CORRP)
-//!BIND _INJ_IXP
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_CORRP
+#if SKIP_PATCH
+	 	 val weight = val(1); 
+#else
+	 	 val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(tr, vec3(0))) : patch_comparison(tr, vec3(0)); 
+	 	 val weight = range(pdiff_sq); 
+#endif
 
-vec4 hook()
-{
-return _INJ_IXP_texOff(0);
-}
+#if T && ME == 1 // temporal & motion estimation max weight
+	 	 me_tmp = vec3(tr.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x)); 
+	 	 maxweight = max(maxweight, weight.x); 
+#elif T && ME == 2 // temporal & motion estimation weighted average
+	 	 me_sum += vec3(tr.xy,0) * weight.x; 
+	 	 me_weight += weight.x; 
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (A)
-//!BIND _INJ_MEANI
-//!BIND _INJ_MEANP
-//!BIND _INJ_CORRI
-//!BIND _INJ_CORRP
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_A
+#if D1W
+	 	 weight = val(weight.x); 
+#endif
 
-#define E 0.0013
+	 	 weight *= spatial_weight; 
 
-vec4 hook()
-{
-vec4 var = _INJ_CORRI_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANI_texOff(0);
-vec4 cov = _INJ_CORRP_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANP_texOff(0);
-	 return cov / (var + E); 
-}
+#if AS
+	 	 spatial_weight *= int(r.z == 0);  // ignore temporal
+	 	 sum_s += px * spatial_weight; 
+	 	 total_weight_s += spatial_weight; 
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (B)
-//!BIND _INJ_A
-//!BIND _INJ_MEANI
-//!BIND _INJ_MEANP
-//!WIDTH _INJ_I.w
-//!HEIGHT _INJ_I.h
-//!SAVE _INJ_B
+#if WD == 2 // weight discard (mean)
+	 	 all_weights[r_index] = val_pack(weight); 
+	 	 all_pixels[r_index] = val_pack(px); 
+	 	 r_index++; 
+#elif WD == 1 // weight discard (moving cumulative average)
+	 	 val wd_scale = val(1.0/r_iter); // reciprocal of the 1-based sample counter
+	 	 val below_threshold = WDS * abs(min(val(0.0), weight - (total_weight * wd_scale * WDT * WD1TK(sqrt(wd_scale*WDP))))); // 0 unless weight is under the running threshold
+#if defined(LUMA_raw)
+	 	 val wdkf = WDK(below_threshold); 
+#elif defined(CHROMA_raw)
+	 	 val wdkf = vec2(WDK(below_threshold.x), WDK(below_threshold.y)); 
+#else
+	 	 val wdkf = vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); // one discard factor per channel
+#endif
+	 	 wd_sum += px * weight * wdkf; 
+	 	 wd_total_weight += weight * wdkf; 
+	 	 r_iter++; 
+#endif
 
-vec4 hook()
-{
-return _INJ_MEANP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANI_texOff(0);
-}
+	 	 sum += px * weight; 
+	 	 total_weight += weight; 
+	 } // FOR_RESEARCH
+	 } // FOR_FRAME
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (MEANA)
-//!BIND _INJ_A
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_MEANA
+	 val avg_weight = total_weight * r_scale; 
+	 val old_avg_weight = avg_weight; 
 
-vec4 hook()
-{
-return _INJ_A_texOff(0);
-}
+#if WD == 2 // weight discard (mean)
+	 total_weight = val(0); 
+	 sum = val(0); 
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter (MEANB)
-//!BIND _INJ_B
-//!WIDTH _INJ_MEANI.w
-//!HEIGHT _INJ_MEANI.h
-//!SAVE _INJ_MEANB
+	 for (int i = 0;  i < r_area;  i++) {
+	 	 val weight = val_unpack(all_weights[i]); 
+	 	 val px = val_unpack(all_pixels[i]); 
 
-vec4 hook()
-{
-return _INJ_B_texOff(0);
-}
+	 	 val below_threshold = WDS * abs(min(val(0.0), weight - (avg_weight * WDT))); 
+#if defined(LUMA_raw)
+	 	 weight *= WDK(below_threshold); 
+#elif defined(CHROMA_raw)
+	 	 weight *= vec2(WDK(below_threshold.x), WDK(below_threshold.y)); 
+#else
+	 	 weight *= vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); 
+#endif
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!DESC Guided filter
-//!BIND HOOKED
-//!BIND _INJ_MEANA
-//!BIND _INJ_MEANB
-//!SAVE RF_LUMA
+	 	 sum += px * weight; 
+	 	 total_weight += weight; 
+	 }
+#elif WD == 1 // weight discard (moving cumulative average)
+	 total_weight = wd_total_weight; 
+	 sum = wd_sum; 
+#endif
+#if WD // weight discard
+	 avg_weight = total_weight * r_scale; 
+#endif
 
-vec4 hook()
-{
-return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0);
+	 total_weight += SW * spatial_r(vec3(0)); 
+	 sum += poi * SW * spatial_r(vec3(0)); 
+	 result = val(sum / total_weight); 
+
+	 // store frames for temporal
+#if T > 1
+
+#endif
+#if T && TRF
+	 imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result)); 
+#elif T
+	 imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2)); 
+#endif
+
+#if AS == 1 // sharpen+denoise
+#define AS_base result
+#elif AS == 2 // sharpen only
+#define AS_base poi
+#endif
+#if AS
+	 val usm = result - sum_s/total_weight_s; 
+	 usm = exp(log(abs(usm))*ASP) * sign(usm);  // avoiding pow() since it's buggy on nvidia
+	 usm *= gaussian(abs((AS_base + usm - 0.5) / 1.5) * ASA); 
+	 usm *= ASF; 
+	 result = AS_base + usm; 
+#endif
+
+#if EP // extremes preserve
+	 float luminance = EP_texOff(0).x; 
+	 // EPSILON is needed since pow(0,0) is undefined
+	 float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP)); 
+	 result = mix(poi, result, ep_weight); 
+#else
+	 float ep_weight = 0; 
+#endif
+
+#if V == 1
+	 result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0); 
+#elif V == 2
+	 result = (poi - result) * 0.5 + 0.5; 
+#elif V == 3 // post-WD weight map
+	 result = avg_weight; 
+#elif V == 4 // pre-WD edge map
+	 result = old_avg_weight; 
+#elif V == 5
+	 result = 0.5 + usm; 
+#elif V == 6
+	 result = val(1 - ep_weight); 
+#endif
+
+// XXX visualize chroma for these
+#if defined(CHROMA_raw) && (V == 3 || V == 4 || V == 6)
+	 return vec4(0.5); 
+#endif
+
+	 return unval(mix(poi, result, BF)); 
 }
 
-// End of source code injected from guided.glsl 
+// End of source code injected from ../LQ/nlmeans.glsl 
 
 //!HOOK LUMA
 //!HOOK CHROMA
@@ -328,25 +1150,11 @@ vec4 hook()
 	return RF_LUMA_texOff(0);
 }
 
-//!HOOK LUMA
-//!HOOK CHROMA
-//!BIND LUMA
-//!WIDTH LUMA.w 3 /
-//!HEIGHT LUMA.h 3 /
-//!DESC Non-local means (EP)
-//!SAVE EP
-
-vec4 hook()
-{
-	return LUMA_texOff(0);
-}
-
 //!HOOK LUMA
 //!HOOK CHROMA
 //!BIND HOOKED
 //!BIND RF_LUMA
 //!BIND RF
-//!BIND EP
 //!BIND PREV1
 //!BIND PREV2
 //!DESC Non-local means (nlmeans_temporal.glsl)
@@ -358,49 +1166,37 @@ vec4 hook()
 
 // Denoising factor (level of blur, higher means more blur)
 #ifdef LUMA_raw
-#define S 2.0
+#define S 2.0522687499802097
 #else
-#define S 5.0
+#define S 2.5168955531436197
 #endif
 
 /* Adaptive sharpening
  *
- * Uses the blur incurred by denoising to perform an unsharp mask, and uses the 
- * weight map to restrict the sharpening to edges.
- *
- * If you just want to increase/decrease sharpness then you want to change ASF.
+ * Performs an unsharp mask by subtracting the spatial kernel's blur from the 
+ * NLM blur. For sharpen+denoise the sharpening is limited to edge areas and 
+ * denoising is done everywhere else.
  *
  * Use V=4 to visualize which areas are sharpened (black means sharpen).
  *
  * AS:
- * 	- 0 to disable
- * 	- 1 to sharpen+denoise
- * 	- 2 to sharpen only
+ * 	- 0: disable
+ * 	- 1: sharpen+denoise
+ * 	- 2: sharpen only
  * ASF: Higher numbers make a sharper image
- * ASP: Higher numbers use more of the sharp image
- * ASW:
- * 	- 0 to use pre-WD weights
- * 	- 1 to use post-WD weights (ASP should be ~2x to compensate)
- * ASK: Weight kernel:
- * 	- 0 for power. This is the old method.
- * 	- 1 for sigmoid. This is generally recommended.
- * 	- 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image)
- * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map
+ * ASA: Anti-ringing, higher numbers increase strength
+ * ASP: Power, lower numbers increase sharpening on lower frequency detail
  */
 #ifdef LUMA_raw
 #define AS 0
-#define ASF 3.0
-#define ASP 1.0
-#define ASW 0
-#define ASK 1
-#define ASC 0.0
+#define ASF 0.1625
+#define ASA 5.0
+#define ASP 0.5
 #else
 #define AS 0
-#define ASF 3.0
-#define ASP 1.0
-#define ASW 0
-#define ASK 1
-#define ASC 0.0
+#define ASF 0.1625
+#define ASA 5.0
+#define ASP 0.5
 #endif
 
 /* Starting weight
@@ -411,52 +1207,57 @@ vec4 hook()
  * EPSILON should be used instead of zero to avoid divide-by-zero errors.
  */
 #ifdef LUMA_raw
-#define SW 1.0
+#define SW 1.3011446081346498
 #else
-#define SW 0.5
+#define SW 1.2219854377433914
 #endif
 
 /* Weight discard
  *
- * Discard weights that fall below a fraction of the average weight. This culls 
- * the most dissimilar samples from the blur, yielding a much more pleasant 
- * result, especially around edges.
+ * Reduces weights that fall below a fraction of the average weight. This culls 
+ * the most dissimilar samples from the blur, which can yield a better result, 
+ * especially around edges.
  * 
  * WD:
- * 	- 2: True average. Better quality, but slower and requires GLSL 4.0 or later
- * 	- 1: Moving cumulative average. Inaccurate, tends to blur directionally.
+ * 	- 2: Mean. Better quality, but slower and requires GLSL 4.0 or later
+ * 	- 1: Moving cumulative average. Fast but inaccurate, blurs directionally.
  * 	- 0: Disable
  *
  * WDT: Threshold coefficient, higher numbers discard more
  * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes
+ * WDS (not for WDK=is_zero): Higher numbers are more eager to reduce weights
  */
 #ifdef LUMA_raw
 #define WD 1
-#define WDT 0.5
-#define WDP 6.0
+#define WDT 0.11671341022864548
+#define WDP 5.381278367349288
+#define WDS 1.0
 #else
-#define WD 2
-#define WDT 0.75
-#define WDP 6.0
+#define WD 0
+#define WDT 0.002713346103131793
+#define WDP 5.832936323930807
+#define WDS 1.0
 #endif
 
 /* Extremes preserve
  *
- * Reduces denoising around very bright/dark areas.
+ * Reduce denoising in very bright/dark areas.
+ *
+ * Disabled by default now. If you want to reenable this, set EP=3/ in 
+ * Makefile.nlm and rebuild.
  *
  * The downscaling factor of the EP shader stage affects what is considered a 
- * bright/dark area. The default of 3 should be fine, it's not recommended to 
- * change this.
+ * bright/dark area.
  *
  * This is incompatible with RGB. If you have RGB hooks enabled then you will 
  * have to delete the EP shader stage or specify EP=0 through shader_cfg.
  *
  * EP: 1 to enable, 0 to disable
- * DP: EP strength on dark patches, 0 to fully denoise
- * BP: EP strength on bright patches, 0 to fully denoise
+ * DP: EP strength on dark areas, 0 to fully denoise
+ * BP: EP strength on bright areas, 0 to fully denoise
  */
 #ifdef LUMA_raw
-#define EP 1
+#define EP 0
 #define BP 0.75
 #define DP 0.25
 #else
@@ -471,12 +1272,26 @@ vec4 hook()
 /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
 /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */
 
+/* textureGather applicable configurations:
+ *
+ * - PS={0,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2}
+ * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2}
+ * - PS=6:RI=0:RFI=0
+ *   - Currently the only scalable variant
+ *
+ * Options which always disable textureGather:
+ * 	- NG
+ * 	- SAMPLE
+ * 	- PD
+ *
+ * Running without textureGather may be much slower.
+ */
+
 /* Patch & research sizes
  *
- * Patch size should be an odd number greater than or equal to 3. Higher values 
- * are slower and not always better.
+ * P should be an odd number. Higher values are slower and not always better.
  *
- * Research size be an odd number greater than or equal to 3. Higher values are 
+ * R should be an odd number greater than or equal to 3. Higher values are 
  * generally better, but slower, blurrier, and gives diminishing returns.
  */
 #ifdef LUMA_raw
@@ -494,8 +1309,6 @@ vec4 hook()
  *
  * PS applies applies to patches, RS applies to research zones.
  *
- * Be wary of gather optimizations (see the Regarding Speed comment at the top)
- *
  * 0: square (symmetrical)
  * 1: horizontal line (asymmetric)
  * 2: vertical line (asymmetric)
@@ -504,6 +1317,7 @@ vec4 hook()
  * 5: truncated triangle (asymmetric on two axis, last row halved)
  * 6: even sized square (asymmetric on two axis)
  * 7: plus (symmetrical)
+ * 8: plus X (symmetrical)
  */
 #ifdef LUMA_raw
 #define RS 3
@@ -518,8 +1332,8 @@ vec4 hook()
  * This setting is dependent on code generation from shader_cfg, so this 
  * setting can only be enabled via shader_cfg.
  *
- * Compares the pixel-of-interest against a guide, which could be a downscaled 
- * image or the output of another shader
+ * Computes weights on a guide, which could be a downscaled image or the output 
+ * of another shader, and applies the weights to the original image
  */
 #define RF_LUMA 1
 #define RF 1
@@ -533,6 +1347,9 @@ vec4 hook()
  * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a 
  * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc.
  *
+ * Consider setting SAMPLE=1 if setting RI to a setting that would require 
+ * sampling between pixels.
+ *
  * RI: Rotational invariance
  * RFI (0 to 2): Reflectional invariance
  */
@@ -600,14 +1417,14 @@ vec4 hook()
  */
 #ifdef LUMA_raw
 #define SST 1
-#define SS 0.25
+#define SS 0.5296176863733414
 #define SD vec3(1,1,1)
 #define PST 0
 #define PSS 0.0
 #define PSD vec2(1,1)
 #else
 #define SST 1
-#define SS 0.25
+#define SS 0.26295970436981203
 #define SD vec3(1,1,1)
 #define PST 0
 #define PSS 0.0
@@ -619,6 +1436,8 @@ vec4 hook()
  * SK: spatial kernel
  * RK: range kernel (takes patch differences)
  * PSK: intra-patch spatial kernel
+ * WDK: weight discard kernel
+ * WD1TK (WD=1 only): weight discard tolerance kernel
  *
  * List of available kernels:
  *
@@ -626,18 +1445,51 @@ vec4 hook()
  * cos
  * gaussian
  * lanczos
- * quadratic
+ * quadratic_ (unclamped)
  * sinc
+ * sinc_ (unclamped)
+ * sinc3
  * sphinx
+ * sphinx_ (unclamped)
+ * triangle_ (unclamped)
+ * triangle
  */
 #ifdef LUMA_raw
 #define SK gaussian
 #define RK gaussian
 #define PSK gaussian
+#define WDK is_zero
+#define WD1TK gaussian
 #else
 #define SK gaussian
 #define RK gaussian
 #define PSK gaussian
+#define WDK is_zero
+#define WD1TK gaussian
+#endif
+
+/* Sampling method
+ *
+ * In most cases this shouldn't make any difference, only set to bilinear if 
+ * it's necessary to sample between pixels (e.g., RI=2).
+ *
+ * 0: nearest neighbor
+ * 1: bilinear
+ */
+#ifdef LUMA_raw
+#define SAMPLE 0
+#else
+#define SAMPLE 0
+#endif
+
+/* Research scaling factor
+ *
+ * Higher numbers sample more sparsely as the distance from the POI grows.
+ */
+#ifdef LUMA_raw
+#define RSF 0.0
+#else
+#define RSF 0.0
 #endif
 
 // Scaling factor (should match WIDTH/HEIGHT)
@@ -647,13 +1499,22 @@ vec4 hook()
 #define SF 1
 #endif
 
+// Use the guide image as the input image
+#ifdef LUMA_raw
+#define GUIDE_INPUT 0
+#else
+#define GUIDE_INPUT 0
+#endif
+
 /* Visualization
  *
  * 0: off
  * 1: absolute difference between input/output to the power of 0.25
  * 2: difference between input/output centered on 0.5
- * 3: avg_weight
- * 4: edge map (based on the relevant AS settings)
+ * 3: post-WD weight map
+ * 4: pre-WD weight map
+ * 5: unsharp mask
+ * 6: EP
  */
 #ifdef LUMA_raw
 #define V 0
@@ -698,37 +1559,44 @@ vec4 hook()
 
 // Shader code
 
-#define EPSILON 0.00000000001
+#define EPSILON 1.2e-38
 #define M_PI 3.14159265358979323846
 #define POW2(x) ((x)*(x))
 #define POW3(x) ((x)*(x)*(x))
-#define bicubic(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0))))
+#define bicubic_(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0))))
+#define bicubic(x) bicubic_(clamp((x), 0.0, 2.0))
 #define gaussian(x) exp(-1 * POW2(x))
-#define lanczos(x) POW2(sinc(x))
-#define quadratic(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5))
-#define sinc(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI))
-#define sphinx(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI))
+#define quadratic_(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5))
+#define quadratic(x) quadratic_(clamp((x), 0.0, 1.5))
+#define sinc_(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI))
+#define sinc(x) sinc_(clamp((x), 0.0, 1.0))
+#define sinc3(x) sinc_(clamp((x), 0.0, 3.0))
+#define lanczos(x) (sinc3(x) * sinc(x))
+#define sphinx_(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI))
+#define sphinx(x) sphinx_(clamp((x), 0.0, 1.4302966531242027))
+#define triangle_(x) (1 - (x))
+#define triangle(x) triangle_(clamp((x), 0.0, 1.0))
+#define is_zero(x) int((x) == 0) // 1 when x is exactly zero, otherwise 0; argument parenthesized like the other kernels
 
 // XXX could maybe be better optimized on LGC
-// XXX return original alpha component instead of 1.0
 #if defined(LUMA_raw)
 #define val float
 #define val_swizz(v) (v.x)
-#define unval(v) vec4(v.x, 0, 0, 1.0)
+#define unval(v) vec4(v.x, 0, 0, poi_.a)
 #define val_packed val
 #define val_pack(v) (v)
 #define val_unpack(v) (v)
 #elif defined(CHROMA_raw)
 #define val vec2
 #define val_swizz(v) (v.xy)
-#define unval(v) vec4(v.x, v.y, 0, 1.0)
+#define unval(v) vec4(v.x, v.y, 0, poi_.a)
 #define val_packed uint
 #define val_pack(v) packUnorm2x16(v)
 #define val_unpack(v) unpackUnorm2x16(v)
 #else
 #define val vec3
 #define val_swizz(v) (v.xyz)
-#define unval(v) vec4(v.x, v.y, v.z, 1.0)
+#define unval(v) vec4(v.x, v.y, v.z, poi_.a)
 #define val_packed val
 #define val_pack(v) (v)
 #define val_unpack(v) (v)
@@ -746,10 +1614,6 @@ const int hr = R/2;
 const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes
 #endif
 
-// donut increment, increments without landing on (0,0,0)
-// much faster than a continue statement
-#define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0))))
-
 // patch/research shapes
 // each shape is depicted in a comment, where Z=5 (Z corresponds to P or R)
 // dots (.) represent samples (pixels) and X represents the pixel-of-interest
@@ -790,7 +1654,7 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res
 //
 // Z    ..X..
 //
-#define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++)
+#define S_HORIZONTAL(z,hz,incr) for (z.y = 0; z.y <= 0; z.y++) for (z.x = -hz; z.x <= hz; incr)
 
 // 90 degree rotation of S_HORIZONTAL
 #define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr)
@@ -803,19 +1667,13 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res
 #define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr)
 #define S_PLUS_A(hz,Z) (Z*2 - 1)
 
-// XXX implement S_PLUS w/ an X overlayed:
 // 3    . . .
 // 3     ...
 // Z    ..X..
 // 3     ...
 // 3    . . .
-
-// XXX implement an X shape:
-// 2    .   .
-// 2     . .
-// 1      X  
-// 2     . .
-// 2    .   .
+#define S_PLUS_X(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(z.x) + -hz * int(z.x == 0); z.y <= abs(z.x) + hz * int(z.x == 0); incr)
+#define S_PLUS_X_A(hz,Z) (Z*4 - 3)
 
 // 1x1 square
 #define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++)
@@ -829,43 +1687,43 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res
 #define RF_ RF
 #endif
 
-// Skip comparing the pixel-of-interest against itself, unless RF is enabled
-#if RF_
-#define RINCR(z,c) (z.c++)
-#else
-#define RINCR DINCR
-#endif
+// donut increment, increments without landing on (0,0,0)
+// much faster than a continue statement
+#define DINCR(z,c,a) ((z.c += a),(z.c += int(z == vec3(0))))
 
-#define R_AREA(a) (a * T1 + RF_-1)
+#define R_AREA(a) (a * T1 - 1)
 
 // research shapes
 // XXX would be nice to have the option of temporally-varying research sizes
 #if R == 0 || R == 1
 #define FOR_RESEARCH(r) S_1X1(r)
 const int r_area = R_AREA(1);
+#elif RS == 8
+#define FOR_RESEARCH(r) S_PLUS_X(r,hr,DINCR(r,y,max(1,abs(r.x))))
+const int r_area = R_AREA(S_PLUS_X_A(hr,R));
 #elif RS == 7
-#define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y))
+#define FOR_RESEARCH(r) S_PLUS(r,hr,DINCR(r,y,1))
 const int r_area = R_AREA(S_PLUS_A(hr,R));
 #elif RS == 6
-#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y))
+#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,DINCR(r,y,1))
 const int r_area = R_AREA(R*R);
 #elif RS == 5
-#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x))
+#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,DINCR(r,x,1))
 const int r_area = R_AREA(S_TRIANGLE_A(hr,hr));
 #elif RS == 4
-#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x))
+#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,DINCR(r,x,1))
 const int r_area = R_AREA(S_TRIANGLE_A(hr,R));
 #elif RS == 3
-#define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y))
+#define FOR_RESEARCH(r) S_DIAMOND(r,hr,DINCR(r,y,1))
 const int r_area = R_AREA(S_DIAMOND_A(hr,R));
 #elif RS == 2
-#define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y))
+#define FOR_RESEARCH(r) S_VERTICAL(r,hr,DINCR(r,y,1))
 const int r_area = R_AREA(R);
 #elif RS == 1
-#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x))
+#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,DINCR(r,x,1))
 const int r_area = R_AREA(R);
 #elif RS == 0
-#define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y))
+#define FOR_RESEARCH(r) S_SQUARE(r,hr,DINCR(r,y,1))
 const int r_area = R_AREA(R*R);
 #endif
 
@@ -887,7 +1745,7 @@ const int r_area = R_AREA(R*R);
 #if PD
 #define PINCR DINCR
 #else
-#define PINCR(z,c) (z.c++)
+#define PINCR(z,c,a) (z.c += a)
 #endif
 
 #define P_AREA(a) (a - PD)
@@ -896,36 +1754,44 @@ const int r_area = R_AREA(R*R);
 #if P == 0 || P == 1
 #define FOR_PATCH(p) S_1X1(p)
 const int p_area = P_AREA(1);
+#elif PS == 8
+#define FOR_PATCH(p) S_PLUS_X(p,hp,PINCR(p,y,max(1,abs(p.x))))
+const int p_area = P_AREA(S_PLUS_X_A(hp,P));
 #elif PS == 7
-#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y))
+#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y,1))
 const int p_area = P_AREA(S_PLUS_A(hp,P));
 #elif PS == 6
-#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y))
+#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y,1))
 const int p_area = P_AREA(P*P);
 #elif PS == 5
-#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x))
+#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x,1))
 const int p_area = P_AREA(S_TRIANGLE_A(hp,hp));
 #elif PS == 4
-#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x))
+#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x,1))
 const int p_area = P_AREA(S_TRIANGLE_A(hp,P));
 #elif PS == 3
-#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y))
+#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y,1))
 const int p_area = P_AREA(S_DIAMOND_A(hp,P));
 #elif PS == 2
-#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y))
+#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y,1))
 const int p_area = P_AREA(P);
 #elif PS == 1
-#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x))
+#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x,1))
 const int p_area = P_AREA(P);
 #elif PS == 0
-#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y))
+#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y,1))
 const int p_area = P_AREA(P*P);
 #endif
 
 const float r_scale = 1.0/r_area;
 const float p_scale = 1.0/p_area;
 
-#define sample(tex, pos, size, pt, off) tex(pos + pt * (vec2(off) + 0.5 - fract(pos*size)))
+#if SAMPLE == 0
+#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * (vec2(off) + 0.5 - fract((pos) * (size))))
+#else
+#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * vec2(off))
+#endif
+
 #define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off)
 
 #if RF_ && defined(LUMA_raw)
@@ -962,8 +1828,13 @@ val load2(vec3 off)
 #define load2(off) val_swizz(load2_(off))
 #endif
 
-val poi = load(vec3(0)); // pixel-of-interest
 val poi2 = load2(vec3(0)); // guide pixel-of-interest
+vec4 poi_ = load_(vec3(0)); // raw input texel; always declared because unval() reads poi_.a
+#if GUIDE_INPUT
+#define poi poi2
+#else
+val poi = val_swizz(poi_); // pixel-of-interest
+#endif
 
 #if RI // rotation
 vec2 rot(vec2 p, float d)
@@ -1008,7 +1879,7 @@ float spatial_r(vec3 v)
 
 val range(val pdiff_sq)
 {
-	const float h = S*0.013;
+	const float h = max(S, 0.0) * 0.013;
 	const float pdiff_scale = 1.0/(h*h);
 	pdiff_sq = sqrt(pdiff_sq * pdiff_scale);
 #if defined(LUMA_raw)
@@ -1018,10 +1889,6 @@ val range(val pdiff_sq)
 #else
 	return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z));
 #endif
-	//return exp(-pdiff_sq * pdiff_scale);
-
-	// weight function from the NLM paper, it's not very good
-	//return exp(-max(pdiff_sq - 2*S*S, 0.0) * pdiff_scale);
 }
 
 val patch_comparison(vec3 r, vec3 r2)
@@ -1044,42 +1911,104 @@ val patch_comparison(vec3 r, vec3 r2)
 	return min_rot * p_scale;
 }
 
-#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false
-#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3)
+#define NO_GATHER (PD == 0 && NG == 0 && SAMPLE == 0) // never textureGather if any of these conditions are false
+#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3 || RI == 7)
 
-#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER
+#if (defined(LUMA_gather) || D1W) && ((PS == 0 || ((PS == 3 || PS == 7) && RI != 7) || PS == 8) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER
 // 3x3 diamond/plus patch_comparison_gather
 // XXX extend to support arbitrary sizes (probably requires code generation)
-// XXX extend to support 3x3 square
 // XXX support PSS
-const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) };
-const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF };
-vec4 poi_patch = gather_offs(0, offsets);
+const ivec2 offsets_adj[4] = { ivec2(0,-1), ivec2(1,0), ivec2(0,1), ivec2(-1,0) };
+const ivec2 offsets_adj_sf[4] = { ivec2(0,-1) * SF, ivec2(1,0) * SF, ivec2(0,1) * SF, ivec2(-1,0) * SF };
+vec4 poi_patch_adj = gather_offs(0, offsets_adj);
+#if PS == 0 || PS == 8
+const ivec2 offsets_diag[4] = { ivec2(-1,-1), ivec2(1,-1), ivec2(1,1), ivec2(-1,1) };
+const ivec2 offsets_diag_sf[4] = { ivec2(-1,-1) * SF, ivec2(1,-1) * SF, ivec2(1,1) * SF, ivec2(-1,1) * SF };
+vec4 poi_patch_diag = gather_offs(0, offsets_diag);
+#endif
 float patch_comparison_gather(vec3 r, vec3 r2)
 {
 	float min_rot = p_area - 1;
-	vec4 transformer = gather_offs(r, offsets_sf);
+	vec4 transformer_adj = gather_offs(r, offsets_adj_sf);
+#if PS == 0 || PS == 8
+	vec4 transformer_diag = gather_offs(r, offsets_diag_sf);
+#endif
 	FOR_ROTATION {
 		FOR_REFLECTION {
-			float diff_sq = dot((poi_patch - transformer) * (poi_patch - transformer), vec4(1));
+#if RFI
+			/* xxy
+			 * w y
+			 * wzz
+			 */
+			switch(rfi) {
+			case 1:
+				transformer_adj = transformer_adj.zyxw;
+#if PS == 0 || PS == 8
+				transformer_diag = transformer_diag.zyxw;
+#endif
+				break;
+			case 2:
+				transformer_adj = transformer_adj.xwzy;
+#if PS == 0 || PS == 8
+				transformer_diag = transformer_diag.xwzy;
+#endif
+				break;
+			}
+#endif
+
+			vec4 diff_sq4 = POW2(poi_patch_adj - transformer_adj); // per-texel squared differences (adjacent pixels)
+#if PS == 0 || PS == 8
+			diff_sq4 += POW2(poi_patch_diag - transformer_diag); // square diagonals on their own so opposite-signed errors cannot cancel
+#endif
+			float diff_sq = dot(diff_sq4, vec4(1)); // sum of squared differences over the gathered texels
 			min_rot = min(diff_sq, min_rot);
+
+// un-reflect
 #if RFI
 			switch(rfi) {
-			case 0: transformer = transformer.zyxw; break;
-			case 1: transformer = transformer.zwxy; break; // undoes last mirror, performs another mirror
-			case 2: transformer = transformer.zyxw; break; // undoes last mirror
+			case 1:
+				transformer_adj = transformer_adj.zyxw;
+#if PS == 0 || PS == 8
+				transformer_diag = transformer_diag.zyxw;
+#endif
+				break;
+			case 2:
+				transformer_adj = transformer_adj.xwzy;
+#if PS == 0 || PS == 8
+				transformer_diag = transformer_diag.xwzy;
+#endif
+				break;
 			}
 #endif
-		}
-#if RI == 3
-		transformer = transformer.wxyz;
+		} // FOR_REFLECTION
+#if RI == 7
+		transformer_adj = transformer_adj.wxyz;
+		// swap adjacents for diagonals
+		transformer_adj += transformer_diag;
+		transformer_diag = transformer_adj - transformer_diag;
+		transformer_adj -= transformer_diag;
+#elif RI == 3
+		transformer_adj = transformer_adj.wxyz;
 #elif RI == 1
-		transformer = transformer.zwxy;
+		transformer_adj = transformer_adj.zwxy;
 #endif
-	}
-	float center_diff_sq = poi2.x - load2(r).x;
-	center_diff_sq *= center_diff_sq;
-	return (min_rot + center_diff_sq) * p_scale;
+#if RI == 3 && (PS == 0 || PS == 8)
+		transformer_diag = transformer_diag.wxyz;
+#elif RI == 1 && (PS == 0 || PS == 8)
+		transformer_diag = transformer_diag.zwxy;
+#endif
+	} // FOR_ROTATION
+	float center_diff = poi2.x - load2(r).x;
+	return (center_diff * center_diff + min_rot) * p_scale;
+}
+#elif (defined(LUMA_gather) || D1W) && PS == 4 && P == 3 && RI == 0 && RFI == 0 && NO_GATHER
+const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,0), ivec2(1,0) };
+const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,0) * SF, ivec2(1,0) * SF };
+vec4 poi_patch = gather_offs(0, offsets);
+float patch_comparison_gather(vec3 r, vec3 r2)
+{
+	vec4 pdiff = poi_patch - gather_offs(r, offsets_sf);
+	return dot(pdiff * pdiff, vec4(1)) * p_scale;
 }
 #elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER
 // tiled even square patch_comparison_gather
@@ -1127,18 +2056,23 @@ vec4 hook()
 	float me_weight = 0;
 #endif
 
-#if WD == 2 // weight discard
+#if AS
+	val total_weight_s = val(0);
+	val sum_s = val(0);
+#endif
+
+#if WD == 2 // weight discard (mean)
 	int r_index = 0;
 	val_packed all_weights[r_area];
 	val_packed all_pixels[r_area];
-#elif WD == 1 // weight discard
-	val no_weights = val(0);
-	val discard_total_weight = val(0);
-	val discard_sum = val(0);
+#elif WD == 1 // weight discard (moving cumulative average)
+	int r_iter = 1;
+	val wd_total_weight = val(0);
+	val wd_sum = val(0);
 #endif
 
 	FOR_FRAME(r) {
-	// XXX ME is always a frame behind, should have to option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
+	// XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired)
 #if T && ME == 1 // temporal & motion estimation max weight
 	if (r.z > 0) {
 		me += me_tmp * MEF;
@@ -1152,19 +2086,26 @@ vec4 hook()
 		me_weight = 0;
 	}
 #endif
-	FOR_RESEARCH(r) { // main NLM logic
+	FOR_RESEARCH(r) {
+		// r coords with appropriate transformations applied
+		vec3 tr = vec3(r.xy + floor(r.xy * RSF), r.z);
+		float spatial_weight = spatial_r(tr);
+		tr.xy += me.xy;
+
+		val px = load(tr);
+
 #if SKIP_PATCH
 		val weight = val(1);
 #else
-		val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(r+me, vec3(0))) : patch_comparison(r+me, vec3(0));
+		val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(tr, vec3(0))) : patch_comparison(tr, vec3(0));
 		val weight = range(pdiff_sq);
 #endif
 
 #if T && ME == 1 // temporal & motion estimation max weight
-		me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x));
+		me_tmp = vec3(tr.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x));
 		maxweight = max(maxweight, weight.x);
 #elif T && ME == 2 // temporal & motion estimation weighted average
-		me_sum += vec3(r.xy,0) * weight.x;
+		me_sum += vec3(tr.xy,0) * weight.x;
 		me_weight += weight.x;
 #endif
 
@@ -1172,21 +2113,34 @@ vec4 hook()
 		weight = val(weight.x);
 #endif
 
-		weight *= spatial_r(r);
+		weight *= spatial_weight;
 
-#if WD == 2 // weight discard
+#if AS
+		spatial_weight *= int(r.z == 0); // ignore temporal
+		sum_s += px * spatial_weight;
+		total_weight_s += spatial_weight;
+#endif
+
+#if WD == 2 // weight discard (mean)
 		all_weights[r_index] = val_pack(weight);
-		all_pixels[r_index] = val_pack(load(r+me));
+		all_pixels[r_index] = val_pack(px);
 		r_index++;
-#elif WD == 1 // weight discard
-		val wd_scale = 1.0/max(no_weights, 1);
-		val keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight);
-		discard_sum += load(r+me) * weight * (1 - keeps);
-		discard_total_weight += weight * (1 - keeps);
-		no_weights += keeps;
+#elif WD == 1 // weight discard (moving cumulative average)
+		val wd_scale = val(1.0/r_iter);
+		val below_threshold = WDS * abs(min(val(0.0), weight - (total_weight * wd_scale * WDT * WD1TK(sqrt(wd_scale*WDP)))));
+#if defined(LUMA_raw) // WDK applied to below_threshold as a whole
+		val wdkf = WDK(below_threshold);
+#elif defined(CHROMA_raw) // WDK applied separately to the .x and .y components
+		val wdkf = vec2(WDK(below_threshold.x), WDK(below_threshold.y));
+#else // three components: each one gets its own factor (third is .z, matching the WD == 2 path)
+		val wdkf = vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z));
+#endif
+		wd_sum += px * weight * wdkf;
+		wd_total_weight += weight * wdkf;
+		r_iter++;
 #endif
 
-		sum += load(r+me) * weight;
+		sum += px * weight;
 		total_weight += weight;
 	} // FOR_RESEARCH
 	} // FOR_FRAME
@@ -1194,37 +2148,37 @@ vec4 hook()
 	val avg_weight = total_weight * r_scale;
 	val old_avg_weight = avg_weight;
 
-#if WD == 2 // true average
+#if WD == 2 // weight discard (mean)
 	total_weight = val(0);
 	sum = val(0);
-	val no_weights = val(0);
 
 	for (int i = 0; i < r_area; i++) {
-		val w = val_unpack(all_weights[i]);
+		val weight = val_unpack(all_weights[i]);
 		val px = val_unpack(all_pixels[i]);
-		val keeps = step(avg_weight*WDT, w);
 
-		w *= keeps;
-		sum += px * w;
-		total_weight += w;
-		no_weights += keeps;
+		val below_threshold = WDS * abs(min(val(0.0), weight - (avg_weight * WDT)));
+#if defined(LUMA_raw)
+		weight *= WDK(below_threshold);
+#elif defined(CHROMA_raw)
+		weight *= vec2(WDK(below_threshold.x), WDK(below_threshold.y));
+#else
+		weight *= vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z));
+#endif
+
+		sum += px * weight;
+		total_weight += weight;
 	}
-#elif WD == 1 // moving cumulative average
-	total_weight -= discard_total_weight;
-	sum -= discard_sum;
+#elif WD == 1 // weight discard (moving cumulative average)
+	total_weight = wd_total_weight;
+	sum = wd_sum;
 #endif
 #if WD // weight discard
-	avg_weight = total_weight / no_weights;
+	avg_weight = total_weight * r_scale;
 #endif
 
 	total_weight += SW * spatial_r(vec3(0));
 	sum += poi * SW * spatial_r(vec3(0));
-
-#if V == 3 // weight map
-	result = val(avg_weight);
-#else // mean
 	result = val(sum / total_weight);
-#endif
 
 	// store frames for temporal
 #if T > 1
@@ -1236,27 +2190,17 @@ vec4 hook()
 	imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2));
 #endif
 
-#if ASW == 0 // pre-WD weights
-#define AS_weight old_avg_weight
-#elif ASW == 1 // post-WD weights
-#define AS_weight avg_weight
-#endif
-
-#if ASK == 0
-	val sharpening_strength = pow(AS_weight, val(ASP));
-#elif ASK == 1
-	val sharpening_strength = mix(
-			pow(smoothstep(0.0, 1.0, AS_weight), val(ASP)),
-			AS_weight, ASC);
-	// XXX normalize the result to account for a negative ASC?
-#elif ASK == 2
-	val sharpening_strength = val(ASP);
-#endif
-
 #if AS == 1 // sharpen+denoise
-	val sharpened = result + (poi - result) * ASF;
+#define AS_base result
 #elif AS == 2 // sharpen only
-	val sharpened = poi + (poi - result) * ASF;
+#define AS_base poi
+#endif
+#if AS
+	val usm = result - sum_s/total_weight_s;
+	usm = exp(log(abs(usm))*ASP) * sign(usm); // avoiding pow() since it's buggy on nvidia
+	usm *= gaussian(abs((AS_base + usm - 0.5) / 1.5) * ASA);
+	usm *= ASF;
+	result = AS_base + usm;
 #endif
 
 #if EP // extremes preserve
@@ -1264,26 +2208,27 @@ vec4 hook()
 	// EPSILON is needed since pow(0,0) is undefined
 	float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP));
 	result = mix(poi, result, ep_weight);
-#endif
-
-#if AS == 1 // sharpen+denoise
-	result = mix(sharpened, result, sharpening_strength);
-#elif AS == 2 // sharpen only
-	result = mix(sharpened, poi, sharpening_strength);
-#endif
-
-#if V == 4 // edge map
-	result = sharpening_strength;
-#endif
-
-#if (V == 3 || V == 4) && defined(CHROMA_raw) // drop chroma for these visualizations
-	return vec4(0.5);
+#else
+	float ep_weight = 0;
 #endif
 
 #if V == 1
 	result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0);
 #elif V == 2
 	result = (poi - result) * 0.5 + 0.5;
+#elif V == 3 // post-WD weight map
+	result = avg_weight;
+#elif V == 4 // pre-WD edge map
+	result = old_avg_weight;
+#elif V == 5
+	result = 0.5 + usm;
+#elif V == 6
+	result = val(1 - ep_weight);
+#endif
+
+// XXX visualize chroma for these
+#if defined(CHROMA_raw) && (V == 3 || V == 4 || V == 6)
+	return vec4(0.5);
 #endif
 
 	return unval(mix(poi, result, BF));
diff --git a/portable_config/vs/SR_ESRGAN_DML.vpy b/portable_config/vs/SR_ESRGAN_DML.vpy
index 3cf78190..7dbb75c3 100644
--- a/portable_config/vs/SR_ESRGAN_DML.vpy
+++ b/portable_config/vs/SR_ESRGAN_DML.vpy
@@ -13,14 +13,14 @@ clip = video_in
 
 H_Pre = 720
 Lt_Hd = False
-Model = 5000
+Model = 5005
 Gpu = 0
 Gpu_T = 2
 H_Max = 1440
 Lk_Fmt = False
 ## 整数，预降低处理源高度
 ## <True|False> 是否对超过HD分辨率（720P）的源进行处理
-## <0|2|5000|5001|5002|5003|5004> 使用的模型
+## <0|2|5005|5006|5007> 使用的模型
 ## 使用的显卡序号，0为排序一号
 ## <1|2|3> 使用的显卡线程数
 ## 整数，输出高度限制（填你的显示器高度）
diff --git a/portable_config/vs/SR_ESRGAN_NV.vpy b/portable_config/vs/SR_ESRGAN_NV.vpy
index da98aaf2..3ac10042 100644
--- a/portable_config/vs/SR_ESRGAN_NV.vpy
+++ b/portable_config/vs/SR_ESRGAN_NV.vpy
@@ -13,7 +13,7 @@ clip = video_in
 
 H_Pre = 720
 Lt_Hd = False
-Model = 5000
+Model = 5005
 Gpu = 0
 Gpu_T = 2
 St_Eng = False
@@ -22,7 +22,7 @@ H_Max = 1440
 Lk_Fmt = False
 ## 整数，预降低处理源高度
 ## <True|False> 是否对超过HD分辨率（720P）的源进行处理
-## <0|2|5000|5001|5002|5003|5004> 使用的模型
+## <0|2|5005|5006|5007> 使用的模型
 ## 使用的显卡序号，0为排序一号
 ## <1|2|3> 使用的显卡线程数
 ## <True|False> 是否使用静态引擎（需要对不同分辨率的源各进行预处理）；动态引擎自适应不同分辨率（64²→DCI2K）