From 6309bcbc41cd824a38f63c10a12e3bafe7198c6a Mon Sep 17 00:00:00 2001 From: hooke007 Date: Fri, 26 May 2023 00:58:48 +0100 Subject: [PATCH] =?UTF-8?q?=E5=90=8C=E6=AD=A5=E4=B8=8E=E6=95=B4=E5=90=88?= =?UTF-8?q?=E4=B8=8A=E6=B8=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 核心: 同步 --slang --alang --subs-fallback --hwdec 脚本: uosc 同步至 4.7.0+ ;撤回临时修复,使用上游方案代替 着色器: nlmeans再次经历了重构,只采用原始仓库标准和hqx目录中的着色器并移除微调式(关联如下)的变体 - guided guided_fast - guided_s guided_s_fast - nlmeans nlmeans_sharpen_denoise nlmeans_sharpen_only - nlmeans_temporal nlmeans_temporal_sharpen_denoise --- portable_config/mpv.conf | 15 +- portable_config/scripts/osc_plus.lua | 39 +- portable_config/scripts/thumbfast.lua | 11 +- .../scripts/uosc/elements/Elements.lua | 2 +- .../scripts/uosc/elements/Menu.lua | 192 +- .../scripts/uosc/elements/Timeline.lua | 2 +- .../scripts/uosc/elements/TopBar.lua | 8 +- portable_config/scripts/uosc/lib/std.lua | 37 +- portable_config/scripts/uosc/lib/text.lua | 15 +- portable_config/scripts/uosc/lib/utils.lua | 101 +- portable_config/scripts/uosc/main.lua | 40 +- portable_config/shaders/nlmeans.glsl | 1821 +++++++--- portable_config/shaders/nlmeans_2x.glsl | 1247 ------- portable_config/shaders/nlmeans_hqx.glsl | 2947 ++++++++++++++--- portable_config/shaders/nlmeans_lgc.glsl | 1043 ------ portable_config/shaders/nlmeans_lq.glsl | 1086 ------ portable_config/shaders/nlmeans_temporal.glsl | 1819 +++++++--- portable_config/vs/SR_ESRGAN_DML.vpy | 4 +- portable_config/vs/SR_ESRGAN_NV.vpy | 4 +- 19 files changed, 5616 insertions(+), 4817 deletions(-) delete mode 100644 portable_config/shaders/nlmeans_2x.glsl delete mode 100644 portable_config/shaders/nlmeans_lgc.glsl delete mode 100644 portable_config/shaders/nlmeans_lq.glsl diff --git a/portable_config/mpv.conf b/portable_config/mpv.conf index 85699d72..a7bd5eca 100644 --- a/portable_config/mpv.conf +++ b/portable_config/mpv.conf @@ -21,9 +21,9 @@ d3d11-exclusive-fs = no # [当 
--gpu-api=d3d11 时] 全屏时独占,默认 no d3d11-flip = yes # (通常在 --d3d11-exclusive-fs=yes 和 --on-top 一起使用时)禁用它可避免MPV全屏时的冻屏问题,默认 yes - hwdec = no # 指定应使用的硬件视频解码API,默认软解(no)。10系以上N卡如需硬解强烈建议使用 nvdec-copy - # 值 auto 等效 yes 即原生硬解。追求效率可使用,但不支持部分设置/滤镜/着色器 - # 平衡选择推荐使用 auto-copy + hwdec = no # 指定应使用的硬件视频解码API,默认值 no 为软解。值 auto 等效 yes 即原生硬解,但不支持部分设置/滤镜。 + # 它也可以是多个值组成的优先级列表,例如值 vulkan-copy,nvdec-copy,dxva2-copy 表示依次尝试这些解码模式 + # 更多详情参见Wiki的FAQ页面下的“软硬解的选择”部分 hwdec-codecs = "h264,vc1,hevc,vp8,vp9,av1,prores" # 对限定范围内的编码尝试硬解,特殊值 all 即任意格式都尝试硬解,当前版本默认值 h264,vc1,hevc,vp8,vp9,av1,prores vd-lavc-dr = auto # <默认auto|yes|no> 是否直接解码到显存,个别低端英特尔处理器可能需要显式禁用此功能以大幅提速解码 @@ -139,7 +139,8 @@ # 特殊值 stereo 强制多声道音源下混为双声道输出(避免可能的7.1/5.1→2.0声音丢失和音量过小) audio-pitch-correction = yes # 变速播放时的音调修正,默认 yes alang = - # 音轨首选语言,但MPV优先加载外挂轨道,此项参数可能实际用处不大。默认为空,例值(优选中文) chs,sc,zh,chi,zho + # 音轨首选语言,但MPV优先加载外挂轨道,此项参数可能实际用处不大。 + # 默认为空,特殊值可为 auto (尝试匹配系统语言),例值(优选中文) chs,sc,zh,chi,zho audio-file-auto = no # <默认no|exact|fuzzy|all> 自动加载同名外挂音轨(fuzzy为模糊名,exact为精确名) @@ -252,8 +253,10 @@ sub-file-paths = # 在指定的额外目录中寻找匹配的字幕。支持相对和绝对路径,默认为空 # 例值( sub;subtitles;字幕;C:/字幕库 )即自动搜索当前文件路径下名为"sub","subtitles","字幕"和C盘的"字幕库"文件夹内 - slang = - # 字幕首选语言,但MPV优先加载外挂轨道,此项参数可能实际用处不大。默认为空,例值(优选中文) chs,sc,zh,chi,zho + slang = auto + # 字幕首选语言,但MPV优先加载外挂轨道,此项参数可能实际用处不大。 + # 默认值为 auto (尝试匹配系统语言),例值(优选中文) chs,sc,zh,chi,zho + subs-fallback = no # 现有字幕轨无法满足 --slang 的条件时是否回退选择其它字幕,值 default 表示仅选择带有“默认”标记的轨道 blend-subtitles = no # 在插值和颜色管理之前,将字幕混合到视频帧上。值video类似于yes,但是以视频的原始分辨率绘制字幕,并与视频一起缩放 # 启用此功能会将字幕限制在视频的可见部分(不能出现在视频下方的黑色空白处) # 还会让字幕受 --icc-profile --target-prim --target-trc --interpolation --gamma-factor --glsl-shaders 的影响 diff --git a/portable_config/scripts/osc_plus.lua b/portable_config/scripts/osc_plus.lua index 9108658b..6a4e0bd7 100644 --- a/portable_config/scripts/osc_plus.lua +++ b/portable_config/scripts/osc_plus.lua @@ -1,6 +1,6 @@ --[[ SOURCE_ https://github.com/mpv-player/mpv/blob/master/player/lua/osc.lua -COMMIT_ 
b7ffe0d16eec8153d9609382997baaf6a29e5e4f +COMMIT_ 945d7c1eda47c97c4bfba884fb21f398a64b2289 文档_ https://github.com/hooke007/MPV_lazy/discussions/18 改进版本的OSC,不兼容其它OSC类脚本(实现全部功能需搭配 新缩略图引擎 thumbfast ) @@ -1432,6 +1432,11 @@ layouts["box"] = function () {x = posX - pos_offsetX, y = bigbtnrowY, an = 7, w = 70, h = 18} lo.style = osc_styles.smallButtonsL + lo = add_layout("tog_forced_only") + lo.geometry = + {x = posX - pos_offsetX + 70, y = bigbtnrowY - 1, an = 7, w = 25, h = 18} + lo.style = osc_styles.smallButtonsL + lo = add_layout("tog_fs") lo.geometry = {x = posX+pos_offsetX - 25, y = bigbtnrowY, an = 4, w = 25, h = 25} @@ -1941,6 +1946,12 @@ function bar_layout(direction) lo.geometry = geo lo.style = osc_styles.smallButtonsBar + -- Forced-subs-only button + geo = { x = geo.x - geo.w - padX, y = geo.y, an = geo.an, w = geo.w, h = geo.h } + lo = add_layout("tog_forced_only") + lo.geometry = geo + lo.style = osc_styles.smallButtonsBar + -- Track selection buttons geo = { x = geo.x - tsW - padX, y = geo.y, an = geo.an, w = tsW, h = geo.h } lo = add_layout("cy_sub") @@ -2327,6 +2338,32 @@ function osc_init() ne.eventresponder["shift+mbtn_left_down"] = function () show_message(get_tracklist("sub"), 2) end + -- tog_forced_only + local tog_forced_only = new_element("tog_forced_only", "button") + + ne = tog_forced_only + ne.content = function () + sub_codec = mp.get_property("current-tracks/sub/codec") + if (sub_codec ~= "dvd_subtitle" and sub_codec ~= "hdmv_pgs_subtitle") then + return "" + end + local base_a = tog_forced_only.layout.alpha + local alpha = base_a[1] + if not mp.get_property_bool("sub-forced-only-cur") then + alpha = 255 + end + local ret = assdraw.ass_new() + ret:append("[") + ass_append_alpha(ret, {[1] = alpha, [2] = 1, [3] = base_a[3], [4] = base_a[4]}, 0) + ret:append("F") + ass_append_alpha(ret, base_a, 0) + ret:append("]") + return ret.text + end + ne.eventresponder["mbtn_left_up"] = function () + mp.set_property_bool("sub-forced-only", (not 
mp.get_property_bool("sub-forced-only-cur"))) + end + ne.eventresponder["wheel_up_press"] = function () set_track("sub", -1) end ne.eventresponder["wheel_down_press"] = diff --git a/portable_config/scripts/thumbfast.lua b/portable_config/scripts/thumbfast.lua index 5a5a36f4..b1e09549 100644 --- a/portable_config/scripts/thumbfast.lua +++ b/portable_config/scripts/thumbfast.lua @@ -1,6 +1,6 @@ --[[ SOURCE_ https://github.com/po5/thumbfast/blob/master/thumbfast.lua -COMMIT_ 8aa6faf10adad899e05cc9b850cde904d37515be +COMMIT_ 4241c7daa444d3859b51b65a39d30e922adb87e9 适配多个OSC类脚本的新缩略图引擎 @@ -260,8 +260,8 @@ local auto_run = options.auto_run local function info(w, h) local short_video = mp.get_property_number("duration", 0) <= options.min_duration - local image = properties["current-tracks"] and properties["current-tracks"]["video"] and properties["current-tracks"]["video"]["image"] - local albumart = image and properties["current-tracks"]["video"]["albumart"] + local image = properties["current-tracks/video"] and properties["current-tracks/video"]["image"] + local albumart = image and properties["current-tracks/video"]["albumart"] disabled = (w or 0) == 0 or (h or 0) == 0 or has_vid == 0 or @@ -692,8 +692,7 @@ local function update_tracklist(name, value) -- current-tracks shim for _, track in ipairs(value) do if track.type == "video" and track.selected then - properties["current-tracks/video/image"] = track.image - properties["current-tracks/video/albumart"] = track.albumart + properties["current-tracks/video"] = track return end end @@ -748,7 +747,7 @@ local function shutdown() end end -mp.observe_property("current-tracks", "native", function(name, value) +mp.observe_property("current-tracks/video", "native", function(name, value) update_property(name, value) end) diff --git a/portable_config/scripts/uosc/elements/Elements.lua b/portable_config/scripts/uosc/elements/Elements.lua index 489819a8..fc1cc55f 100644 --- a/portable_config/scripts/uosc/elements/Elements.lua +++ 
b/portable_config/scripts/uosc/elements/Elements.lua @@ -22,7 +22,7 @@ function Elements:remove(idOrElement) if element then if not element.destroyed then element:destroy() end element.enabled = false - self.itable = itable_remove(self.itable, self[id]) + self.itable = itable_delete_value(self.itable, self[id]) self[id] = nil request_render() end diff --git a/portable_config/scripts/uosc/elements/Menu.lua b/portable_config/scripts/uosc/elements/Menu.lua index 99d736f3..0a1b1f34 100644 --- a/portable_config/scripts/uosc/elements/Menu.lua +++ b/portable_config/scripts/uosc/elements/Menu.lua @@ -3,13 +3,13 @@ local Element = require('elements/Element') -- Menu data structure accepted by `Menu:open(menu)`. ---@alias MenuData {type?: string; title?: string; hint?: string; keep_open?: boolean; separator?: boolean; items?: MenuDataItem[]; selected_index?: integer;} ---@alias MenuDataItem MenuDataValue|MenuData ----@alias MenuDataValue {title?: string; hint?: string; icon?: string; value: any; bold?: boolean; italic?: boolean; muted?: boolean; active?: boolean; keep_open?: boolean; separator?: boolean;} +---@alias MenuDataValue {title?: string; hint?: string; icon?: string; value: any; bold?: boolean; italic?: boolean; muted?: boolean; active?: boolean; keep_open?: boolean; separator?: boolean; selectable?: boolean; align?: 'left'|'center'|'right'} ---@alias MenuOptions {mouse_nav?: boolean; on_open?: fun(); on_close?: fun(); on_back?: fun(); on_move_item?: fun(from_index: integer, to_index: integer, submenu_path: integer[]); on_delete_item?: fun(index: integer, submenu_path: integer[])} -- Internal data structure created from `Menu`. 
---@alias MenuStack {id?: string; type?: string; title?: string; hint?: string; selected_index?: number; keep_open?: boolean; separator?: boolean; items: MenuStackItem[]; parent_menu?: MenuStack; submenu_path: integer[]; active?: boolean; width: number; height: number; top: number; scroll_y: number; scroll_height: number; title_width: number; hint_width: number; max_width: number; is_root?: boolean; fling?: Fling} ---@alias MenuStackItem MenuStackValue|MenuStack ----@alias MenuStackValue {title?: string; hint?: string; icon?: string; value: any; active?: boolean; bold?: boolean; italic?: boolean; muted?: boolean; keep_open?: boolean; separator?: boolean; title_width: number; hint_width: number} +---@alias MenuStackValue {title?: string; hint?: string; icon?: string; value: any; active?: boolean; bold?: boolean; italic?: boolean; muted?: boolean; keep_open?: boolean; separator?: boolean; selectable?: boolean; align?: 'left'|'center'|'right'; title_width: number; hint_width: number} ---@alias Fling {y: number, distance: number, time: number, easing: fun(x: number), duration: number, update_cursor?: boolean} ---@alias Modifiers {shift?: boolean, ctrl?: boolean, alt?: boolean} @@ -156,7 +156,7 @@ function Menu:update(data) -- Update items local first_active_index = nil - menu.items = {} -- {{title = lang._menu_item_empty_title, value = 'ignore', italic = 'true', muted = 'true'}} + menu.items = {} -- {{title = lang._menu_item_empty_title, value = 'ignore', italic = 'true', muted = 'true', selectable = false, align = 'center'}} for i, item_data in ipairs(menu_data.items or {}) do if item_data.active and not first_active_index then first_active_index = i end @@ -164,6 +164,7 @@ function Menu:update(data) local item = {} table_assign(item, item_data, { 'title', 'icon', 'hint', 'active', 'bold', 'italic', 'muted', 'value', 'keep_open', 'separator', + 'selectable', 'align' }) if item.keep_open == nil then item.keep_open = menu.keep_open end @@ -265,10 +266,14 @@ function 
Menu:reset_navigation() -- Reset indexes and scroll self:scroll_to(menu.scroll_y) -- clamps scroll_y to scroll limits - if self.mouse_nav then - self:select_item_below_cursor() + if menu.items and #menu.items > 0 then + -- Normalize existing selected_index always, and force it only in keyboard navigation + if not self.mouse_nav and not menu.selected_index then + local from = clamp(1, menu.selected_index or 1, #menu.items) + self:select_index(itable_find(menu.items, function(item) return item.selectable ~= false end, from), menu) + end else - self:select_index((menu.items and #menu.items > 0) and clamp(1, menu.selected_index or 1, #menu.items) or nil) + self:select_index(nil) end -- Walk up the parent menu chain and activate items that lead to current menu @@ -289,12 +294,6 @@ end function Menu:fadeout(callback) self:tween_property('opacity', 1, 0, callback) end -function Menu:get_item_index_below_cursor() - local menu = self.current - if #menu.items < 1 or self.proximity_raw > 0 then return nil end - return math.max(1, math.min(math.ceil((cursor.y - self.ay + menu.scroll_y) / self.scroll_step), #menu.items)) -end - function Menu:get_first_active_index(menu) menu = menu or self.current for index, item in ipairs(self.current.items) do @@ -445,15 +444,31 @@ end ---@param menu? MenuStack function Menu:prev(menu) menu = menu or self.current - menu.selected_index = math.max(menu.selected_index and menu.selected_index - 1 or #menu.items, 1) - self:scroll_to_index(menu.selected_index, menu, true) + local initial_index = menu.selected_index and menu.selected_index - 1 or #menu.items + if initial_index > 0 then + menu.selected_index = itable_find(menu.items, function(item) return item.selectable ~= false end, initial_index, 1) + self:scroll_to_index(menu.selected_index, menu, true) + end end ---@param menu? 
MenuStack function Menu:next(menu) menu = menu or self.current - menu.selected_index = math.min(menu.selected_index and menu.selected_index + 1 or 1, #menu.items) - self:scroll_to_index(menu.selected_index, menu, true) + local initial_index = menu.selected_index and menu.selected_index + 1 or 1 + if initial_index <= #menu.items then + menu.selected_index = itable_find(menu.items, function(item) return item.selectable ~= false end, initial_index) + self:scroll_to_index(menu.selected_index, menu, true) + end +end + +---@param menu MenuStack One of menus in `self.all`. +---@param x number `x` coordinate to slide from. +function Menu:slide_in_menu(menu, x) + local current = self.current + current.selected_index = nil + self:activate_menu(menu) + self:tween(-(display.width / 2 - menu.width / 2 - x), 0, function(offset) self:set_offset_x(offset) end) + self.opacity = 1 -- in case tween above canceled fade in animation end function Menu:back() @@ -462,20 +477,17 @@ function Menu:back() if self.is_closed then return end end - local menu = self.current - local parent = menu.parent_menu + local current = self.current + local parent = current.parent_menu if parent then - menu.selected_index = nil - self:activate_menu(parent) - self:tween(self.offset_x - menu.width / 2, 0, function(offset) self:set_offset_x(offset) end) - self.opacity = 1 -- in case tween above canceled fade in animation + self:slide_in_menu(parent, display.width / 2 - current.width / 2 - parent.width / 2 + self.offset_x) else self:close() end end ----@param opts? {keep_open?: boolean, preselect_submenu_item?: boolean} +---@param opts? 
{keep_open?: boolean, preselect_first_item?: boolean} function Menu:open_selected_item(opts) opts = opts or {} local menu = self.current @@ -483,7 +495,7 @@ function Menu:open_selected_item(opts) local item = menu.items[menu.selected_index] -- Is submenu if item.items then - if opts.preselect_submenu_item then + if opts.preselect_first_item then item.selected_index = #item.items > 0 and 1 or nil end self:activate_menu(item) @@ -497,8 +509,7 @@ function Menu:open_selected_item(opts) end function Menu:open_selected_item_soft() self:open_selected_item({keep_open = true}) end -function Menu:open_selected_item_preselect() self:open_selected_item({preselect_submenu_item = true}) end -function Menu:select_item_below_cursor() self.current.selected_index = self:get_item_index_below_cursor() end +function Menu:open_selected_item_preselect() self:open_selected_item({preselect_first_item = true}) end ---@param index integer function Menu:move_selected_item_to(index) @@ -531,8 +542,7 @@ function Menu:handle_cursor_down() self.drag_data = {{y = cursor.y, time = mp.get_time()}} self.current.fling = nil else - if cursor.x < self.ax and self.current.parent_menu then self:back() - else self:close() end + self:close() end end @@ -548,8 +558,7 @@ end function Menu:handle_cursor_up() if self.proximity_raw == 0 and self.drag_data and not self.is_dragging then - self:select_item_below_cursor() - self:open_selected_item({preselect_submenu_item = false, keep_open = self.modifiers and self.modifiers.shift}) + self:open_selected_item({preselect_first_item = false, keep_open = self.modifiers and self.modifiers.shift}) end if self.is_dragging then local distance = self:fling_distance() @@ -564,7 +573,6 @@ function Menu:handle_cursor_up() self.drag_data = nil end - function Menu:on_global_mouse_move() self.mouse_nav = true if self.drag_data then @@ -573,8 +581,6 @@ function Menu:on_global_mouse_move() if distance ~= 0 then self:set_scroll_by(distance) end self.drag_data[#self.drag_data + 1] = 
{y = cursor.y, time = mp.get_time()} end - if self.proximity_raw == 0 or self.is_dragging then self:select_item_below_cursor() - else self.current.selected_index = nil end request_render() end @@ -673,17 +679,14 @@ function Menu:create_key_action(name, modifiers) end function Menu:render() - local update_cursor = false for _, menu in ipairs(self.all) do if menu.fling then - update_cursor = update_cursor or menu.fling.update_cursor or false local time_delta = state.render_last_time - menu.fling.time local progress = menu.fling.easing(math.min(time_delta / menu.fling.duration, 1)) self:set_scroll_to(round(menu.fling.y + menu.fling.distance * progress), menu) if progress < 1 then request_render() else menu.fling = nil end end end - if update_cursor then self:select_item_below_cursor() end cursor.on_primary_down = function() self:handle_cursor_down() end cursor.on_primary_up = function() self:handle_cursor_up() end @@ -696,28 +699,44 @@ function Menu:render() local opacity = options.menu_opacity * self.opacity local spacing = self.item_padding local icon_size = self.font_size - - function draw_menu(menu, x, y, opacity) - local ax, ay, bx, by = x, y, x + menu.width, y + menu.height + local menu_gap, menu_padding = 2, 2 + + ---@param menu MenuStack + ---@param x number + ---@param pos number Horizontal position index. 0 = current menu, <0 parent menus, >1 submenu. + local function draw_menu(menu, x, pos) + local is_current, is_parent, is_submenu = pos == 0, pos < 0, pos > 0 + local menu_opacity = pos == 0 and opacity or opacity * (options.menu_parent_opacity ^ math.abs(pos)) + local ax, ay, bx, by = x, menu.top, x + menu.width, menu.top + menu.height local draw_title = menu.is_root and menu.title local scroll_clip = '\\clip(0,' .. ay .. ',' .. display.width .. ',' .. by .. 
')' local start_index = math.floor(menu.scroll_y / self.scroll_step) + 1 local end_index = math.ceil((menu.scroll_y + menu.height) / self.scroll_step) - local selected_index = menu.selected_index or -1 - -- remove menu_opacity to start off with full opacity, but still decay for parent menus - local text_opacity = opacity / options.menu_opacity + -- Remove menu_opacity to start off with full, but still decay for parent menus + local text_opacity = menu_opacity / options.menu_opacity + local menu_rect = {ax = ax, ay = ay - (draw_title and self.item_height or 0) - 2, bx = bx, by = by + 2} + local blur_selected_index = is_current and self.mouse_nav -- Background - ass:rect(ax, ay - (draw_title and self.item_height or 0) - 2, bx, by + 2, { - color = bg, opacity = opacity, radius = 4, - }) + ass:rect(menu_rect.ax, menu_rect.ay, menu_rect.bx, menu_rect.by, {color = bg, opacity = menu_opacity, radius = 4}) + + if is_parent and get_point_to_rectangle_proximity(cursor, menu_rect) == 0 then + cursor.on_primary_down = function() self:slide_in_menu(menu, x) end + end + + -- Draw submenu if selected + local submenu_rect, current_item = nil, is_current and menu.selected_index and menu.items[menu.selected_index] + local submenu_is_hovered = false + if current_item and current_item.items then + submenu_rect = draw_menu(current_item, menu_rect.bx + menu_gap, 1) + submenu_is_hovered = get_point_to_rectangle_proximity(cursor, submenu_rect) == 0 + if submenu_is_hovered then + cursor.on_primary_down = function() self:open_selected_item({preselect_first_item = false}) end + end + end for index = start_index, end_index, 1 do local item = menu.items[index] - local next_item = menu.items[index + 1] - local is_highlighted = selected_index == index or item.active - local next_is_active = next_item and next_item.active - local next_is_highlighted = selected_index == index + 1 or next_is_active if not item then break end @@ -726,24 +745,47 @@ function Menu:render() local item_center_y = item_ay 
+ (self.item_height / 2) local item_clip = (item_ay < ay or item_by > by) and scroll_clip or nil local content_ax, content_bx = ax + spacing, bx - spacing + local is_selected = menu.selected_index == index or item.active + + -- Select hovered item + if is_current and self.mouse_nav then + if submenu_rect and cursor.direction_to_rectangle_distance(submenu_rect) then + blur_selected_index = false + else + local item_rect_hitbox = { + ax = menu_rect.ax + menu_padding, + ay = item_ay, + bx = menu_rect.bx + (item.items and menu_gap or -menu_padding), -- to bridge the gap with cursor + by = item_by + } + if submenu_is_hovered or get_point_to_rectangle_proximity(cursor, item_rect_hitbox) == 0 then + blur_selected_index = false + menu.selected_index = index + end + end + end + + local next_item = menu.items[index + 1] + local next_is_active = next_item and next_item.active + local next_is_highlighted = menu.selected_index == index + 1 or next_is_active local font_color = item.active and fgt or bgt local shadow_color = item.active and fg or bg -- Separator local separator_ay = item.separator and item_by - 1 or item_by local separator_by = item_by + (item.separator and 2 or 1) - if is_highlighted then separator_ay = item_by + 1 end + if is_selected then separator_ay = item_by + 1 end if next_is_highlighted then separator_by = item_by end if separator_by - separator_ay > 0 and item_by < by then ass:rect(ax + spacing / 2, separator_ay, bx - spacing / 2, separator_by, { - color = fg, opacity = opacity * (item.separator and 0.08 or 0.06), + color = fg, opacity = menu_opacity * (item.separator and 0.08 or 0.06), }) end -- Highlight - local highlight_opacity = 0 + (item.active and 0.8 or 0) + (selected_index == index and 0.15 or 0) - if highlight_opacity > 0 then - ass:rect(ax + 2, item_ay, bx - 2, item_by, { + local highlight_opacity = 0 + (item.active and 0.8 or 0) + (menu.selected_index == index and 0.15 or 0) + if not is_submenu and highlight_opacity > 0 then + ass:rect(ax + 
menu_padding, item_ay, bx - menu_padding, item_by, { radius = 2, color = fg, opacity = highlight_opacity * text_opacity, clip = item_clip, }) @@ -777,7 +819,7 @@ function Menu:render() local clip = '\\clip(' .. title_cut_x .. ',' .. math.max(item_ay, ay) .. ',' .. bx .. ',' .. math.min(item_by, by) .. ')' ass:txt(content_bx, item_center_y, 6, item.ass_safe_hint, { - size = self.font_size_hint, color = font_color, wrap = 2, opacity = 0.5 * opacity, clip = clip, + size = self.font_size_hint, color = font_color, wrap = 2, opacity = 0.5 * menu_opacity, clip = clip, shadow = 1, shadow_color = shadow_color, }) end @@ -787,7 +829,13 @@ function Menu:render() item.ass_safe_title = item.ass_safe_title or ass_escape(item.title) local clip = '\\clip(' .. ax .. ',' .. math.max(item_ay, ay) .. ',' .. title_cut_x .. ',' .. math.min(item_by, by) .. ')' - ass:txt(content_ax, item_center_y, 4, item.ass_safe_title, { + local title_x, align = content_ax, 4 + if item.align == 'right' then + title_x, align = title_cut_x, 6 + elseif item.align == 'center' then + title_x, align = content_ax + (title_cut_x - content_ax) / 2, 5 + end + ass:txt(title_x, item_center_y, align, item.ass_safe_title, { size = self.font_size, color = font_color, italic = item.italic, bold = item.bold, wrap = 2, opacity = text_opacity * (item.muted and 0.5 or 1), clip = clip, shadow = 1, shadow_color = shadow_color, @@ -803,15 +851,15 @@ function Menu:render() -- Background ass:rect(ax + 2, title_ay, bx - 2, title_ay + title_height, { - color = fg, opacity = opacity * 0.8, radius = 2, + color = fg, opacity = menu_opacity * 0.8, radius = 2, }) ass:texture(ax + 2, title_ay, bx - 2, title_ay + title_height, 'n', { - size = 80, color = bg, opacity = opacity * 0.1, + size = 80, color = bg, opacity = menu_opacity * 0.1, }) -- Title ass:txt(ax + menu.width / 2, title_ay + (title_height / 2), 5, menu.ass_safe_title, { - size = self.font_size, bold = true, color = bg, wrap = 2, opacity = opacity, + size = self.font_size, 
bold = true, color = bg, wrap = 2, opacity = menu_opacity, clip = '\\clip(' .. ax .. ',' .. title_ay .. ',' .. bx .. ',' .. ay .. ')', }) end @@ -821,33 +869,31 @@ function Menu:render() local groove_height = menu.height - 2 local thumb_height = math.max((menu.height / (menu.scroll_height + menu.height)) * groove_height, 40) local thumb_y = ay + 1 + ((menu.scroll_y / menu.scroll_height) * (groove_height - thumb_height)) - ass:rect(bx - 3, thumb_y, bx - 1, thumb_y + thumb_height, {color = fg, opacity = opacity * 0.8}) + ass:rect(bx - 3, thumb_y, bx - 1, thumb_y + thumb_height, {color = fg, opacity = menu_opacity * 0.8}) + end + + -- We are in mouse nav and cursor isn't hovering any item + if blur_selected_index then + menu.selected_index = nil end + + return menu_rect end -- Main menu - draw_menu(self.current, self.ax, self.ay, opacity) + draw_menu(self.current, self.ax, 0) -- Parent menus local parent_menu = self.current.parent_menu - local parent_offset_x = self.ax - local parent_opacity_factor = options.menu_parent_opacity - local menu_gap = 2 + local parent_offset_x, parent_horizontal_index = self.ax, -1 while parent_menu do parent_offset_x = parent_offset_x - parent_menu.width - menu_gap - draw_menu(parent_menu, parent_offset_x, parent_menu.top, parent_opacity_factor * opacity) - parent_opacity_factor = parent_opacity_factor * parent_opacity_factor + draw_menu(parent_menu, parent_offset_x, parent_horizontal_index) + parent_horizontal_index = parent_horizontal_index - 1 parent_menu = parent_menu.parent_menu end - -- Selected menu - local selected_menu = self.current.items[self.current.selected_index] - - if selected_menu and selected_menu.items then - draw_menu(selected_menu, self.bx + menu_gap, selected_menu.top, options.menu_parent_opacity * opacity) - end - return ass end diff --git a/portable_config/scripts/uosc/elements/Timeline.lua b/portable_config/scripts/uosc/elements/Timeline.lua index ebcd2684..29708bfb 100644 --- 
a/portable_config/scripts/uosc/elements/Timeline.lua +++ b/portable_config/scripts/uosc/elements/Timeline.lua @@ -411,7 +411,7 @@ function Timeline:render() -- Chapter title if #state.chapters > 0 then - local _, chapter = itable_find(state.chapters, function(c) return hovered_seconds >= c.time end, true) + local _, chapter = itable_find(state.chapters, function(c) return hovered_seconds >= c.time end, #state.chapters, 1) if chapter and not chapter.is_end_only then ass:tooltip(tooltip_anchor, chapter.title_wrapped, { size = self.font_size, offset = 10, responsive = false, bold = true, diff --git a/portable_config/scripts/uosc/elements/TopBar.lua b/portable_config/scripts/uosc/elements/TopBar.lua index b15121fc..85366682 100644 --- a/portable_config/scripts/uosc/elements/TopBar.lua +++ b/portable_config/scripts/uosc/elements/TopBar.lua @@ -197,13 +197,15 @@ function TopBar:render() } local bx = math.min(max_bx, title_ax + text_width(main_title, opts) + padding * 2) local by = self.by - bg_margin - local rect = {ax = title_ax, ay = self.ay, bx = self.title_bx, by = self.by} + local title_rect = {ax = title_ax, ay = title_ay, bx = bx, by = by} - if get_point_to_rectangle_proximity(cursor, rect) == 0 then + if options.top_bar_alt_title_place == 'toggle' + and get_point_to_rectangle_proximity(cursor, title_rect) == 0 then cursor.on_primary_down = function() self:toggle_title() end + cursor.allow_dragging = true end - ass:rect(title_ax, title_ay, bx, by, { + ass:rect(title_rect.ax, title_rect.ay, title_rect.bx, title_rect.by, { color = bg, opacity = visibility * options.top_bar_title_opacity, radius = 2, }) ass:txt(title_ax + padding, self.ay + (self.size / 2), 4, main_title, opts) diff --git a/portable_config/scripts/uosc/lib/std.lua b/portable_config/scripts/uosc/lib/std.lua index 12616661..c72ccb14 100644 --- a/portable_config/scripts/uosc/lib/std.lua +++ b/portable_config/scripts/uosc/lib/std.lua @@ -75,15 +75,25 @@ function itable_index_of(itable, value) end end 
+---@param itable table +---@param value any +---@return boolean +function itable_has(itable, value) + return itable_index_of(itable, value) ~= nil +end + ---@param itable table ---@param compare fun(value: any, index: number) ----@param from_end? boolean Search from the end of the table. +---@param from? number Where to start search, defaults to `1`. +---@param to? number Where to end search, defaults to `#itable`. ---@return number|nil index ---@return any|nil value -function itable_find(itable, compare, from_end) - local from, to, step = from_end and #itable or 1, from_end and 1 or #itable, from_end and -1 or 1 - for index = from, to, step do - if compare(itable[index], index) then return index, itable[index] end +function itable_find(itable, compare, from, to) + from, to = from or 1, to or #itable + for index = from, to, from < to and 1 or -1 do + if index > 0 and index <= #itable and compare(itable[index], index) then + return index, itable[index] + end end end @@ -99,8 +109,21 @@ end ---@param itable table ---@param value any -function itable_remove(itable, value) - return itable_filter(itable, function(item) return item ~= value end) +function itable_delete_value(itable, value) + for index = 1, #itable, 1 do + if itable[index] == value then table.remove(itable, index) end + end + return itable +end + +---@param itable table +---@param transformer fun(value: any, index: number) : any +function itable_map(itable, transformer) + local result = {} + for index, value in ipairs(itable) do + result[index] = transformer(value, index) + end + return result end ---@param itable table diff --git a/portable_config/scripts/uosc/lib/text.lua b/portable_config/scripts/uosc/lib/text.lua index eca4de2a..0af8a6c5 100644 --- a/portable_config/scripts/uosc/lib/text.lua +++ b/portable_config/scripts/uosc/lib/text.lua @@ -51,11 +51,12 @@ local osd_width, osd_height = 100, 100 ---@return integer local function utf8_char_bytes(str, i) local char_byte = str:byte(i) - if char_byte < 
0xC0 then return 1 - elseif char_byte < 0xE0 then return 2 - elseif char_byte < 0xF0 then return 3 - elseif char_byte < 0xF8 then return 4 - else return 1 end + local max_bytes = #str - i + 1 + if char_byte < 0xC0 then return math.min(max_bytes, 1) + elseif char_byte < 0xE0 then return math.min(max_bytes, 2) + elseif char_byte < 0xF0 then return math.min(max_bytes, 3) + elseif char_byte < 0xF8 then return math.min(max_bytes, 4) + else return math.min(max_bytes, 1) end end ---Creates an iterator for an utf-8 encoded string @@ -87,9 +88,7 @@ local function utf8_to_unicode(str, i) unicode = char_byte * (2 ^ 6) ^ (byte_count - 1) end for j = 2, byte_count do - if i + j - 1 <= #str then -- 临时修复 https://github.com/tomasklaen/uosc/issues/515 - char_byte = str:byte(i + j - 1) - 0x80 - end + char_byte = str:byte(i + j - 1) - 0x80 unicode = unicode + char_byte * (2 ^ 6) ^ (byte_count - j) end return round(unicode) diff --git a/portable_config/scripts/uosc/lib/utils.lua b/portable_config/scripts/uosc/lib/utils.lua index 43892c98..134d65cb 100644 --- a/portable_config/scripts/uosc/lib/utils.lua +++ b/portable_config/scripts/uosc/lib/utils.lua @@ -100,6 +100,73 @@ function get_point_to_point_proximity(point_a, point_b) return math.sqrt(dx * dx + dy * dy) end +---@param lax number +---@param lay number +---@param lbx number +---@param lby number +---@param max number +---@param may number +---@param mbx number +---@param mby number +function get_line_to_line_intersection(lax, lay, lbx, lby, max, may, mbx, mby) + -- Calculate the direction of the lines + local uA = ((mbx-max)*(lay-may) - (mby-may)*(lax-max)) / ((mby-may)*(lbx-lax) - (mbx-max)*(lby-lay)) + local uB = ((lbx-lax)*(lay-may) - (lby-lay)*(lax-max)) / ((mby-may)*(lbx-lax) - (mbx-max)*(lby-lay)) + + -- If uA and uB are between 0-1, lines are colliding + if uA >= 0 and uA <= 1 and uB >= 0 and uB <= 1 then + return lax + (uA * (lbx-lax)), lay + (uA * (lby-lay)) + end + + return nil, nil +end + +-- Returns distance from the 
start of a finite ray assumed to be at (rax, ray) +-- coordinates to a line. +---@param rax number +---@param ray number +---@param rbx number +---@param rby number +---@param lax number +---@param lay number +---@param lbx number +---@param lby number +function get_ray_to_line_distance(rax, ray, rbx, rby, lax, lay, lbx, lby) + local x, y = get_line_to_line_intersection(rax, ray, rbx, rby, lax, lay, lbx, lby) + if x then + return math.sqrt((rax - x) ^ 2 + (ray - y) ^ 2) + end + return nil +end + +-- Returns distance from the start of a finite ray assumed to be at (ax, ay) +-- coordinates to a rectangle. Returns `0` if ray originates inside rectangle. +---@param ax number +---@param ay number +---@param bx number +---@param by number +---@param rect {ax: number; ay: number; bx: number; by: number} +---@return number|nil +function get_ray_to_rectangle_distance(ax, ay, bx, by, rect) + -- Is inside + if ax >= rect.ax and ax <= rect.bx and ay >= rect.ay and ay <= rect.by then + return 0 + end + + local closest = nil + + local function updateDistance(distance) + if distance and (not closest or distance < closest) then closest = distance end + end + + updateDistance(get_ray_to_line_distance(ax, ay, bx, by, rect.ax, rect.ay, rect.bx, rect.ay)) + updateDistance(get_ray_to_line_distance(ax, ay, bx, by, rect.bx, rect.ay, rect.bx, rect.by)) + updateDistance(get_ray_to_line_distance(ax, ay, bx, by, rect.ax, rect.by, rect.bx, rect.by)) + updateDistance(get_ray_to_line_distance(ax, ay, bx, by, rect.ax, rect.ay, rect.ax, rect.by)) + + return closest +end + -- Call function with args if it exists function call_maybe(fn, ...) if type(fn) == 'function' then fn(...) end @@ -350,28 +417,39 @@ end -- Navigates in a list, using delta or, when `state.shuffle` is enabled, -- randomness to determine the next item. Loops around if `loop-playlist` is enabled. 
----@param list table +---@param paths table ---@param current_index number ---@param delta number -function decide_navigation_in_list(list, current_index, delta) - if #list < 2 then return #list, list[#list] end +function decide_navigation_in_list(paths, current_index, delta) + if #paths < 2 then return #paths, paths[#paths] end + -- Shuffle looks at the played files history trimmed to 80% length of the paths + -- and removes all paths in it from the potential shuffle pool. This guarantees + -- no path repetition until at least 80% of the playlist has been exhausted. if state.shuffle then - local new_index = current_index + local trimmed_history = itable_slice(state.history, -math.floor(#paths * 0.8)) + local shuffle_pool = {} + + for index, value in ipairs(paths) do + if not itable_has(trimmed_history, value) then + shuffle_pool[#shuffle_pool + 1] = index + end + end + math.randomseed(os.time()) - while current_index == new_index do new_index = math.random(#list) end - return new_index, list[new_index] + local next_index = shuffle_pool[math.random(#shuffle_pool)] + return next_index, paths[next_index] end local new_index = current_index + delta if mp.get_property_native('loop-playlist') then - if new_index > #list then new_index = new_index % #list - elseif new_index < 1 then new_index = #list - new_index end - elseif new_index < 1 or new_index > #list then + if new_index > #paths then new_index = new_index % #paths + elseif new_index < 1 then new_index = #paths - new_index end + elseif new_index < 1 or new_index > #paths then return end - return new_index, list[new_index] + return new_index, paths[new_index] end ---@param delta number @@ -389,7 +467,8 @@ end function navigate_playlist(delta) local playlist, pos = mp.get_property_native('playlist'), mp.get_property_native('playlist-pos-1') if playlist and #playlist > 1 and pos then - local index = decide_navigation_in_list(playlist, pos, delta) + local paths = itable_map(playlist, function(item) return 
normalize_path(item.filename) end) + local index = decide_navigation_in_list(paths, pos, delta) if index then mp.commandv('playlist-play-index', index - 1) return true end end return false diff --git a/portable_config/scripts/uosc/main.lua b/portable_config/scripts/uosc/main.lua index b411b9c0..73fd62b5 100644 --- a/portable_config/scripts/uosc/main.lua +++ b/portable_config/scripts/uosc/main.lua @@ -1,6 +1,6 @@ --[[ SOURCE_ https://github.com/tomasklaen/uosc/tree/main/scripts -COMMIT_ 5e2c93055155bc9aec7534d13804d4f0d7f8a72d +COMMIT_ c8ad77a1a92d0667e1e66f11e84692cd03796ec8 文档_ https://github.com/hooke007/MPV_lazy/discussions/186 极简主义设计驱动的多功能界面脚本群组,兼容 thumbfast 新缩略图引擎 @@ -347,10 +347,14 @@ cursor = { on_primary_up = nil, on_wheel_down = nil, on_wheel_up = nil, + allow_dragging = false, + history = {}, -- {x, y}[] history + history_size = 10, -- Called at the beginning of each render reset_handlers = function() cursor.on_primary_down, cursor.on_primary_up = nil, nil cursor.on_wheel_down, cursor.on_wheel_up = nil, nil + cursor.allow_dragging = false end, mbtn_left_enabled = nil, wheel_enabled = nil, @@ -359,7 +363,8 @@ cursor = { local enable_mbtn_left = (cursor.on_primary_down or cursor.on_primary_up) ~= nil local enable_wheel = (cursor.on_wheel_down or cursor.on_wheel_up) ~= nil if enable_mbtn_left ~= cursor.mbtn_left_enabled then - mp[(enable_mbtn_left and 'enable' or 'disable') .. '_key_bindings']('mbtn_left') + local flags = cursor.allow_dragging and 'allow-vo-dragging' or nil + mp[(enable_mbtn_left and 'enable' or 'disable') .. '_key_bindings']('mbtn_left', flags) cursor.mbtn_left_enabled = enable_mbtn_left end if enable_wheel ~= cursor.wheel_enabled then @@ -381,6 +386,17 @@ cursor = { cursor.autohide_timer:kill() cursor.autohide_timer:resume() end + end, + -- Calculates distance in which cursor reaches rectangle if it continues moving in the same path. + -- Returns `nil` if cursor is not moving towards the rectangle. 
+ direction_to_rectangle_distance = function(rect) + if cursor.hidden or not cursor.history[1] then + return false + end + + local prev_x, prev_y = cursor.history[1][1], cursor.history[1][2] + local end_x, end_y = cursor.x + (cursor.x - prev_x) * 1e10, cursor.y + (cursor.y - prev_y) * 1e10 + return get_ray_to_rectangle_distance(cursor.x, cursor.y, end_x, end_y, rect) end } state = { @@ -397,6 +413,7 @@ state = { end)(), cwd = mp.get_property('working-directory'), path = nil, -- current file path or URL + history = {}, -- history of last played files stored as full paths title = nil, alt_title = nil, time = nil, -- current media playback time @@ -579,18 +596,24 @@ function update_cursor_position(x, y) else x, y = INFINITY, INFINITY end end - -- add 0.5 to be in the middle of the pixel + -- Add 0.5 to be in the middle of the pixel cursor.x, cursor.y = (x + 0.5) / display.scale_x, (y + 0.5) / display.scale_y if old_x ~= cursor.x or old_y ~= cursor.y then Elements:update_proximities() if cursor.x == INFINITY or cursor.y == INFINITY then - cursor.hidden = true + cursor.hidden, cursor.history = true, {} Elements:trigger('global_mouse_leave') elseif cursor.hidden then - cursor.hidden = false + cursor.hidden, cursor.history = false, {} Elements:trigger('global_mouse_enter') + else + -- Update cursor history + for i = 1, cursor.history_size - 1, 1 do + cursor.history[i] = cursor.history[i + 1] + end + cursor.history[cursor.history_size] = {x, y} end Elements:proximity_trigger('mouse_move') @@ -658,7 +681,7 @@ end function select_current_chapter() local current_chapter if state.time and state.chapters then - _, current_chapter = itable_find(state.chapters, function(c) return state.time >= c.time end, true) + _, current_chapter = itable_find(state.chapters, function(c) return state.time >= c.time end, #state.chapters, 1) end set_state('current_chapter', current_chapter) end @@ -699,7 +722,10 @@ end mp.observe_property('mouse-pos', 'native', handle_mouse_pos) 
mp.observe_property('osc', 'bool', function(name, value) if value == true then mp.set_property('osc', 'no') end end) mp.register_event('file-loaded', function() - set_state('path', normalize_path(mp.get_property_native('path'))) + local path = normalize_path(mp.get_property_native('path')) + itable_delete_value(state.history, path) + state.history[#state.history + 1] = path + set_state('path', path) Elements:flash({'top_bar'}) end) mp.register_event('end-file', function(event) diff --git a/portable_config/shaders/nlmeans.glsl b/portable_config/shaders/nlmeans.glsl index 655da37d..5af94a5d 100644 --- a/portable_config/shaders/nlmeans.glsl +++ b/portable_config/shaders/nlmeans.glsl @@ -21,299 +21,1121 @@ // Description: nlmeans.glsl: Default profile, general purpose, tuned for low noise -/* The recommended usage of this shader and its variant profiles is to add them - * to input.conf and then dispatch the appropriate shader via a keybind during - * media playback. Here is an example input.conf entry: +/* This shader is highly configurable via user variables below. Although the + * default settings should offer good quality at a reasonable speed, you are + * encouraged to tweak them to your preferences. + */ + +// The following is shader code injected from ../LQ/nlmeans.glsl +/* vi: ft=c * - * F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl"; show-text "Non-local means (LUMA only)" + * Based on vf_nlmeans.c from FFmpeg. * - * These shaders can also be enabled by default in mpv.conf, for example: + * Copyright (c) 2022 an3223 + * Copyright (c) 2016 Clément Bœsch * - * glsl-shaders='~~/shaders/nlmeans.glsl' + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 2.1 of the License, or (at + * your option) any later version. 
* - * Both of the examples above assume the shaders are located in a subdirectory - * named "shaders" within mpv's config directory. Refer to the mpv - * documentation for more details. + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License + * for more details. * - * This shader is highly configurable via user variables below. Although the + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see . + */ + +// Description: nlmeans.glsl: Faster, but lower quality. + +/* This shader is highly configurable via user variables below. Although the * default settings should offer good quality at a reasonable speed, you are - * encouraged to tweak them to your preferences. Be mindful that certain - * settings may greatly affect speed. + * encouraged to tweak them to your preferences. + */ + +//!HOOK LUMA +//!HOOK CHROMA +//!BIND HOOKED +//!DESC Non-local means (nlmeans.glsl) +//!SAVE RF_LUMA + +// User variables + +// It is generally preferable to denoise luma and chroma differently, so the +// user variables for luma and chroma are split. + +// Denoising factor (level of blur, higher means more blur) +#ifdef LUMA_raw +#define S 3.5968056672833097 +#else +#define S 5.191526541606411 +#endif + +/* Adaptive sharpening + * + * Performs an unsharp mask by subtracting the spatial kernel's blur from the + * NLM blur. For sharpen+denoise the sharpening is limited to edge areas and + * denoising is done everywhere else. * - * Denoising is most useful for noisy content. If there is no perceptible - * noise, you probably won't see a positive difference. + * Use V=4 to visualize which areas are sharpened (black means sharpen). 
+ * + * AS: + * - 0: disable + * - 1: sharpen+denoise + * - 2: sharpen only + * ASF: Higher numbers make a sharper image + * ASA: Anti-ringing, higher numbers increase strength + * ASP: Power, lower numbers increase sharpening on lower frequency detail + */ +#ifdef LUMA_raw +#define AS 0 +#define ASF 0.1625 +#define ASA 5.0 +#define ASP 0.5 +#else +#define AS 0 +#define ASF 0.1625 +#define ASA 5.0 +#define ASP 0.5 +#endif + +/* Starting weight * - * The default settings are generally tuned for low noise and high detail - * preservation. The "medium" and "heavy" profiles are tuned for higher levels - * of noise. + * Also known as the center weight. This represents the weight of the + * pixel-of-interest. Lower numbers may help handle heavy noise & ringing. * - * The denoiser will not work properly if the content has been upscaled - * beforehand (whether it was done by you or not). In such cases, consider - * issuing a command to downscale in the mpv console (backtick ` key): + * EPSILON should be used instead of zero to avoid divide-by-zero errors. + */ +#ifdef LUMA_raw +#define SW 0.7392620481427672 +#else +#define SW 0.6448288408806067 +#endif + +/* Weight discard * - * vf toggle scale=-2:720 + * Reduces weights that fall below a fraction of the average weight. This culls + * the most dissimilar samples from the blur, which can yield a better result, + * especially around edges. + * + * WD: + * - 2: Mean. Better quality, but slower and requires GLSL 4.0 or later + * - 1: Moving cumulative average. Fast but inaccurate, blurs directionally. + * - 0: Disable * - * ...replacing 720 with whatever resolution seems appropriate. Rerun the - * command to undo the downscale. It may take some trial-and-error to find the - * proper resolution. 
+ * WDT: Threshold coefficient, higher numbers discard more + * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes + * WDS (not for WDK=is_zero): Higher numbers are more eager to reduce weights */ +#ifdef LUMA_raw +#define WD 1 +#define WDT 0.580415381682815 +#define WDP 5.381278367349288 +#define WDS 1.0 +#else +#define WD 1 +#define WDT 0.913447511792627 +#define WDP 5.832936323930807 +#define WDS 1.0 +#endif -/* Regarding speed +/* Extremes preserve * - * Speed may vary wildly for different vo and gpu-api settings. Generally - * vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this - * may be different for your system. + * Reduce denoising in very bright/dark areas. * - * If your GPU doesn't support textureGather, or if you are on a version of mpv - * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ profile + * Disabled by default now. If you want to reenable this, set EP=3/ in + * Makefile.nlm and rebuild. * - * If you plan on tinkering with NLM's settings, read below: + * The downscaling factor of the EP shader stage affects what is considered a + * bright/dark area. * - * textureGather only applies to luma and limited to the these configurations: + * This is incompatible with RGB. If you have RGB hooks enabled then you will + * have to delete the EP shader stage or specify EP=0 through shader_cfg. * - * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2} - * - Default, very fast, rotations and reflections should be free - * - If this is unusually slow then try changing gpu-api and vo - * - If it's still slow, try setting RI/RFI to 0. 
+ * EP: 1 to enable, 0 to disable + * DP: EP strength on dark areas, 0 to fully denoise + * BP: EP strength on bright areas, 0 to fully denoise + */ +#ifdef LUMA_raw +#define EP 0 +#define BP 0.75 +#define DP 0.25 +#else +#define EP 0 +#define BP 0.0 +#define DP 0.0 +#endif + +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ + +/* textureGather applicable configurations: * - * - PS=6:RI={0,1,3}:RFI={0,1,2} + * - PS={0,3,7,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2} + * - PS={0,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2} + * - PS=6:RI=0:RFI=0 * - Currently the only scalable variant - * - Patch shape is asymmetric on two axis - * - Rotations should have very little speed impact - * - Reflections may have a significant speed impact * * Options which always disable textureGather: - * - PD - * - NG + * - NG + * - SAMPLE + * - PD + * + * Running without textureGather may be much slower. */ -// The following is shader code injected from guided.glsl -/* vi: ft=c +/* Patch & research sizes * - * Copyright (c) 2022 an3223 + * P should be an odd number. Higher values are slower and not always better. * - * This program is free software: you can redistribute it and/or modify it - * under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 2.1 of the License, or (at - * your option) any later version. + * R should be an odd number greater than or equal to 3. Higher values are + * generally better, but slower, blurrier, and gives diminishing returns. 
+ */ +#ifdef LUMA_raw +#define P 3 +#define R 5 +#else +#define P 3 +#define R 5 +#endif + +/* Patch and research shapes * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License - * for more details. + * Different shapes have different speed and quality characteristics. Every + * shape (besides square) is smaller than square. * - * You should have received a copy of the GNU Lesser General Public License - * along with this program. If not, see . + * PS applies applies to patches, RS applies to research zones. + * + * 0: square (symmetrical) + * 1: horizontal line (asymmetric) + * 2: vertical line (asymmetric) + * 3: diamond (symmetrical) + * 4: triangle (asymmetric, pointing upward) + * 5: truncated triangle (asymmetric on two axis, last row halved) + * 6: even sized square (asymmetric on two axis) + * 7: plus (symmetrical) + * 8: plus X (symmetrical) */ +#ifdef LUMA_raw +#define RS 3 +#define PS 4 +#else +#define RS 3 +#define PS 3 +#endif -// Description: guided.glsl: Guided by the downscaled image +/* Robust filtering + * + * This setting is dependent on code generation from shader_cfg, so this + * setting can only be enabled via shader_cfg. + * + * Computes weights on a guide, which could be a downscaled image or the output + * of another shader, and applies the weights to the original image + */ +#define RF_LUMA 0 +#define RF 0 -/* The radius can be adjusted with the MEANI stage's downscaling factor. - * Higher numbers give a bigger radius. +/* Rotational/reflectional invariance + * + * Number of rotations/reflections to try for each patch comparison. Can be + * slow, but improves feature preservation. More rotations/reflections gives + * diminishing returns. The most similar rotation/reflection will be used. * - * The E variable can be found in the A stage. 
+ * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a + * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc. * - * The subsampling (fast guided filter) can be adjusted with the I stage's - * downscaling factor. Higher numbers are faster. + * Consider setting SAMPLE=1 if setting RI to a setting that would require + * sampling between pixels. * - * The guide's subsampling can be adjusted with the PREI stage's downscaling - * factor. Higher numbers downscale more. + * RI: Rotational invariance + * RFI (0 to 2): Reflectional invariance */ +#ifdef LUMA_raw +#define RI 0 +#define RFI 0 +#else +#define RI 0 +#define RFI 0 +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!BIND HOOKED -//!WIDTH HOOKED.w 1.25 / -//!HEIGHT HOOKED.h 1.25 / -//!DESC Guided filter (PREI) -//!SAVE _INJ_PREI +/* Temporal denoising + * + * This setting is dependent on code generation from shader_cfg, so this + * setting can only be enabled via shader_cfg. + * + * Caveats: + * - Slower: + * - Each frame needs to be researched (more samples & more math) + * - Gather optimizations only apply to the current frame + * - Requires vo=gpu-next + * - Luma-only (this is a bug) + * - Buggy + * + * May cause motion blur and may struggle more with noise that persists across + * multiple frames (e.g., from compression or duplicate frames), but can work + * very well on high quality video. + * + * Motion estimation (ME) should improve quality without impacting speed. + * + * T: number of frames used + * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg + * MEF: estimate factor, compensates for ME being one frame behind + * TRF: compare against the denoised frames + */ +#ifdef LUMA_raw +#define T 0 +#define ME 1 +#define MEF 2 +#define TRF 0 +#else +#define T 0 +#define ME 0 +#define MEF 2 +#define TRF 0 +#endif + +/* Spatial kernel + * + * Increasing the spatial denoising factor (SS) reduces the weight of further + * pixels. 
+ * + * Spatial distortion instructs the spatial kernel to view that axis as + * closer/further, for instance SD=(1,1,0.5) would make the temporal axis + * appear closer and increase blur between frames. + * + * The intra-patch variants are supposed to help with larger patch sizes. + * + * SST: enables spatial kernel if R>=PST, 0 fully disables + * SS: spatial sigma + * SD: spatial distortion (X, Y, time) + * PSS: intra-patch spatial sigma + * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables + * PSD: intra-patch spatial distortion (X, Y) + */ +#ifdef LUMA_raw +#define SST 1 +#define SS 0.49764743714339127 +#define SD vec3(1,1,1) +#define PST 0 +#define PSS 0.0 +#define PSD vec2(1,1) +#else +#define SST 1 +#define SS 0.32091162692066677 +#define SD vec3(1,1,1) +#define PST 0 +#define PSS 0.0 +#define PSD vec2(1,1) +#endif + +/* Kernels + * + * SK: spatial kernel + * RK: range kernel (takes patch differences) + * PSK: intra-patch spatial kernel + * WDK: weight discard kernel + * WD1TK (WD=1 only): weight discard tolerance kernel + * + * List of available kernels: + * + * bicubic + * cos + * gaussian + * lanczos + * quadratic_ (unclamped) + * sinc + * sinc_ (unclamped) + * sinc3 + * sphinx + * sphinx_ (unclamped) + * triangle_ (unclamped) + * triangle + */ +#ifdef LUMA_raw +#define SK gaussian +#define RK gaussian +#define PSK gaussian +#define WDK is_zero +#define WD1TK gaussian +#else +#define SK gaussian +#define RK gaussian +#define PSK gaussian +#define WDK is_zero +#define WD1TK gaussian +#endif + +/* Sampling method + * + * In most cases this shouldn't make any difference, only set to bilinear if + * it's necessary to sample between pixels (e.g., RI=2). + * + * 0: nearest neighbor + * 1: bilinear + */ +#ifdef LUMA_raw +#define SAMPLE 0 +#else +#define SAMPLE 0 +#endif + +/* Research scaling factor + * + * Higher numbers sample more sparsely as the distance from the POI grows. 
+ */ +#ifdef LUMA_raw +#define RSF 0.0 +#else +#define RSF 0.0 +#endif + +// Scaling factor (should match WIDTH/HEIGHT) +#ifdef LUMA_raw +#define SF 1 +#else +#define SF 1 +#endif + +// Use the guide image as the input image +#ifdef LUMA_raw +#define GUIDE_INPUT 0 +#else +#define GUIDE_INPUT 0 +#endif + +/* Visualization + * + * 0: off + * 1: absolute difference between input/output to the power of 0.25 + * 2: difference between input/output centered on 0.5 + * 3: post-WD weight map + * 4: pre-WD weight map + * 5: unsharp mask + * 6: EP + */ +#ifdef LUMA_raw +#define V 0 +#else +#define V 0 +#endif + +// Blur factor (0.0 returns the input image, 1.0 returns the output image) +#ifdef LUMA_raw +#define BF 1.0 +#else +#define BF 1.0 +#endif + +// Force disable textureGather +#ifdef LUMA_raw +#define NG 0 +#else +#define NG 0 +#endif + +// Patch donut (probably useless) +#ifdef LUMA_raw +#define PD 0 +#else +#define PD 0 +#endif + +// Duplicate 1st weight (for luma-guided-chroma) +#ifdef LUMA_raw +#define D1W 0 +#else +#define D1W 0 +#endif + +// Skip patch comparison +#ifdef LUMA_raw +#define SKIP_PATCH 0 +#else +#define SKIP_PATCH 0 +#endif + +// Shader code + +#define EPSILON 1.2e-38 +#define M_PI 3.14159265358979323846 +#define POW2(x) ((x)*(x)) +#define POW3(x) ((x)*(x)*(x)) +#define bicubic_(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0)))) +#define bicubic(x) bicubic_(clamp((x), 0.0, 2.0)) +#define gaussian(x) exp(-1 * POW2(x)) +#define quadratic_(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5)) +#define quadratic(x) quadratic_(clamp((x), 0.0, 1.5)) +#define sinc_(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI)) +#define sinc(x) sinc_(clamp((x), 0.0, 1.0)) +#define sinc3(x) sinc_(clamp((x), 0.0, 3.0)) +#define lanczos(x) (sinc3(x) * sinc(x)) +#define sphinx_(x) ((x) < 1e-8 ? 
1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI)) +#define sphinx(x) sphinx_(clamp((x), 0.0, 1.4302966531242027)) +#define triangle_(x) (1 - (x)) +#define triangle(x) triangle_(clamp((x), 0.0, 1.0)) +#define is_zero(x) int(x == 0) + +// XXX could maybe be better optimized on LGC +#if defined(LUMA_raw) +#define val float +#define val_swizz(v) (v.x) +#define unval(v) vec4(v.x, 0, 0, poi_.a) +#define val_packed val +#define val_pack(v) (v) +#define val_unpack(v) (v) +#elif defined(CHROMA_raw) +#define val vec2 +#define val_swizz(v) (v.xy) +#define unval(v) vec4(v.x, v.y, 0, poi_.a) +#define val_packed uint +#define val_pack(v) packUnorm2x16(v) +#define val_unpack(v) unpackUnorm2x16(v) +#else +#define val vec3 +#define val_swizz(v) (v.xyz) +#define unval(v) vec4(v.x, v.y, v.z, poi_.a) +#define val_packed val +#define val_pack(v) (v) +#define val_unpack(v) (v) +#endif + +#if PS == 6 +const int hp = P/2; +#else +const float hp = int(P/2) - 0.5*(1-(P%2)); // sample between pixels for even patch sizes +#endif + +#if RS == 6 +const int hr = R/2; +#else +const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes +#endif + +// patch/research shapes +// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R) +// dots (.) represent samples (pixels) and X represents the pixel-of-interest + +// Z ..... +// Z ..... +// Z ..X.. +// Z ..... +// Z ..... +#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr) + +// (in this instance Z=4) +// Z .... +// Z .... +// Z ..X. +// Z .... +#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr) + +// Z-4 . +// Z-2 ... +// Z ..X.. +#define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr) + +// Z-4 . +// Z-2 ... 
+// hz+1 ..X +#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr) +#define S_TRIANGLE_A(hz,Z) int(hz*hz+Z) + +// Z-4 . +// Z-2 ... +// Z ..X.. +// Z-2 ... +// Z-4 . +#define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr) +#define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z) + +// +// Z ..X.. +// +#define S_HORIZONTAL(z,hz,incr) for (z.y = 0; z.y <= 0; z.y++) for (z.x = -hz; z.x <= hz; incr) + +// 90 degree rotation of S_HORIZONTAL +#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr) + +// 1 . +// 1 . +// Z ..X.. +// 1 . +// 1 . +#define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr) +#define S_PLUS_A(hz,Z) (Z*2 - 1) + +// 3 . . . +// 3 ... +// Z ..X.. +// 3 ... +// 3 . . . +#define S_PLUS_X(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(z.x) + -hz * int(z.x == 0); z.y <= abs(z.x) + hz * int(z.x == 0); incr) +#define S_PLUS_X_A(hz,Z) (Z*4 - 3) + +// 1x1 square +#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++) + +#define T1 (T+1) +#define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++) + +#ifdef LUMA_raw +#define RF_ RF_LUMA +#else +#define RF_ RF +#endif + +// donut increment, increments without landing on (0,0,0) +// much faster than a continue statement +#define DINCR(z,c,a) ((z.c += a),(z.c += int(z == vec3(0)))) + +#define R_AREA(a) (a * T1 - 1) + +// research shapes +// XXX would be nice to have the option of temporally-varying research sizes +#if R == 0 || R == 1 +#define FOR_RESEARCH(r) S_1X1(r) +const int r_area = R_AREA(1); +#elif RS == 8 +#define FOR_RESEARCH(r) S_PLUS_X(r,hr,DINCR(r,y,max(1,abs(r.x)))) +const int r_area = R_AREA(S_PLUS_X_A(hr,R)); +#elif RS == 7 +#define FOR_RESEARCH(r) S_PLUS(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(S_PLUS_A(hr,R)); +#elif RS == 6 +#define 
FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(R*R); +#elif RS == 5 +#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,DINCR(r,x,1)) +const int r_area = R_AREA(S_TRIANGLE_A(hr,hr)); +#elif RS == 4 +#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,DINCR(r,x,1)) +const int r_area = R_AREA(S_TRIANGLE_A(hr,R)); +#elif RS == 3 +#define FOR_RESEARCH(r) S_DIAMOND(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(S_DIAMOND_A(hr,R)); +#elif RS == 2 +#define FOR_RESEARCH(r) S_VERTICAL(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(R); +#elif RS == 1 +#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,DINCR(r,x,1)) +const int r_area = R_AREA(R); +#elif RS == 0 +#define FOR_RESEARCH(r) S_SQUARE(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(R*R); +#endif + +#define RI1 (RI+1) +#define RFI1 (RFI+1) + +#if RI +#define FOR_ROTATION for (float ri = 0; ri < 360; ri+=360.0/RI1) +#else +#define FOR_ROTATION +#endif + +#if RFI +#define FOR_REFLECTION for (int rfi = 0; rfi < RFI1; rfi++) +#else +#define FOR_REFLECTION +#endif + +#if PD +#define PINCR DINCR +#else +#define PINCR(z,c,a) (z.c += a) +#endif + +#define P_AREA(a) (a - PD) + +// patch shapes +#if P == 0 || P == 1 +#define FOR_PATCH(p) S_1X1(p) +const int p_area = P_AREA(1); +#elif PS == 8 +#define FOR_PATCH(p) S_PLUS_X(p,hp,PINCR(p,y,max(1,abs(p.x)))) +const int p_area = P_AREA(S_PLUS_X_A(hp,P)); +#elif PS == 7 +#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y,1)) +const int p_area = P_AREA(S_PLUS_A(hp,P)); +#elif PS == 6 +#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y,1)) +const int p_area = P_AREA(P*P); +#elif PS == 5 +#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x,1)) +const int p_area = P_AREA(S_TRIANGLE_A(hp,hp)); +#elif PS == 4 +#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x,1)) +const int p_area = P_AREA(S_TRIANGLE_A(hp,P)); +#elif PS == 3 +#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y,1)) +const int p_area = P_AREA(S_DIAMOND_A(hp,P)); +#elif PS == 2 +#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y,1)) +const int 
p_area = P_AREA(P); +#elif PS == 1 +#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x,1)) +const int p_area = P_AREA(P); +#elif PS == 0 +#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y,1)) +const int p_area = P_AREA(P*P); +#endif + +const float r_scale = 1.0/r_area; +const float p_scale = 1.0/p_area; + +#if SAMPLE == 0 +#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * (vec2(off) + 0.5 - fract((pos) * (size)))) +#else +#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * vec2(off)) +#endif + +#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off) + +#if RF_ && defined(LUMA_raw) +#define load2_(off) sample(RF_LUMA_tex, RF_LUMA_pos, RF_LUMA_size, RF_LUMA_pt, off) +#define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr))) +#define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0) +#elif RF_ && D1W +#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off) +#define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr))) +#define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0) +#elif RF_ +#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off) +#else +#define load2_(off) load_(off) +#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr))) +#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0) +#endif + +#if T +val load(vec3 off) +{ + switch (min(int(off.z), frame)) { + case 0: return val_swizz(load_(off)); + + } +} +val load2(vec3 off) +{ + return off.z == 0 ? 
val_swizz(load2_(off)) : load(off); +} +#else +#define load(off) val_swizz(load_(off)) +#define load2(off) val_swizz(load2_(off)) +#endif + +val poi2 = load2(vec3(0)); // guide pixel-of-interest +#if GUIDE_INPUT +#define poi poi2 +#else +vec4 poi_ = load_(vec3(0)); +val poi = val_swizz(poi_); // pixel-of-interest +#endif + +#if RI // rotation +vec2 rot(vec2 p, float d) +{ + return vec2( + p.x * cos(radians(d)) - p.y * sin(radians(d)), + p.y * sin(radians(d)) + p.x * cos(radians(d)) + ); +} +#else +#define rot(p, d) (p) +#endif + +#if RFI // reflection +vec2 ref(vec2 p, int d) +{ + switch (d) { + case 0: return p; + case 1: return p * vec2(1, -1); + case 2: return p * vec2(-1, 1); + } +} +#else +#define ref(p, d) (p) +#endif + +#if SST && R >= SST +float spatial_r(vec3 v) +{ + v.xy += 0.5 - fract(HOOKED_pos*HOOKED_size); + return SK(length(v*SD)*SS); +} +#else +#define spatial_r(v) (1) +#endif + +#if PST && P >= PST +#define spatial_p(v) PSK(length(v*PSD)*PSS) +#else +#define spatial_p(v) (1) +#endif + +val range(val pdiff_sq) +{ + const float h = max(S, 0.0) * 0.013; + const float pdiff_scale = 1.0/(h*h); + pdiff_sq = sqrt(pdiff_sq * pdiff_scale); +#if defined(LUMA_raw) + return RK(pdiff_sq); +#elif defined(CHROMA_raw) + return vec2(RK(pdiff_sq.x), RK(pdiff_sq.y)); +#else + return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z)); +#endif +} -vec4 hook() +val patch_comparison(vec3 r, vec3 r2) { - return HOOKED_texOff(0); + vec3 p; + val min_rot = val(p_area); + + FOR_ROTATION FOR_REFLECTION { + val pdiff_sq = val(0); + FOR_PATCH(p) { + vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z); + val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF); + diff_sq *= diff_sq; + diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy); + pdiff_sq += diff_sq; + } + min_rot = min(min_rot, pdiff_sq); + } + + return min_rot * p_scale; } -//!HOOK LUMA -//!HOOK CHROMA -//!BIND _INJ_PREI -//!WIDTH HOOKED.w -//!HEIGHT HOOKED.h -//!DESC Guided filter (I) -//!SAVE _INJ_I +#define 
NO_GATHER (PD == 0 && NG == 0 && SAMPLE == 0) // never textureGather if any of these conditions are false +#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3 || RI == 7) -vec4 hook() +#if (defined(LUMA_gather) || D1W) && ((PS == 0 || ((PS == 3 || PS == 7) && RI != 7) || PS == 8) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER +// 3x3 diamond/plus patch_comparison_gather +// XXX extend to support arbitrary sizes (probably requires code generation) +// XXX support PSS +const ivec2 offsets_adj[4] = { ivec2(0,-1), ivec2(1,0), ivec2(0,1), ivec2(-1,0) }; +const ivec2 offsets_adj_sf[4] = { ivec2(0,-1) * SF, ivec2(1,0) * SF, ivec2(0,1) * SF, ivec2(-1,0) * SF }; +vec4 poi_patch_adj = gather_offs(0, offsets_adj); +#if PS == 0 || PS == 8 +const ivec2 offsets_diag[4] = { ivec2(-1,-1), ivec2(1,-1), ivec2(1,1), ivec2(-1,1) }; +const ivec2 offsets_diag_sf[4] = { ivec2(-1,-1) * SF, ivec2(1,-1) * SF, ivec2(1,1) * SF, ivec2(-1,1) * SF }; +vec4 poi_patch_diag = gather_offs(0, offsets_diag); +#endif +float patch_comparison_gather(vec3 r, vec3 r2) { -return _INJ_PREI_texOff(0); -} - + float min_rot = p_area - 1; + vec4 transformer_adj = gather_offs(r, offsets_adj_sf); +#if PS == 0 || PS == 8 + vec4 transformer_diag = gather_offs(r, offsets_diag_sf); +#endif + FOR_ROTATION { + FOR_REFLECTION { +#if RFI + /* xxy + * w y + * wzz + */ + switch(rfi) { + case 1: + transformer_adj = transformer_adj.zyxw; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.zyxw; +#endif + break; + case 2: + transformer_adj = transformer_adj.xwzy; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.xwzy; +#endif + break; + } +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (P) -//!BIND HOOKED -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_P + vec4 diff = poi_patch_adj - transformer_adj; +#if PS == 0 || PS == 8 + diff += poi_patch_diag - transformer_diag; +#endif + float diff_sq = dot(diff * diff, vec4(1)); + min_rot = min(diff_sq, min_rot); -vec4 hook() +// 
un-reflect +#if RFI + switch(rfi) { + case 1: + transformer_adj = transformer_adj.zyxw; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.zyxw; +#endif + break; + case 2: + transformer_adj = transformer_adj.xwzy; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.xwzy; +#endif + break; + } +#endif + } // FOR_REFLECTION +#if RI == 7 + transformer_adj = transformer_adj.wxyz; + // swap adjacents for diagonals + transformer_adj += transformer_diag; + transformer_diag = transformer_adj - transformer_diag; + transformer_adj -= transformer_diag; +#elif RI == 3 + transformer_adj = transformer_adj.wxyz; +#elif RI == 1 + transformer_adj = transformer_adj.zwxy; +#endif +#if RI == 3 && (PS == 0 || PS == 8) + transformer_diag = transformer_diag.wxyz; +#elif RI == 1 && (PS == 0 || PS == 8) + transformer_diag = transformer_diag.zwxy; +#endif + } // FOR_ROTATION + float center_diff = poi2.x - load2(r).x; + return (center_diff * center_diff + min_rot) * p_scale; +} +#elif (defined(LUMA_gather) || D1W) && PS == 4 && P == 3 && RI == 0 && RFI == 0 && NO_GATHER +const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,0), ivec2(1,0) }; +const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,0) * SF, ivec2(1,0) * SF }; +vec4 poi_patch = gather_offs(0, offsets); +float patch_comparison_gather(vec3 r, vec3 r2) { - return HOOKED_texOff(0); + vec4 pdiff = poi_patch - gather_offs(r, offsets_sf); + return dot(pdiff * pdiff, vec4(1)) * p_scale; } - -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (MEANI) -//!BIND _INJ_I -//!WIDTH _INJ_I.w 1.5 / -//!HEIGHT _INJ_I.h 1.5 / -//!SAVE _INJ_MEANI - -vec4 hook() +#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER +// tiled even square patch_comparison_gather +// XXX extend to support odd square? 
+float patch_comparison_gather(vec3 r, vec3 r2) { -return _INJ_I_texOff(0); + vec2 tile; + float min_rot = p_area; + + /* gather order: + * w z + * x y + */ + float pdiff_sq = 0; + for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) { + vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy); + diff_sq *= diff_sq; + diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)), + spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0))); + pdiff_sq += dot(diff_sq, vec4(1)); + } + min_rot = min(min_rot, pdiff_sq); + + return min_rot * p_scale; } - -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (MEANP) -//!BIND _INJ_P -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_MEANP +#else +#define patch_comparison_gather patch_comparison +#endif vec4 hook() { -return _INJ_P_texOff(0); -} + val total_weight = val(0); + val sum = val(0); + val result = val(0); -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (_INJ_I_SQ) -//!BIND _INJ_I -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_I_SQ + vec3 r = vec3(0); + vec3 p = vec3(0); + vec3 me = vec3(0); -vec4 hook() -{ -return _INJ_I_texOff(0) * _INJ_I_texOff(0); -} +#if T && ME == 1 // temporal & motion estimation + vec3 me_tmp = vec3(0); + float maxweight = 0; +#elif T && ME == 2 // temporal & motion estimation + vec3 me_sum = vec3(0); + float me_weight = 0; +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (_INJ_IXP) -//!BIND _INJ_I -//!BIND _INJ_P -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_IXP +#if AS + val total_weight_s = val(0); + val sum_s = val(0); +#endif -vec4 hook() -{ -return _INJ_I_texOff(0) * _INJ_P_texOff(0); -} +#if WD == 2 // weight discard (mean) + int r_index = 0; + val_packed all_weights[r_area]; + val_packed all_pixels[r_area]; +#elif WD == 1 // weight discard (moving cumulative average) + int r_iter = 1; + val wd_total_weight = val(0); + val wd_sum = val(0); +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!DESC 
Guided filter (CORRI) -//!BIND _INJ_I_SQ -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_CORRI + FOR_FRAME(r) { + // XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired) +#if T && ME == 1 // temporal & motion estimation max weight + if (r.z > 0) { + me += me_tmp * MEF; + me_tmp = vec3(0); + maxweight = 0; + } +#elif T && ME == 2 // temporal & motion estimation weighted average + if (r.z > 0) { + me += round(me_sum / me_weight * MEF); + me_sum = vec3(0); + me_weight = 0; + } +#endif + FOR_RESEARCH(r) { + // r coords with appropriate transformations applied + vec3 tr = vec3(r.xy + floor(r.xy * RSF), r.z); + float spatial_weight = spatial_r(tr); + tr.xy += me.xy; -vec4 hook() -{ -return _INJ_I_SQ_texOff(0); -} + val px = load(tr); -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (CORRP) -//!BIND _INJ_IXP -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_CORRP +#if SKIP_PATCH + val weight = val(1); +#else + val pdiff_sq = (r.z == 0) ? 
val(patch_comparison_gather(tr, vec3(0))) : patch_comparison(tr, vec3(0)); + val weight = range(pdiff_sq); +#endif -vec4 hook() -{ -return _INJ_IXP_texOff(0); -} +#if T && ME == 1 // temporal & motion estimation max weight + me_tmp = vec3(tr.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x)); + maxweight = max(maxweight, weight.x); +#elif T && ME == 2 // temporal & motion estimation weighted average + me_sum += vec3(tr.xy,0) * weight.x; + me_weight += weight.x; +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (A) -//!BIND _INJ_MEANI -//!BIND _INJ_MEANP -//!BIND _INJ_CORRI -//!BIND _INJ_CORRP -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_A +#if D1W + weight = val(weight.x); +#endif -#define E 0.0013 + weight *= spatial_weight; -vec4 hook() -{ -vec4 var = _INJ_CORRI_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANI_texOff(0); -vec4 cov = _INJ_CORRP_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANP_texOff(0); - return cov / (var + E); -} +#if AS + spatial_weight *= int(r.z == 0); // ignore temporal + sum_s += px * spatial_weight; + total_weight_s += spatial_weight; +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (B) -//!BIND _INJ_A -//!BIND _INJ_MEANI -//!BIND _INJ_MEANP -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_B +#if WD == 2 // weight discard (mean) + all_weights[r_index] = val_pack(weight); + all_pixels[r_index] = val_pack(px); + r_index++; +#elif WD == 1 // weight discard (moving cumulative average) + val wd_scale = val(1.0/r_iter); + val below_threshold = WDS * abs(min(val(0.0), weight - (total_weight * wd_scale * WDT * WD1TK(sqrt(wd_scale*WDP))))); +#if defined(LUMA_raw) + val wdkf = WDK(below_threshold); +#elif defined(CHROMA_raw) + val wdkf = vec2(WDK(below_threshold.x), WDK(below_threshold.y)); +#else + val wdkf = vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.y)); +#endif + wd_sum += px * weight * wdkf; + wd_total_weight += weight * wdkf; + r_iter++; +#endif -vec4 hook() 
-{ -return _INJ_MEANP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANI_texOff(0); -} + sum += px * weight; + total_weight += weight; + } // FOR_RESEARCH + } // FOR_FRAME -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (MEANA) -//!BIND _INJ_A -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_MEANA + val avg_weight = total_weight * r_scale; + val old_avg_weight = avg_weight; -vec4 hook() -{ -return _INJ_A_texOff(0); -} +#if WD == 2 // weight discard (mean) + total_weight = val(0); + sum = val(0); -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (MEANB) -//!BIND _INJ_B -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_MEANB + for (int i = 0; i < r_area; i++) { + val weight = val_unpack(all_weights[i]); + val px = val_unpack(all_pixels[i]); -vec4 hook() -{ -return _INJ_B_texOff(0); -} + val below_threshold = WDS * abs(min(val(0.0), weight - (avg_weight * WDT))); +#if defined(LUMA_raw) + weight *= WDK(below_threshold); +#elif defined(CHROMA_raw) + weight *= vec2(WDK(below_threshold.x), WDK(below_threshold.y)); +#else + weight *= vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter -//!BIND HOOKED -//!BIND _INJ_MEANA -//!BIND _INJ_MEANB -//!SAVE RF_LUMA + sum += px * weight; + total_weight += weight; + } +#elif WD == 1 // weight discard (moving cumulative average) + total_weight = wd_total_weight; + sum = wd_sum; +#endif +#if WD // weight discard + avg_weight = total_weight * r_scale; +#endif -vec4 hook() -{ -return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0); + total_weight += SW * spatial_r(vec3(0)); + sum += poi * SW * spatial_r(vec3(0)); + result = val(sum / total_weight); + + // store frames for temporal +#if T > 1 + +#endif +#if T && TRF + imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result)); +#elif T + imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2)); +#endif + +#if AS == 1 // sharpen+denoise +#define 
AS_base result +#elif AS == 2 // sharpen only +#define AS_base poi +#endif +#if AS + val usm = result - sum_s/total_weight_s; + usm = exp(log(abs(usm))*ASP) * sign(usm); // avoiding pow() since it's buggy on nvidia + usm *= gaussian(abs((AS_base + usm - 0.5) / 1.5) * ASA); + usm *= ASF; + result = AS_base + usm; +#endif + +#if EP // extremes preserve + float luminance = EP_texOff(0).x; + // EPSILON is needed since pow(0,0) is undefined + float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP)); + result = mix(poi, result, ep_weight); +#else + float ep_weight = 0; +#endif + +#if V == 1 + result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0); +#elif V == 2 + result = (poi - result) * 0.5 + 0.5; +#elif V == 3 // post-WD weight map + result = avg_weight; +#elif V == 4 // pre-WD edge map + result = old_avg_weight; +#elif V == 5 + result = 0.5 + usm; +#elif V == 6 + result = val(1 - ep_weight); +#endif + +// XXX visualize chroma for these +#if defined(CHROMA_raw) && (V == 3 || V == 4 || V == 6) + return vec4(0.5); +#endif + + return unval(mix(poi, result, BF)); } -// End of source code injected from guided.glsl +// End of source code injected from ../LQ/nlmeans.glsl //!HOOK LUMA //!HOOK CHROMA @@ -328,25 +1150,11 @@ vec4 hook() return RF_LUMA_texOff(0); } -//!HOOK LUMA -//!HOOK CHROMA -//!BIND LUMA -//!WIDTH LUMA.w 3 / -//!HEIGHT LUMA.h 3 / -//!DESC Non-local means (EP) -//!SAVE EP - -vec4 hook() -{ - return LUMA_texOff(0); -} - //!HOOK LUMA //!HOOK CHROMA //!BIND HOOKED //!BIND RF_LUMA //!BIND RF -//!BIND EP //!DESC Non-local means (nlmeans.glsl) // User variables @@ -356,49 +1164,37 @@ vec4 hook() // Denoising factor (level of blur, higher means more blur) #ifdef LUMA_raw -#define S 2.0 +#define S 2.0522687499802097 #else -#define S 5.0 +#define S 2.5168955531436197 #endif /* Adaptive sharpening * - * Uses the blur incurred by denoising to perform an unsharp mask, and uses the - * weight map to restrict the sharpening to 
edges. - * - * If you just want to increase/decrease sharpness then you want to change ASF. + * Performs an unsharp mask by subtracting the spatial kernel's blur from the + * NLM blur. For sharpen+denoise the sharpening is limited to edge areas and + * denoising is done everywhere else. * * Use V=4 to visualize which areas are sharpened (black means sharpen). * * AS: - * - 0 to disable - * - 1 to sharpen+denoise - * - 2 to sharpen only + * - 0: disable + * - 1: sharpen+denoise + * - 2: sharpen only * ASF: Higher numbers make a sharper image - * ASP: Higher numbers use more of the sharp image - * ASW: - * - 0 to use pre-WD weights - * - 1 to use post-WD weights (ASP should be ~2x to compensate) - * ASK: Weight kernel: - * - 0 for power. This is the old method. - * - 1 for sigmoid. This is generally recommended. - * - 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image) - * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map + * ASA: Anti-ringing, higher numbers increase strength + * ASP: Power, lower numbers increase sharpening on lower frequency detail */ #ifdef LUMA_raw #define AS 0 -#define ASF 3.0 -#define ASP 1.0 -#define ASW 0 -#define ASK 1 -#define ASC 0.0 +#define ASF 0.1625 +#define ASA 5.0 +#define ASP 0.5 #else #define AS 0 -#define ASF 3.0 -#define ASP 1.0 -#define ASW 0 -#define ASK 1 -#define ASC 0.0 +#define ASF 0.1625 +#define ASA 5.0 +#define ASP 0.5 #endif /* Starting weight @@ -409,52 +1205,57 @@ vec4 hook() * EPSILON should be used instead of zero to avoid divide-by-zero errors. */ #ifdef LUMA_raw -#define SW 1.0 +#define SW 1.3011446081346498 #else -#define SW 0.5 +#define SW 1.2219854377433914 #endif /* Weight discard * - * Discard weights that fall below a fraction of the average weight. This culls - * the most dissimilar samples from the blur, yielding a much more pleasant - * result, especially around edges. + * Reduces weights that fall below a fraction of the average weight. 
This culls + * the most dissimilar samples from the blur, which can yield a better result, + * especially around edges. * * WD: - * - 2: True average. Better quality, but slower and requires GLSL 4.0 or later - * - 1: Moving cumulative average. Inaccurate, tends to blur directionally. + * - 2: Mean. Better quality, but slower and requires GLSL 4.0 or later + * - 1: Moving cumulative average. Fast but inaccurate, blurs directionally. * - 0: Disable * * WDT: Threshold coefficient, higher numbers discard more * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes + * WDS (not for WDK=is_zero): Higher numbers are more eager to reduce weights */ #ifdef LUMA_raw #define WD 2 -#define WDT 0.5 -#define WDP 6.0 +#define WDT 0.11671341022864548 +#define WDP 5.381278367349288 +#define WDS 1.0 #else -#define WD 2 -#define WDT 0.75 -#define WDP 6.0 +#define WD 0 +#define WDT 0.002713346103131793 +#define WDP 5.832936323930807 +#define WDS 1.0 #endif /* Extremes preserve * - * Reduces denoising around very bright/dark areas. + * Reduce denoising in very bright/dark areas. + * + * Disabled by default now. If you want to reenable this, set EP=3/ in + * Makefile.nlm and rebuild. * * The downscaling factor of the EP shader stage affects what is considered a - * bright/dark area. The default of 3 should be fine, it's not recommended to - * change this. + * bright/dark area. * * This is incompatible with RGB. If you have RGB hooks enabled then you will * have to delete the EP shader stage or specify EP=0 through shader_cfg. 
* * EP: 1 to enable, 0 to disable - * DP: EP strength on dark patches, 0 to fully denoise - * BP: EP strength on bright patches, 0 to fully denoise + * DP: EP strength on dark areas, 0 to fully denoise + * BP: EP strength on bright areas, 0 to fully denoise */ #ifdef LUMA_raw -#define EP 1 +#define EP 0 #define BP 0.75 #define DP 0.25 #else @@ -469,12 +1270,26 @@ vec4 hook() /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ +/* textureGather applicable configurations: + * + * - PS={0,3,7,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2} + * - PS={0,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2} + * - PS=6:RI=0:RFI=0 + * - Currently the only scalable variant + * + * Options which always disable textureGather: + * - NG + * - SAMPLE + * - PD + * + * Running without textureGather may be much slower. + */ + /* Patch & research sizes * - * Patch size should be an odd number greater than or equal to 3. Higher values - * are slower and not always better. + * P should be an odd number. Higher values are slower and not always better. * - * Research size be an odd number greater than or equal to 3. Higher values are + * R should be an odd number greater than or equal to 3. Higher values are * generally better, but slower, blurrier, and gives diminishing returns. */ #ifdef LUMA_raw @@ -492,8 +1307,6 @@ vec4 hook() * * PS applies applies to patches, RS applies to research zones. 
* - * Be wary of gather optimizations (see the Regarding Speed comment at the top) - * * 0: square (symmetrical) * 1: horizontal line (asymmetric) * 2: vertical line (asymmetric) @@ -502,6 +1315,7 @@ vec4 hook() * 5: truncated triangle (asymmetric on two axis, last row halved) * 6: even sized square (asymmetric on two axis) * 7: plus (symmetrical) + * 8: plus X (symmetrical) */ #ifdef LUMA_raw #define RS 3 @@ -516,8 +1330,8 @@ vec4 hook() * This setting is dependent on code generation from shader_cfg, so this * setting can only be enabled via shader_cfg. * - * Compares the pixel-of-interest against a guide, which could be a downscaled - * image or the output of another shader + * Computes weights on a guide, which could be a downscaled image or the output + * of another shader, and applies the weights to the original image */ #define RF_LUMA 1 #define RF 1 @@ -531,11 +1345,14 @@ vec4 hook() * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc. * + * Consider setting SAMPLE=1 if setting RI to a setting that would require + * sampling between pixels. 
+ * * RI: Rotational invariance * RFI (0 to 2): Reflectional invariance */ #ifdef LUMA_raw -#define RI 3 +#define RI 0 #define RFI 2 #else #define RI 0 @@ -598,14 +1415,14 @@ vec4 hook() */ #ifdef LUMA_raw #define SST 1 -#define SS 0.25 +#define SS 0.5296176863733414 #define SD vec3(1,1,1) #define PST 0 #define PSS 0.0 #define PSD vec2(1,1) #else #define SST 1 -#define SS 0.25 +#define SS 0.26295970436981203 #define SD vec3(1,1,1) #define PST 0 #define PSS 0.0 @@ -617,6 +1434,8 @@ vec4 hook() * SK: spatial kernel * RK: range kernel (takes patch differences) * PSK: intra-patch spatial kernel + * WDK: weight discard kernel + * WD1TK (WD=1 only): weight discard tolerance kernel * * List of available kernels: * @@ -624,18 +1443,51 @@ vec4 hook() * cos * gaussian * lanczos - * quadratic + * quadratic_ (unclamped) * sinc + * sinc_ (unclamped) + * sinc3 * sphinx + * sphinx_ (unclamped) + * triangle_ (unclamped) + * triangle */ #ifdef LUMA_raw #define SK gaussian #define RK gaussian #define PSK gaussian +#define WDK is_zero +#define WD1TK gaussian #else #define SK gaussian #define RK gaussian #define PSK gaussian +#define WDK is_zero +#define WD1TK gaussian +#endif + +/* Sampling method + * + * In most cases this shouldn't make any difference, only set to bilinear if + * it's necessary to sample between pixels (e.g., RI=2). + * + * 0: nearest neighbor + * 1: bilinear + */ +#ifdef LUMA_raw +#define SAMPLE 0 +#else +#define SAMPLE 0 +#endif + +/* Research scaling factor + * + * Higher numbers sample more sparsely as the distance from the POI grows. 
+ */ +#ifdef LUMA_raw +#define RSF 0.0 +#else +#define RSF 0.0 #endif // Scaling factor (should match WIDTH/HEIGHT) @@ -645,13 +1497,22 @@ vec4 hook() #define SF 1 #endif +// Use the guide image as the input image +#ifdef LUMA_raw +#define GUIDE_INPUT 0 +#else +#define GUIDE_INPUT 0 +#endif + /* Visualization * * 0: off * 1: absolute difference between input/output to the power of 0.25 * 2: difference between input/output centered on 0.5 - * 3: avg_weight - * 4: edge map (based on the relevant AS settings) + * 3: post-WD weight map + * 4: pre-WD weight map + * 5: unsharp mask + * 6: EP */ #ifdef LUMA_raw #define V 0 @@ -696,37 +1557,44 @@ vec4 hook() // Shader code -#define EPSILON 0.00000000001 +#define EPSILON 1.2e-38 #define M_PI 3.14159265358979323846 #define POW2(x) ((x)*(x)) #define POW3(x) ((x)*(x)*(x)) -#define bicubic(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0)))) +#define bicubic_(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0)))) +#define bicubic(x) bicubic_(clamp((x), 0.0, 2.0)) #define gaussian(x) exp(-1 * POW2(x)) -#define lanczos(x) POW2(sinc(x)) -#define quadratic(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5)) -#define sinc(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI)) -#define sphinx(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI)) +#define quadratic_(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5)) +#define quadratic(x) quadratic_(clamp((x), 0.0, 1.5)) +#define sinc_(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI)) +#define sinc(x) sinc_(clamp((x), 0.0, 1.0)) +#define sinc3(x) sinc_(clamp((x), 0.0, 3.0)) +#define lanczos(x) (sinc3(x) * sinc(x)) +#define sphinx_(x) ((x) < 1e-8 ? 
1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI)) +#define sphinx(x) sphinx_(clamp((x), 0.0, 1.4302966531242027)) +#define triangle_(x) (1 - (x)) +#define triangle(x) triangle_(clamp((x), 0.0, 1.0)) +#define is_zero(x) int(x == 0) // XXX could maybe be better optimized on LGC -// XXX return original alpha component instead of 1.0 #if defined(LUMA_raw) #define val float #define val_swizz(v) (v.x) -#define unval(v) vec4(v.x, 0, 0, 1.0) +#define unval(v) vec4(v.x, 0, 0, poi_.a) #define val_packed val #define val_pack(v) (v) #define val_unpack(v) (v) #elif defined(CHROMA_raw) #define val vec2 #define val_swizz(v) (v.xy) -#define unval(v) vec4(v.x, v.y, 0, 1.0) +#define unval(v) vec4(v.x, v.y, 0, poi_.a) #define val_packed uint #define val_pack(v) packUnorm2x16(v) #define val_unpack(v) unpackUnorm2x16(v) #else #define val vec3 #define val_swizz(v) (v.xyz) -#define unval(v) vec4(v.x, v.y, v.z, 1.0) +#define unval(v) vec4(v.x, v.y, v.z, poi_.a) #define val_packed val #define val_pack(v) (v) #define val_unpack(v) (v) @@ -744,10 +1612,6 @@ const int hr = R/2; const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes #endif -// donut increment, increments without landing on (0,0,0) -// much faster than a continue statement -#define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0)))) - // patch/research shapes // each shape is depicted in a comment, where Z=5 (Z corresponds to P or R) // dots (.) represent samples (pixels) and X represents the pixel-of-interest @@ -788,7 +1652,7 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res // // Z ..X.. 
// -#define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++) +#define S_HORIZONTAL(z,hz,incr) for (z.y = 0; z.y <= 0; z.y++) for (z.x = -hz; z.x <= hz; incr) // 90 degree rotation of S_HORIZONTAL #define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr) @@ -801,19 +1665,13 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res #define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr) #define S_PLUS_A(hz,Z) (Z*2 - 1) -// XXX implement S_PLUS w/ an X overlayed: // 3 . . . // 3 ... // Z ..X.. // 3 ... // 3 . . . - -// XXX implement an X shape: -// 2 . . -// 2 . . -// 1 X -// 2 . . -// 2 . . +#define S_PLUS_X(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(z.x) + -hz * int(z.x == 0); z.y <= abs(z.x) + hz * int(z.x == 0); incr) +#define S_PLUS_X_A(hz,Z) (Z*4 - 3) // 1x1 square #define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++) @@ -827,43 +1685,43 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res #define RF_ RF #endif -// Skip comparing the pixel-of-interest against itself, unless RF is enabled -#if RF_ -#define RINCR(z,c) (z.c++) -#else -#define RINCR DINCR -#endif +// donut increment, increments without landing on (0,0,0) +// much faster than a continue statement +#define DINCR(z,c,a) ((z.c += a),(z.c += int(z == vec3(0)))) -#define R_AREA(a) (a * T1 + RF_-1) +#define R_AREA(a) (a * T1 - 1) // research shapes // XXX would be nice to have the option of temporally-varying research sizes #if R == 0 || R == 1 #define FOR_RESEARCH(r) S_1X1(r) const int r_area = R_AREA(1); +#elif RS == 8 +#define FOR_RESEARCH(r) S_PLUS_X(r,hr,DINCR(r,y,max(1,abs(r.x)))) +const int r_area = R_AREA(S_PLUS_X_A(hr,R)); #elif RS == 7 -#define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y)) +#define FOR_RESEARCH(r) S_PLUS(r,hr,DINCR(r,y,1)) const int r_area = R_AREA(S_PLUS_A(hr,R)); #elif 
RS == 6 -#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y)) +#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,DINCR(r,y,1)) const int r_area = R_AREA(R*R); #elif RS == 5 -#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x)) +#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,DINCR(r,x,1)) const int r_area = R_AREA(S_TRIANGLE_A(hr,hr)); #elif RS == 4 -#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x)) +#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,DINCR(r,x,1)) const int r_area = R_AREA(S_TRIANGLE_A(hr,R)); #elif RS == 3 -#define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y)) +#define FOR_RESEARCH(r) S_DIAMOND(r,hr,DINCR(r,y,1)) const int r_area = R_AREA(S_DIAMOND_A(hr,R)); #elif RS == 2 -#define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y)) +#define FOR_RESEARCH(r) S_VERTICAL(r,hr,DINCR(r,y,1)) const int r_area = R_AREA(R); #elif RS == 1 -#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x)) +#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,DINCR(r,x,1)) const int r_area = R_AREA(R); #elif RS == 0 -#define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y)) +#define FOR_RESEARCH(r) S_SQUARE(r,hr,DINCR(r,y,1)) const int r_area = R_AREA(R*R); #endif @@ -885,7 +1743,7 @@ const int r_area = R_AREA(R*R); #if PD #define PINCR DINCR #else -#define PINCR(z,c) (z.c++) +#define PINCR(z,c,a) (z.c += a) #endif #define P_AREA(a) (a - PD) @@ -894,36 +1752,44 @@ const int r_area = R_AREA(R*R); #if P == 0 || P == 1 #define FOR_PATCH(p) S_1X1(p) const int p_area = P_AREA(1); +#elif PS == 8 +#define FOR_PATCH(p) S_PLUS_X(p,hp,PINCR(p,y,max(1,abs(p.x)))) +const int p_area = P_AREA(S_PLUS_X_A(hp,P)); #elif PS == 7 -#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y)) +#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y,1)) const int p_area = P_AREA(S_PLUS_A(hp,P)); #elif PS == 6 -#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y)) +#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y,1)) const int p_area = P_AREA(P*P); #elif PS == 5 -#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x)) +#define FOR_PATCH(p) 
S_TRUNC_TRIANGLE(p,hp,PINCR(p,x,1)) const int p_area = P_AREA(S_TRIANGLE_A(hp,hp)); #elif PS == 4 -#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x)) +#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x,1)) const int p_area = P_AREA(S_TRIANGLE_A(hp,P)); #elif PS == 3 -#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y)) +#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y,1)) const int p_area = P_AREA(S_DIAMOND_A(hp,P)); #elif PS == 2 -#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y)) +#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y,1)) const int p_area = P_AREA(P); #elif PS == 1 -#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x)) +#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x,1)) const int p_area = P_AREA(P); #elif PS == 0 -#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y)) +#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y,1)) const int p_area = P_AREA(P*P); #endif const float r_scale = 1.0/r_area; const float p_scale = 1.0/p_area; -#define sample(tex, pos, size, pt, off) tex(pos + pt * (vec2(off) + 0.5 - fract(pos*size))) +#if SAMPLE == 0 +#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * (vec2(off) + 0.5 - fract((pos) * (size)))) +#else +#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * vec2(off)) +#endif + #define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off) #if RF_ && defined(LUMA_raw) @@ -959,8 +1825,13 @@ val load2(vec3 off) #define load2(off) val_swizz(load2_(off)) #endif -val poi = load(vec3(0)); // pixel-of-interest val poi2 = load2(vec3(0)); // guide pixel-of-interest +#if GUIDE_INPUT +#define poi poi2 +#else +vec4 poi_ = load_(vec3(0)); +val poi = val_swizz(poi_); // pixel-of-interest +#endif #if RI // rotation vec2 rot(vec2 p, float d) @@ -1005,7 +1876,7 @@ float spatial_r(vec3 v) val range(val pdiff_sq) { - const float h = S*0.013; + const float h = max(S, 0.0) * 0.013; const float pdiff_scale = 1.0/(h*h); pdiff_sq = sqrt(pdiff_sq * pdiff_scale); #if defined(LUMA_raw) @@ -1015,10 +1886,6 @@ val range(val pdiff_sq) #else return 
vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z)); #endif - //return exp(-pdiff_sq * pdiff_scale); - - // weight function from the NLM paper, it's not very good - //return exp(-max(pdiff_sq - 2*S*S, 0.0) * pdiff_scale); } val patch_comparison(vec3 r, vec3 r2) @@ -1041,42 +1908,104 @@ val patch_comparison(vec3 r, vec3 r2) return min_rot * p_scale; } -#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false -#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3) +#define NO_GATHER (PD == 0 && NG == 0 && SAMPLE == 0) // never textureGather if any of these conditions are false +#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3 || RI == 7) -#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER +#if (defined(LUMA_gather) || D1W) && ((PS == 0 || ((PS == 3 || PS == 7) && RI != 7) || PS == 8) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER // 3x3 diamond/plus patch_comparison_gather // XXX extend to support arbitrary sizes (probably requires code generation) -// XXX extend to support 3x3 square // XXX support PSS -const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) }; -const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF }; -vec4 poi_patch = gather_offs(0, offsets); +const ivec2 offsets_adj[4] = { ivec2(0,-1), ivec2(1,0), ivec2(0,1), ivec2(-1,0) }; +const ivec2 offsets_adj_sf[4] = { ivec2(0,-1) * SF, ivec2(1,0) * SF, ivec2(0,1) * SF, ivec2(-1,0) * SF }; +vec4 poi_patch_adj = gather_offs(0, offsets_adj); +#if PS == 0 || PS == 8 +const ivec2 offsets_diag[4] = { ivec2(-1,-1), ivec2(1,-1), ivec2(1,1), ivec2(-1,1) }; +const ivec2 offsets_diag_sf[4] = { ivec2(-1,-1) * SF, ivec2(1,-1) * SF, ivec2(1,1) * SF, ivec2(-1,1) * SF }; +vec4 poi_patch_diag = gather_offs(0, offsets_diag); +#endif float patch_comparison_gather(vec3 r, vec3 r2) { float min_rot = p_area - 1; - vec4 transformer = 
gather_offs(r, offsets_sf); + vec4 transformer_adj = gather_offs(r, offsets_adj_sf); +#if PS == 0 || PS == 8 + vec4 transformer_diag = gather_offs(r, offsets_diag_sf); +#endif FOR_ROTATION { FOR_REFLECTION { - float diff_sq = dot((poi_patch - transformer) * (poi_patch - transformer), vec4(1)); +#if RFI + /* xxy + * w y + * wzz + */ + switch(rfi) { + case 1: + transformer_adj = transformer_adj.zyxw; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.wzyx; // vertical flip of the corners: TL/BL and TR/BR trade places +#endif + break; + case 2: + transformer_adj = transformer_adj.xwzy; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.yxwz; // horizontal flip of the corners: TL/TR and BL/BR trade places +#endif + break; + } +#endif + + vec4 diff = (poi_patch_adj - transformer_adj) * (poi_patch_adj - transformer_adj); // squared per-pixel differences (adjacent samples) +#if PS == 0 || PS == 8 + diff += (poi_patch_diag - transformer_diag) * (poi_patch_diag - transformer_diag); // square before accumulating so opposite-signed errors cannot cancel +#endif + float diff_sq = dot(diff, vec4(1)); min_rot = min(diff_sq, min_rot); + +// un-reflect #if RFI switch(rfi) { - case 0: transformer = transformer.zyxw; break; - case 1: transformer = transformer.zwxy; break; // undoes last mirror, performs another mirror - case 2: transformer = transformer.zyxw; break; // undoes last mirror + case 1: + transformer_adj = transformer_adj.zyxw; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.wzyx; // same involution as above restores the original corner order +#endif + break; + case 2: + transformer_adj = transformer_adj.xwzy; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.yxwz; // same involution as above restores the original corner order +#endif + break; } #endif - } -#if RI == 3 - transformer = transformer.wxyz; + } // FOR_REFLECTION +#if RI == 7 + transformer_adj = transformer_adj.wxyz; + // swap adjacents for diagonals + transformer_adj += transformer_diag; + transformer_diag = transformer_adj - transformer_diag; + transformer_adj -= transformer_diag; +#elif RI == 3 + transformer_adj = transformer_adj.wxyz; #elif RI == 1 - transformer = transformer.zwxy; + transformer_adj = transformer_adj.zwxy; #endif - } - float center_diff_sq = poi2.x - load2(r).x; - center_diff_sq *= center_diff_sq; - return (min_rot + center_diff_sq) * p_scale; +#if RI == 3 && (PS == 0 || PS == 8) 
+ transformer_diag = transformer_diag.wxyz; +#elif RI == 1 && (PS == 0 || PS == 8) + transformer_diag = transformer_diag.zwxy; +#endif + } // FOR_ROTATION + float center_diff = poi2.x - load2(r).x; + return (center_diff * center_diff + min_rot) * p_scale; +} +#elif (defined(LUMA_gather) || D1W) && PS == 4 && P == 3 && RI == 0 && RFI == 0 && NO_GATHER +const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,0), ivec2(1,0) }; +const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,0) * SF, ivec2(1,0) * SF }; +vec4 poi_patch = gather_offs(0, offsets); +float patch_comparison_gather(vec3 r, vec3 r2) +{ + vec4 pdiff = poi_patch - gather_offs(r, offsets_sf); + return dot(pdiff * pdiff, vec4(1)) * p_scale; } #elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER // tiled even square patch_comparison_gather @@ -1124,18 +2053,23 @@ vec4 hook() float me_weight = 0; #endif -#if WD == 2 // weight discard +#if AS + val total_weight_s = val(0); + val sum_s = val(0); +#endif + +#if WD == 2 // weight discard (mean) int r_index = 0; val_packed all_weights[r_area]; val_packed all_pixels[r_area]; -#elif WD == 1 // weight discard - val no_weights = val(0); - val discard_total_weight = val(0); - val discard_sum = val(0); +#elif WD == 1 // weight discard (moving cumulative average) + int r_iter = 1; + val wd_total_weight = val(0); + val wd_sum = val(0); #endif FOR_FRAME(r) { - // XXX ME is always a frame behind, should have to option to re-research after applying ME (could do it an arbitrary number of times per frame if desired) + // XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired) #if T && ME == 1 // temporal & motion estimation max weight if (r.z > 0) { me += me_tmp * MEF; @@ -1149,19 +2083,26 @@ vec4 hook() me_weight = 0; } #endif - FOR_RESEARCH(r) { // main NLM logic + FOR_RESEARCH(r) { + // r coords with appropriate 
transformations applied + vec3 tr = vec3(r.xy + floor(r.xy * RSF), r.z); + float spatial_weight = spatial_r(tr); + tr.xy += me.xy; + + val px = load(tr); + #if SKIP_PATCH val weight = val(1); #else - val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(r+me, vec3(0))) : patch_comparison(r+me, vec3(0)); + val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(tr, vec3(0))) : patch_comparison(tr, vec3(0)); val weight = range(pdiff_sq); #endif #if T && ME == 1 // temporal & motion estimation max weight - me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x)); + me_tmp = vec3(tr.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x)); maxweight = max(maxweight, weight.x); #elif T && ME == 2 // temporal & motion estimation weighted average - me_sum += vec3(r.xy,0) * weight.x; + me_sum += vec3(tr.xy,0) * weight.x; me_weight += weight.x; #endif @@ -1169,21 +2110,34 @@ vec4 hook() weight = val(weight.x); #endif - weight *= spatial_r(r); + weight *= spatial_weight; -#if WD == 2 // weight discard +#if AS + spatial_weight *= int(r.z == 0); // ignore temporal + sum_s += px * spatial_weight; + total_weight_s += spatial_weight; +#endif + +#if WD == 2 // weight discard (mean) all_weights[r_index] = val_pack(weight); - all_pixels[r_index] = val_pack(load(r+me)); + all_pixels[r_index] = val_pack(px); r_index++; -#elif WD == 1 // weight discard - val wd_scale = 1.0/max(no_weights, 1); - val keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight); - discard_sum += load(r+me) * weight * (1 - keeps); - discard_total_weight += weight * (1 - keeps); - no_weights += keeps; +#elif WD == 1 // weight discard (moving cumulative average) + val wd_scale = val(1.0/r_iter); + val below_threshold = WDS * abs(min(val(0.0), weight - (total_weight * wd_scale * WDT * WD1TK(sqrt(wd_scale*WDP))))); +#if defined(LUMA_raw) + val wdkf = WDK(below_threshold); +#elif defined(CHROMA_raw) + val wdkf = vec2(WDK(below_threshold.x), 
WDK(below_threshold.y)); +#else + val wdkf = vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.y)); +#endif + wd_sum += px * weight * wdkf; + wd_total_weight += weight * wdkf; + r_iter++; #endif - sum += load(r+me) * weight; + sum += px * weight; total_weight += weight; } // FOR_RESEARCH } // FOR_FRAME @@ -1191,37 +2145,37 @@ vec4 hook() val avg_weight = total_weight * r_scale; val old_avg_weight = avg_weight; -#if WD == 2 // true average +#if WD == 2 // weight discard (mean) total_weight = val(0); sum = val(0); - val no_weights = val(0); for (int i = 0; i < r_area; i++) { - val w = val_unpack(all_weights[i]); + val weight = val_unpack(all_weights[i]); val px = val_unpack(all_pixels[i]); - val keeps = step(avg_weight*WDT, w); - w *= keeps; - sum += px * w; - total_weight += w; - no_weights += keeps; + val below_threshold = WDS * abs(min(val(0.0), weight - (avg_weight * WDT))); +#if defined(LUMA_raw) + weight *= WDK(below_threshold); +#elif defined(CHROMA_raw) + weight *= vec2(WDK(below_threshold.x), WDK(below_threshold.y)); +#else + weight *= vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); +#endif + + sum += px * weight; + total_weight += weight; } -#elif WD == 1 // moving cumulative average - total_weight -= discard_total_weight; - sum -= discard_sum; +#elif WD == 1 // weight discard (moving cumulative average) + total_weight = wd_total_weight; + sum = wd_sum; #endif #if WD // weight discard - avg_weight = total_weight / no_weights; + avg_weight = total_weight * r_scale; #endif total_weight += SW * spatial_r(vec3(0)); sum += poi * SW * spatial_r(vec3(0)); - -#if V == 3 // weight map - result = val(avg_weight); -#else // mean result = val(sum / total_weight); -#endif // store frames for temporal #if T > 1 @@ -1233,27 +2187,17 @@ vec4 hook() imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2)); #endif -#if ASW == 0 // pre-WD weights -#define AS_weight old_avg_weight -#elif ASW == 1 // post-WD weights 
-#define AS_weight avg_weight -#endif - -#if ASK == 0 - val sharpening_strength = pow(AS_weight, val(ASP)); -#elif ASK == 1 - val sharpening_strength = mix( - pow(smoothstep(0.0, 1.0, AS_weight), val(ASP)), - AS_weight, ASC); - // XXX normalize the result to account for a negative ASC? -#elif ASK == 2 - val sharpening_strength = val(ASP); -#endif - #if AS == 1 // sharpen+denoise - val sharpened = result + (poi - result) * ASF; +#define AS_base result #elif AS == 2 // sharpen only - val sharpened = poi + (poi - result) * ASF; +#define AS_base poi +#endif +#if AS + val usm = result - sum_s/total_weight_s; + usm = exp(log(abs(usm))*ASP) * sign(usm); // avoiding pow() since it's buggy on nvidia + usm *= gaussian(abs((AS_base + usm - 0.5) / 1.5) * ASA); + usm *= ASF; + result = AS_base + usm; #endif #if EP // extremes preserve @@ -1261,26 +2205,27 @@ vec4 hook() // EPSILON is needed since pow(0,0) is undefined float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? 
DP : BP)); result = mix(poi, result, ep_weight); -#endif - -#if AS == 1 // sharpen+denoise - result = mix(sharpened, result, sharpening_strength); -#elif AS == 2 // sharpen only - result = mix(sharpened, poi, sharpening_strength); -#endif - -#if V == 4 // edge map - result = sharpening_strength; -#endif - -#if (V == 3 || V == 4) && defined(CHROMA_raw) // drop chroma for these visualizations - return vec4(0.5); +#else + float ep_weight = 0; #endif #if V == 1 result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0); #elif V == 2 result = (poi - result) * 0.5 + 0.5; +#elif V == 3 // post-WD weight map + result = avg_weight; +#elif V == 4 // pre-WD edge map + result = old_avg_weight; +#elif V == 5 + result = 0.5 + usm; +#elif V == 6 + result = val(1 - ep_weight); +#endif + +// XXX visualize chroma for these +#if defined(CHROMA_raw) && (V == 3 || V == 4 || V == 6) + return vec4(0.5); #endif return unval(mix(poi, result, BF)); diff --git a/portable_config/shaders/nlmeans_2x.glsl b/portable_config/shaders/nlmeans_2x.glsl deleted file mode 100644 index 737f245e..00000000 --- a/portable_config/shaders/nlmeans_2x.glsl +++ /dev/null @@ -1,1247 +0,0 @@ -/* vi: ft=c - * - * Based on vf_nlmeans.c from FFmpeg. - * - * Copyright (c) 2022 an3223 - * Copyright (c) 2016 Clément Bœsch - * - * This program is free software: you can redistribute it and/or modify it - * under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 2.1 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License - * for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program. If not, see . 
- */ - -// Description: nlmeans_2x.glsl: Experimental upscaler - -/* The recommended usage of this shader and its variant profiles is to add them - * to input.conf and then dispatch the appropriate shader via a keybind during - * media playback. Here is an example input.conf entry: - * - * F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl"; show-text "Non-local means (LUMA only)" - * - * These shaders can also be enabled by default in mpv.conf, for example: - * - * glsl-shaders='~~/shaders/nlmeans.glsl' - * - * Both of the examples above assume the shaders are located in a subdirectory - * named "shaders" within mpv's config directory. Refer to the mpv - * documentation for more details. - * - * This shader is highly configurable via user variables below. Although the - * default settings should offer good quality at a reasonable speed, you are - * encouraged to tweak them to your preferences. Be mindful that certain - * settings may greatly affect speed. - * - * Denoising is most useful for noisy content. If there is no perceptible - * noise, you probably won't see a positive difference. - * - * The default settings are generally tuned for low noise and high detail - * preservation. The "medium" and "heavy" profiles are tuned for higher levels - * of noise. - * - * The denoiser will not work properly if the content has been upscaled - * beforehand (whether it was done by you or not). In such cases, consider - * issuing a command to downscale in the mpv console (backtick ` key): - * - * vf toggle scale=-2:720 - * - * ...replacing 720 with whatever resolution seems appropriate. Rerun the - * command to undo the downscale. It may take some trial-and-error to find the - * proper resolution. - */ - -/* Regarding speed - * - * Speed may vary wildly for different vo and gpu-api settings. Generally - * vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this - * may be different for your system. 
- * - * If your GPU doesn't support textureGather, or if you are on a version of mpv - * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ profile - * - * If you plan on tinkering with NLM's settings, read below: - * - * textureGather only applies to luma and limited to the these configurations: - * - * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2} - * - Default, very fast, rotations and reflections should be free - * - If this is unusually slow then try changing gpu-api and vo - * - If it's still slow, try setting RI/RFI to 0. - * - * - PS=6:RI={0,1,3}:RFI={0,1,2} - * - Currently the only scalable variant - * - Patch shape is asymmetric on two axis - * - Rotations should have very little speed impact - * - Reflections may have a significant speed impact - * - * Options which always disable textureGather: - * - PD - * - NG - */ - -// The following is shader code injected from guided.glsl -/* vi: ft=c - * - * Copyright (c) 2022 an3223 - * - * This program is free software: you can redistribute it and/or modify it - * under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 2.1 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License - * for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program. If not, see . - */ - -// Description: guided.glsl: Guided by the downscaled image - -/* The radius can be adjusted with the MEANI stage's downscaling factor. - * Higher numbers give a bigger radius. - * - * The E variable can be found in the A stage. - * - * The subsampling (fast guided filter) can be adjusted with the I stage's - * downscaling factor. Higher numbers are faster. 
- * - * The guide's subsampling can be adjusted with the PREI stage's downscaling - * factor. Higher numbers downscale more. - */ - -//!HOOK LUMA -//!BIND HOOKED -//!WIDTH HOOKED.w 1.25 / -//!HEIGHT HOOKED.h 1.25 / -//!DESC Guided filter (PREI) -//!SAVE _INJ_PREI - -vec4 hook() -{ - return HOOKED_texOff(0); -} - -//!HOOK LUMA -//!BIND _INJ_PREI -//!WIDTH HOOKED.w -//!HEIGHT HOOKED.h -//!DESC Guided filter (I) -//!SAVE _INJ_I - -vec4 hook() -{ -return _INJ_PREI_texOff(0); -} - - -//!HOOK LUMA -//!DESC Guided filter (P) -//!BIND HOOKED -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_P - -vec4 hook() -{ - return HOOKED_texOff(0); -} - -//!HOOK LUMA -//!DESC Guided filter (MEANI) -//!BIND _INJ_I -//!WIDTH _INJ_I.w 1.5 / -//!HEIGHT _INJ_I.h 1.5 / -//!SAVE _INJ_MEANI - -vec4 hook() -{ -return _INJ_I_texOff(0); -} - -//!HOOK LUMA -//!DESC Guided filter (MEANP) -//!BIND _INJ_P -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_MEANP - -vec4 hook() -{ -return _INJ_P_texOff(0); -} - -//!HOOK LUMA -//!DESC Guided filter (_INJ_I_SQ) -//!BIND _INJ_I -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_I_SQ - -vec4 hook() -{ -return _INJ_I_texOff(0) * _INJ_I_texOff(0); -} - -//!HOOK LUMA -//!DESC Guided filter (_INJ_IXP) -//!BIND _INJ_I -//!BIND _INJ_P -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_IXP - -vec4 hook() -{ -return _INJ_I_texOff(0) * _INJ_P_texOff(0); -} - -//!HOOK LUMA -//!DESC Guided filter (CORRI) -//!BIND _INJ_I_SQ -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_CORRI - -vec4 hook() -{ -return _INJ_I_SQ_texOff(0); -} - -//!HOOK LUMA -//!DESC Guided filter (CORRP) -//!BIND _INJ_IXP -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_CORRP - -vec4 hook() -{ -return _INJ_IXP_texOff(0); -} - -//!HOOK LUMA -//!DESC Guided filter (A) -//!BIND _INJ_MEANI -//!BIND _INJ_MEANP -//!BIND _INJ_CORRI -//!BIND _INJ_CORRP -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_A - -#define E 0.0013 - -vec4 hook() -{ -vec4 var = 
_INJ_CORRI_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANI_texOff(0); -vec4 cov = _INJ_CORRP_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANP_texOff(0); - return cov / (var + E); -} - -//!HOOK LUMA -//!DESC Guided filter (B) -//!BIND _INJ_A -//!BIND _INJ_MEANI -//!BIND _INJ_MEANP -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_B - -vec4 hook() -{ -return _INJ_MEANP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANI_texOff(0); -} - -//!HOOK LUMA -//!DESC Guided filter (MEANA) -//!BIND _INJ_A -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_MEANA - -vec4 hook() -{ -return _INJ_A_texOff(0); -} - -//!HOOK LUMA -//!DESC Guided filter (MEANB) -//!BIND _INJ_B -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_MEANB - -vec4 hook() -{ -return _INJ_B_texOff(0); -} - -//!HOOK LUMA -//!DESC Guided filter -//!BIND HOOKED -//!BIND _INJ_MEANA -//!BIND _INJ_MEANB -//!SAVE RF_LUMA - -vec4 hook() -{ -return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0); -} - -// End of source code injected from guided.glsl - -//!HOOK LUMA -//!BIND HOOKED -//!BIND RF_LUMA -//!DESC Non-local means (nlmeans_2x.glsl) -//!WIDTH HOOKED.w 2 * -//!HEIGHT HOOKED.h 2 * - -// User variables - -// It is generally preferable to denoise luma and chroma differently, so the -// user variables for luma and chroma are split. - -// Denoising factor (level of blur, higher means more blur) -#ifdef LUMA_raw -#define S 12.8125 -#else -#define S 12.8125 -#endif - -/* Adaptive sharpening - * - * Uses the blur incurred by denoising to perform an unsharp mask, and uses the - * weight map to restrict the sharpening to edges. - * - * If you just want to increase/decrease sharpness then you want to change ASF. - * - * Use V=4 to visualize which areas are sharpened (black means sharpen). 
- * - * AS: - * - 0 to disable - * - 1 to sharpen+denoise - * - 2 to sharpen only - * ASF: Higher numbers make a sharper image - * ASP: Higher numbers use more of the sharp image - * ASW: - * - 0 to use pre-WD weights - * - 1 to use post-WD weights (ASP should be ~2x to compensate) - * ASK: Weight kernel: - * - 0 for power. This is the old method. - * - 1 for sigmoid. This is generally recommended. - * - 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image) - * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map - */ -#ifdef LUMA_raw -#define AS 0 -#define ASF 3.0 -#define ASP 1.0 -#define ASW 0 -#define ASK 1 -#define ASC 0.0 -#else -#define AS 0 -#define ASF 3.0 -#define ASP 1.0 -#define ASW 0 -#define ASK 1 -#define ASC 0.0 -#endif - -/* Starting weight - * - * Also known as the center weight. This represents the weight of the - * pixel-of-interest. Lower numbers may help handle heavy noise & ringing. - * - * EPSILON should be used instead of zero to avoid divide-by-zero errors. - */ -#ifdef LUMA_raw -#define SW 0.14876 -#else -#define SW 0.14876 -#endif - -/* Weight discard - * - * Discard weights that fall below a fraction of the average weight. This culls - * the most dissimilar samples from the blur, yielding a much more pleasant - * result, especially around edges. - * - * WD: - * - 2: True average. Better quality, but slower and requires GLSL 4.0 or later - * - 1: Moving cumulative average. Inaccurate, tends to blur directionally. - * - 0: Disable - * - * WDT: Threshold coefficient, higher numbers discard more - * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes - */ -#ifdef LUMA_raw -#define WD 2 -#define WDT 0.63888239592 -#define WDP 6.0 -#else -#define WD 2 -#define WDT 0.63888239592 -#define WDP 6.0 -#endif - -/* Extremes preserve - * - * Reduces denoising around very bright/dark areas. - * - * The downscaling factor of the EP shader stage affects what is considered a - * bright/dark area. 
The default of 3 should be fine, it's not recommended to - * change this. - * - * This is incompatible with RGB. If you have RGB hooks enabled then you will - * have to delete the EP shader stage or specify EP=0 through shader_cfg. - * - * EP: 1 to enable, 0 to disable - * DP: EP strength on dark patches, 0 to fully denoise - * BP: EP strength on bright patches, 0 to fully denoise - */ -#ifdef LUMA_raw -#define EP 0 -#define BP 0.75 -#define DP 0.25 -#else -#define EP 0 -#define BP 0.0 -#define DP 0.0 -#endif - -/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ -/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ -/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ -/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ -/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ - -/* Patch & research sizes - * - * Patch size should be an odd number greater than or equal to 3. Higher values - * are slower and not always better. - * - * Research size be an odd number greater than or equal to 3. Higher values are - * generally better, but slower, blurrier, and gives diminishing returns. - */ -#ifdef LUMA_raw -#define P 3 -#define R 5 -#else -#define P 3 -#define R 5 -#endif - -/* Patch and research shapes - * - * Different shapes have different speed and quality characteristics. Every - * shape (besides square) is smaller than square. - * - * PS applies applies to patches, RS applies to research zones. 
- * - * Be wary of gather optimizations (see the Regarding Speed comment at the top) - * - * 0: square (symmetrical) - * 1: horizontal line (asymmetric) - * 2: vertical line (asymmetric) - * 3: diamond (symmetrical) - * 4: triangle (asymmetric, pointing upward) - * 5: truncated triangle (asymmetric on two axis, last row halved) - * 6: even sized square (asymmetric on two axis) - * 7: plus (symmetrical) - */ -#ifdef LUMA_raw -#define RS 3 -#define PS 3 -#else -#define RS 3 -#define PS 3 -#endif - -/* Robust filtering - * - * This setting is dependent on code generation from shader_cfg, so this - * setting can only be enabled via shader_cfg. - * - * Compares the pixel-of-interest against a guide, which could be a downscaled - * image or the output of another shader - */ -#define RF_LUMA 1 -#define RF 0 - -/* Rotational/reflectional invariance - * - * Number of rotations/reflections to try for each patch comparison. Can be - * slow, but improves feature preservation. More rotations/reflections gives - * diminishing returns. The most similar rotation/reflection will be used. - * - * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a - * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc. - * - * RI: Rotational invariance - * RFI (0 to 2): Reflectional invariance - */ -#ifdef LUMA_raw -#define RI 3 -#define RFI 2 -#else -#define RI 0 -#define RFI 0 -#endif - -/* Temporal denoising - * - * This setting is dependent on code generation from shader_cfg, so this - * setting can only be enabled via shader_cfg. - * - * Caveats: - * - Slower: - * - Each frame needs to be researched (more samples & more math) - * - Gather optimizations only apply to the current frame - * - Requires vo=gpu-next - * - Luma-only (this is a bug) - * - Buggy - * - * May cause motion blur and may struggle more with noise that persists across - * multiple frames (e.g., from compression or duplicate frames), but can work - * very well on high quality video. 
- * - * Motion estimation (ME) should improve quality without impacting speed. - * - * T: number of frames used - * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg - * MEF: estimate factor, compensates for ME being one frame behind - * TRF: compare against the denoised frames - */ -#ifdef LUMA_raw -#define T 0 -#define ME 1 -#define MEF 2 -#define TRF 0 -#else -#define T 0 -#define ME 0 -#define MEF 2 -#define TRF 0 -#endif - -/* Spatial kernel - * - * Increasing the spatial denoising factor (SS) reduces the weight of further - * pixels. - * - * Spatial distortion instructs the spatial kernel to view that axis as - * closer/further, for instance SD=(1,1,0.5) would make the temporal axis - * appear closer and increase blur between frames. - * - * The intra-patch variants are supposed to help with larger patch sizes. - * - * SST: enables spatial kernel if R>=PST, 0 fully disables - * SS: spatial sigma - * SD: spatial distortion (X, Y, time) - * PSS: intra-patch spatial sigma - * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables - * PSD: intra-patch spatial distortion (X, Y) - */ -#ifdef LUMA_raw -#define SST 1 -#define SS 0.5547703803256947 -#define SD vec3(1,1,1) -#define PST 0 -#define PSS 0.0 -#define PSD vec2(1,1) -#else -#define SST 1 -#define SS 0.5547703803256947 -#define SD vec3(1,1,1) -#define PST 0 -#define PSS 0.0 -#define PSD vec2(1,1) -#endif - -/* Kernels - * - * SK: spatial kernel - * RK: range kernel (takes patch differences) - * PSK: intra-patch spatial kernel - * - * List of available kernels: - * - * bicubic - * cos - * gaussian - * lanczos - * quadratic - * sinc - * sphinx - */ -#ifdef LUMA_raw -#define SK lanczos -#define RK gaussian -#define PSK gaussian -#else -#define SK lanczos -#define RK gaussian -#define PSK gaussian -#endif - -// Scaling factor (should match WIDTH/HEIGHT) -#ifdef LUMA_raw -#define SF 1 -#else -#define SF 1 -#endif - -/* Visualization - * - * 0: off - * 1: absolute difference 
between input/output to the power of 0.25 - * 2: difference between input/output centered on 0.5 - * 3: avg_weight - * 4: edge map (based on the relevant AS settings) - */ -#ifdef LUMA_raw -#define V 0 -#else -#define V 0 -#endif - -// Blur factor (0.0 returns the input image, 1.0 returns the output image) -#ifdef LUMA_raw -#define BF 1.0 -#else -#define BF 1.0 -#endif - -// Force disable textureGather -#ifdef LUMA_raw -#define NG 0 -#else -#define NG 0 -#endif - -// Patch donut (probably useless) -#ifdef LUMA_raw -#define PD 0 -#else -#define PD 0 -#endif - -// Duplicate 1st weight (for luma-guided-chroma) -#ifdef LUMA_raw -#define D1W 0 -#else -#define D1W 0 -#endif - -// Skip patch comparison -#ifdef LUMA_raw -#define SKIP_PATCH 0 -#else -#define SKIP_PATCH 0 -#endif - -// Shader code - -#define EPSILON 0.00000000001 -#define M_PI 3.14159265358979323846 -#define POW2(x) ((x)*(x)) -#define POW3(x) ((x)*(x)*(x)) -#define bicubic(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0)))) -#define gaussian(x) exp(-1 * POW2(x)) -#define lanczos(x) POW2(sinc(x)) -#define quadratic(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5)) -#define sinc(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI)) -#define sphinx(x) ((x) < 1e-8 ? 
1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI)) - -// XXX could maybe be better optimized on LGC -// XXX return original alpha component instead of 1.0 -#if defined(LUMA_raw) -#define val float -#define val_swizz(v) (v.x) -#define unval(v) vec4(v.x, 0, 0, 1.0) -#define val_packed val -#define val_pack(v) (v) -#define val_unpack(v) (v) -#elif defined(CHROMA_raw) -#define val vec2 -#define val_swizz(v) (v.xy) -#define unval(v) vec4(v.x, v.y, 0, 1.0) -#define val_packed uint -#define val_pack(v) packUnorm2x16(v) -#define val_unpack(v) unpackUnorm2x16(v) -#else -#define val vec3 -#define val_swizz(v) (v.xyz) -#define unval(v) vec4(v.x, v.y, v.z, 1.0) -#define val_packed val -#define val_pack(v) (v) -#define val_unpack(v) (v) -#endif - -#if PS == 6 -const int hp = P/2; -#else -const float hp = int(P/2) - 0.5*(1-(P%2)); // sample between pixels for even patch sizes -#endif - -#if RS == 6 -const int hr = R/2; -#else -const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes -#endif - -// donut increment, increments without landing on (0,0,0) -// much faster than a continue statement -#define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0)))) - -// patch/research shapes -// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R) -// dots (.) represent samples (pixels) and X represents the pixel-of-interest - -// Z ..... -// Z ..... -// Z ..X.. -// Z ..... -// Z ..... -#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr) - -// (in this instance Z=4) -// Z .... -// Z .... -// Z ..X. -// Z .... -#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr) - -// Z-4 . -// Z-2 ... -// Z ..X.. -#define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr) - -// Z-4 . -// Z-2 ... 
-// hz+1 ..X -#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr) -#define S_TRIANGLE_A(hz,Z) int(hz*hz+Z) - -// Z-4 . -// Z-2 ... -// Z ..X.. -// Z-2 ... -// Z-4 . -#define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr) -#define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z) - -// -// Z ..X.. -// -#define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++) - -// 90 degree rotation of S_HORIZONTAL -#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr) - -// 1 . -// 1 . -// Z ..X.. -// 1 . -// 1 . -#define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr) -#define S_PLUS_A(hz,Z) (Z*2 - 1) - -// XXX implement S_PLUS w/ an X overlayed: -// 3 . . . -// 3 ... -// Z ..X.. -// 3 ... -// 3 . . . - -// XXX implement an X shape: -// 2 . . -// 2 . . -// 1 X -// 2 . . -// 2 . . 
- -// 1x1 square -#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++) - -#define T1 (T+1) -#define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++) - -#ifdef LUMA_raw -#define RF_ RF_LUMA -#else -#define RF_ RF -#endif - -// Skip comparing the pixel-of-interest against itself, unless RF is enabled -#if RF_ -#define RINCR(z,c) (z.c++) -#else -#define RINCR DINCR -#endif - -#define R_AREA(a) (a * T1 + RF_-1) - -// research shapes -// XXX would be nice to have the option of temporally-varying research sizes -#if R == 0 || R == 1 -#define FOR_RESEARCH(r) S_1X1(r) -const int r_area = R_AREA(1); -#elif RS == 7 -#define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y)) -const int r_area = R_AREA(S_PLUS_A(hr,R)); -#elif RS == 6 -#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y)) -const int r_area = R_AREA(R*R); -#elif RS == 5 -#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x)) -const int r_area = R_AREA(S_TRIANGLE_A(hr,hr)); -#elif RS == 4 -#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x)) -const int r_area = R_AREA(S_TRIANGLE_A(hr,R)); -#elif RS == 3 -#define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y)) -const int r_area = R_AREA(S_DIAMOND_A(hr,R)); -#elif RS == 2 -#define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y)) -const int r_area = R_AREA(R); -#elif RS == 1 -#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x)) -const int r_area = R_AREA(R); -#elif RS == 0 -#define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y)) -const int r_area = R_AREA(R*R); -#endif - -#define RI1 (RI+1) -#define RFI1 (RFI+1) - -#if RI -#define FOR_ROTATION for (float ri = 0; ri < 360; ri+=360.0/RI1) -#else -#define FOR_ROTATION -#endif - -#if RFI -#define FOR_REFLECTION for (int rfi = 0; rfi < RFI1; rfi++) -#else -#define FOR_REFLECTION -#endif - -#if PD -#define PINCR DINCR -#else -#define PINCR(z,c) (z.c++) -#endif - -#define P_AREA(a) (a - PD) - -// patch shapes -#if P == 0 || P == 1 -#define FOR_PATCH(p) S_1X1(p) -const int p_area = P_AREA(1); -#elif PS == 7 -#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y)) 
-const int p_area = P_AREA(S_PLUS_A(hp,P)); -#elif PS == 6 -#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y)) -const int p_area = P_AREA(P*P); -#elif PS == 5 -#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x)) -const int p_area = P_AREA(S_TRIANGLE_A(hp,hp)); -#elif PS == 4 -#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x)) -const int p_area = P_AREA(S_TRIANGLE_A(hp,P)); -#elif PS == 3 -#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y)) -const int p_area = P_AREA(S_DIAMOND_A(hp,P)); -#elif PS == 2 -#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y)) -const int p_area = P_AREA(P); -#elif PS == 1 -#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x)) -const int p_area = P_AREA(P); -#elif PS == 0 -#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y)) -const int p_area = P_AREA(P*P); -#endif - -const float r_scale = 1.0/r_area; -const float p_scale = 1.0/p_area; - -#define sample(tex, pos, size, pt, off) tex(pos + pt * (vec2(off) + 0.5 - fract(pos*size))) -#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off) - -#if RF_ && defined(LUMA_raw) -#define load2_(off) sample(RF_LUMA_tex, RF_LUMA_pos, RF_LUMA_size, RF_LUMA_pt, off) -#define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr))) -#define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0) -#elif RF_ && D1W -#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off) -#define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr))) -#define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0) -#elif RF_ -#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off) -#else -#define load2_(off) load_(off) -#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr))) -#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0) -#endif - -#if T -val load(vec3 off) -{ - switch 
(min(int(off.z), frame)) { - case 0: return val_swizz(load_(off)); - - } -} -val load2(vec3 off) -{ - return off.z == 0 ? val_swizz(load2_(off)) : load(off); -} -#else -#define load(off) val_swizz(load_(off)) -#define load2(off) val_swizz(load2_(off)) -#endif - -val poi = load(vec3(0)); // pixel-of-interest -val poi2 = load2(vec3(0)); // guide pixel-of-interest - -#if RI // rotation -vec2 rot(vec2 p, float d) -{ - return vec2( - p.x * cos(radians(d)) - p.y * sin(radians(d)), - p.y * sin(radians(d)) + p.x * cos(radians(d)) - ); -} -#else -#define rot(p, d) (p) -#endif - -#if RFI // reflection -vec2 ref(vec2 p, int d) -{ - switch (d) { - case 0: return p; - case 1: return p * vec2(1, -1); - case 2: return p * vec2(-1, 1); - } -} -#else -#define ref(p, d) (p) -#endif - -#if SST && R >= SST -float spatial_r(vec3 v) -{ - v.xy += 0.5 - fract(HOOKED_pos*HOOKED_size); - return SK(length(v*SD)*SS); -} -#else -#define spatial_r(v) (1) -#endif - -#if PST && P >= PST -#define spatial_p(v) PSK(length(v*PSD)*PSS) -#else -#define spatial_p(v) (1) -#endif - -val range(val pdiff_sq) -{ - const float h = S*0.013; - const float pdiff_scale = 1.0/(h*h); - pdiff_sq = sqrt(pdiff_sq * pdiff_scale); -#if defined(LUMA_raw) - return RK(pdiff_sq); -#elif defined(CHROMA_raw) - return vec2(RK(pdiff_sq.x), RK(pdiff_sq.y)); -#else - return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z)); -#endif - //return exp(-pdiff_sq * pdiff_scale); - - // weight function from the NLM paper, it's not very good - //return exp(-max(pdiff_sq - 2*S*S, 0.0) * pdiff_scale); -} - -val patch_comparison(vec3 r, vec3 r2) -{ - vec3 p; - val min_rot = val(p_area); - - FOR_ROTATION FOR_REFLECTION { - val pdiff_sq = val(0); - FOR_PATCH(p) { - vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z); - val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF); - diff_sq *= diff_sq; - diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy); - pdiff_sq += diff_sq; - } - min_rot = min(min_rot, pdiff_sq); - } - - return min_rot * 
p_scale; -} - -#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false -#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3) - -#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER -// 3x3 diamond/plus patch_comparison_gather -// XXX extend to support arbitrary sizes (probably requires code generation) -// XXX extend to support 3x3 square -// XXX support PSS -const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) }; -const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF }; -vec4 poi_patch = gather_offs(0, offsets); -float patch_comparison_gather(vec3 r, vec3 r2) -{ - float min_rot = p_area - 1; - vec4 transformer = gather_offs(r, offsets_sf); - FOR_ROTATION { - FOR_REFLECTION { - float diff_sq = dot((poi_patch - transformer) * (poi_patch - transformer), vec4(1)); - min_rot = min(diff_sq, min_rot); -#if RFI - switch(rfi) { - case 0: transformer = transformer.zyxw; break; - case 1: transformer = transformer.zwxy; break; // undoes last mirror, performs another mirror - case 2: transformer = transformer.zyxw; break; // undoes last mirror - } -#endif - } -#if RI == 3 - transformer = transformer.wxyz; -#elif RI == 1 - transformer = transformer.zwxy; -#endif - } - float center_diff_sq = poi2.x - load2(r).x; - center_diff_sq *= center_diff_sq; - return (min_rot + center_diff_sq) * p_scale; -} -#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER -// tiled even square patch_comparison_gather -// XXX extend to support odd square? 
-float patch_comparison_gather(vec3 r, vec3 r2) -{ - vec2 tile; - float min_rot = p_area; - - /* gather order: - * w z - * x y - */ - float pdiff_sq = 0; - for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) { - vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy); - diff_sq *= diff_sq; - diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)), - spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0))); - pdiff_sq += dot(diff_sq, vec4(1)); - } - min_rot = min(min_rot, pdiff_sq); - - return min_rot * p_scale; -} -#else -#define patch_comparison_gather patch_comparison -#endif - -vec4 hook() -{ - val total_weight = val(0); - val sum = val(0); - val result = val(0); - - vec3 r = vec3(0); - vec3 p = vec3(0); - vec3 me = vec3(0); - -#if T && ME == 1 // temporal & motion estimation - vec3 me_tmp = vec3(0); - float maxweight = 0; -#elif T && ME == 2 // temporal & motion estimation - vec3 me_sum = vec3(0); - float me_weight = 0; -#endif - -#if WD == 2 // weight discard - int r_index = 0; - val_packed all_weights[r_area]; - val_packed all_pixels[r_area]; -#elif WD == 1 // weight discard - val no_weights = val(0); - val discard_total_weight = val(0); - val discard_sum = val(0); -#endif - - FOR_FRAME(r) { - // XXX ME is always a frame behind, should have to option to re-research after applying ME (could do it an arbitrary number of times per frame if desired) -#if T && ME == 1 // temporal & motion estimation max weight - if (r.z > 0) { - me += me_tmp * MEF; - me_tmp = vec3(0); - maxweight = 0; - } -#elif T && ME == 2 // temporal & motion estimation weighted average - if (r.z > 0) { - me += round(me_sum / me_weight * MEF); - me_sum = vec3(0); - me_weight = 0; - } -#endif - FOR_RESEARCH(r) { // main NLM logic -#if SKIP_PATCH - val weight = val(1); -#else - val pdiff_sq = (r.z == 0) ? 
val(patch_comparison_gather(r+me, vec3(0))) : patch_comparison(r+me, vec3(0)); - val weight = range(pdiff_sq); -#endif - -#if T && ME == 1 // temporal & motion estimation max weight - me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x)); - maxweight = max(maxweight, weight.x); -#elif T && ME == 2 // temporal & motion estimation weighted average - me_sum += vec3(r.xy,0) * weight.x; - me_weight += weight.x; -#endif - -#if D1W - weight = val(weight.x); -#endif - - weight *= spatial_r(r); - -#if WD == 2 // weight discard - all_weights[r_index] = val_pack(weight); - all_pixels[r_index] = val_pack(load(r+me)); - r_index++; -#elif WD == 1 // weight discard - val wd_scale = 1.0/max(no_weights, 1); - val keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight); - discard_sum += load(r+me) * weight * (1 - keeps); - discard_total_weight += weight * (1 - keeps); - no_weights += keeps; -#endif - - sum += load(r+me) * weight; - total_weight += weight; - } // FOR_RESEARCH - } // FOR_FRAME - - val avg_weight = total_weight * r_scale; - val old_avg_weight = avg_weight; - -#if WD == 2 // true average - total_weight = val(0); - sum = val(0); - val no_weights = val(0); - - for (int i = 0; i < r_area; i++) { - val w = val_unpack(all_weights[i]); - val px = val_unpack(all_pixels[i]); - val keeps = step(avg_weight*WDT, w); - - w *= keeps; - sum += px * w; - total_weight += w; - no_weights += keeps; - } -#elif WD == 1 // moving cumulative average - total_weight -= discard_total_weight; - sum -= discard_sum; -#endif -#if WD // weight discard - avg_weight = total_weight / no_weights; -#endif - - total_weight += SW * spatial_r(vec3(0)); - sum += poi * SW * spatial_r(vec3(0)); - -#if V == 3 // weight map - result = val(avg_weight); -#else // mean - result = val(sum / total_weight); -#endif - - // store frames for temporal -#if T > 1 - -#endif -#if T && TRF - imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result)); -#elif T - 
imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2)); -#endif - -#if ASW == 0 // pre-WD weights -#define AS_weight old_avg_weight -#elif ASW == 1 // post-WD weights -#define AS_weight avg_weight -#endif - -#if ASK == 0 - val sharpening_strength = pow(AS_weight, val(ASP)); -#elif ASK == 1 - val sharpening_strength = mix( - pow(smoothstep(0.0, 1.0, AS_weight), val(ASP)), - AS_weight, ASC); - // XXX normalize the result to account for a negative ASC? -#elif ASK == 2 - val sharpening_strength = val(ASP); -#endif - -#if AS == 1 // sharpen+denoise - val sharpened = result + (poi - result) * ASF; -#elif AS == 2 // sharpen only - val sharpened = poi + (poi - result) * ASF; -#endif - -#if EP // extremes preserve - float luminance = EP_texOff(0).x; - // EPSILON is needed since pow(0,0) is undefined - float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP)); - result = mix(poi, result, ep_weight); -#endif - -#if AS == 1 // sharpen+denoise - result = mix(sharpened, result, sharpening_strength); -#elif AS == 2 // sharpen only - result = mix(sharpened, poi, sharpening_strength); -#endif - -#if V == 4 // edge map - result = sharpening_strength; -#endif - -#if (V == 3 || V == 4) && defined(CHROMA_raw) // drop chroma for these visualizations - return vec4(0.5); -#endif - -#if V == 1 - result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0); -#elif V == 2 - result = (poi - result) * 0.5 + 0.5; -#endif - - return unval(mix(poi, result, BF)); -} - diff --git a/portable_config/shaders/nlmeans_hqx.glsl b/portable_config/shaders/nlmeans_hqx.glsl index d9b0a96e..a90f7876 100644 --- a/portable_config/shaders/nlmeans_hqx.glsl +++ b/portable_config/shaders/nlmeans_hqx.glsl @@ -19,301 +19,2249 @@ * along with this program. If not, see . */ -// Description: nlmeans_hqx.glsl: Very slow, should offer the best quality. +// Description: nlmeans.glsl: Very slow, should offer the best quality. 
-/* The recommended usage of this shader and its variant profiles is to add them - * to input.conf and then dispatch the appropriate shader via a keybind during - * media playback. Here is an example input.conf entry: +/* This shader is highly configurable via user variables below. Although the + * default settings should offer good quality at a reasonable speed, you are + * encouraged to tweak them to your preferences. + */ + +// The following is shader code injected from ../nlmeans.glsl +/* vi: ft=c + * + * Based on vf_nlmeans.c from FFmpeg. + * + * Copyright (c) 2022 an3223 + * Copyright (c) 2016 Clément Bœsch + * + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 2.1 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License + * for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see . + */ + +// Description: nlmeans.glsl: Default profile, general purpose, tuned for low noise + +/* This shader is highly configurable via user variables below. Although the + * default settings should offer good quality at a reasonable speed, you are + * encouraged to tweak them to your preferences. + */ + +// The following is shader code injected from ../LQ/nlmeans.glsl +/* vi: ft=c * - * F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl"; show-text "Non-local means (LUMA only)" + * Based on vf_nlmeans.c from FFmpeg. 
* - * These shaders can also be enabled by default in mpv.conf, for example: + * Copyright (c) 2022 an3223 + * Copyright (c) 2016 Clément Bœsch * - * glsl-shaders='~~/shaders/nlmeans.glsl' + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 2.1 of the License, or (at + * your option) any later version. * - * Both of the examples above assume the shaders are located in a subdirectory - * named "shaders" within mpv's config directory. Refer to the mpv - * documentation for more details. + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License + * for more details. * - * This shader is highly configurable via user variables below. Although the + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see . + */ + +// Description: nlmeans.glsl: Faster, but lower quality. + +/* This shader is highly configurable via user variables below. Although the * default settings should offer good quality at a reasonable speed, you are - * encouraged to tweak them to your preferences. Be mindful that certain - * settings may greatly affect speed. + * encouraged to tweak them to your preferences. + */ + +//!HOOK LUMA +//!HOOK CHROMA +//!BIND HOOKED +//!DESC Non-local means (nlmeans.glsl) +//!SAVE _INJ_RF_LUMA + +// User variables + +// It is generally preferable to denoise luma and chroma differently, so the +// user variables for luma and chroma are split. 
+ +// Denoising factor (level of blur, higher means more blur) +#ifdef LUMA_raw +#define S 3.5968056672833097 +#else +#define S 5.191526541606411 +#endif + +/* Adaptive sharpening + * + * Performs an unsharp mask by subtracting the spatial kernel's blur from the + * NLM blur. For sharpen+denoise the sharpening is limited to edge areas and + * denoising is done everywhere else. + * + * Use V=4 to visualize which areas are sharpened (black means sharpen). * - * Denoising is most useful for noisy content. If there is no perceptible - * noise, you probably won't see a positive difference. + * AS: + * - 0: disable + * - 1: sharpen+denoise + * - 2: sharpen only + * ASF: Higher numbers make a sharper image + * ASA: Anti-ringing, higher numbers increase strength + * ASP: Power, lower numbers increase sharpening on lower frequency detail + */ +#ifdef LUMA_raw +#define AS 0 +#define ASF 0.1625 +#define ASA 5.0 +#define ASP 0.5 +#else +#define AS 0 +#define ASF 0.1625 +#define ASA 5.0 +#define ASP 0.5 +#endif + +/* Starting weight * - * The default settings are generally tuned for low noise and high detail - * preservation. The "medium" and "heavy" profiles are tuned for higher levels - * of noise. + * Also known as the center weight. This represents the weight of the + * pixel-of-interest. Lower numbers may help handle heavy noise & ringing. * - * The denoiser will not work properly if the content has been upscaled - * beforehand (whether it was done by you or not). In such cases, consider - * issuing a command to downscale in the mpv console (backtick ` key): + * EPSILON should be used instead of zero to avoid divide-by-zero errors. + */ +#ifdef LUMA_raw +#define SW 0.7392620481427672 +#else +#define SW 0.6448288408806067 +#endif + +/* Weight discard * - * vf toggle scale=-2:720 + * Reduces weights that fall below a fraction of the average weight. This culls + * the most dissimilar samples from the blur, which can yield a better result, + * especially around edges. 
+ * + * WD: + * - 2: Mean. Better quality, but slower and requires GLSL 4.0 or later + * - 1: Moving cumulative average. Fast but inaccurate, blurs directionally. + * - 0: Disable * - * ...replacing 720 with whatever resolution seems appropriate. Rerun the - * command to undo the downscale. It may take some trial-and-error to find the - * proper resolution. + * WDT: Threshold coefficient, higher numbers discard more + * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes + * WDS (not for WDK=is_zero): Higher numbers are more eager to reduce weights */ +#ifdef LUMA_raw +#define WD 1 +#define WDT 0.580415381682815 +#define WDP 5.381278367349288 +#define WDS 1.0 +#else +#define WD 1 +#define WDT 0.913447511792627 +#define WDP 5.832936323930807 +#define WDS 1.0 +#endif -/* Regarding speed +/* Extremes preserve * - * Speed may vary wildly for different vo and gpu-api settings. Generally - * vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this - * may be different for your system. + * Reduce denoising in very bright/dark areas. * - * If your GPU doesn't support textureGather, or if you are on a version of mpv - * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ profile + * Disabled by default now. If you want to reenable this, set EP=3/ in + * Makefile.nlm and rebuild. * - * If you plan on tinkering with NLM's settings, read below: + * The downscaling factor of the EP shader stage affects what is considered a + * bright/dark area. * - * textureGather only applies to luma and limited to the these configurations: + * This is incompatible with RGB. If you have RGB hooks enabled then you will + * have to delete the EP shader stage or specify EP=0 through shader_cfg. * - * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2} - * - Default, very fast, rotations and reflections should be free - * - If this is unusually slow then try changing gpu-api and vo - * - If it's still slow, try setting RI/RFI to 0. 
+ * EP: 1 to enable, 0 to disable + * DP: EP strength on dark areas, 0 to fully denoise + * BP: EP strength on bright areas, 0 to fully denoise + */ +#ifdef LUMA_raw +#define EP 0 +#define BP 0.75 +#define DP 0.25 +#else +#define EP 0 +#define BP 0.0 +#define DP 0.0 +#endif + +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ + +/* textureGather applicable configurations: * - * - PS=6:RI={0,1,3}:RFI={0,1,2} + * - PS={0,3,7,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2} + * - PS={0,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2} + * - PS=6:RI=0:RFI=0 * - Currently the only scalable variant - * - Patch shape is asymmetric on two axis - * - Rotations should have very little speed impact - * - Reflections may have a significant speed impact * * Options which always disable textureGather: - * - PD - * - NG + * - NG + * - SAMPLE + * - PD + * + * Running without textureGather may be much slower. */ -// The following is shader code injected from guided.glsl -/* vi: ft=c +/* Patch & research sizes * - * Copyright (c) 2022 an3223 + * P should be an odd number. Higher values are slower and not always better. * - * This program is free software: you can redistribute it and/or modify it - * under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 2.1 of the License, or (at - * your option) any later version. + * R should be an odd number greater than or equal to 3. Higher values are + * generally better, but slower, blurrier, and gives diminishing returns. 
+ */ +#ifdef LUMA_raw +#define P 3 +#define R 5 +#else +#define P 3 +#define R 5 +#endif + +/* Patch and research shapes * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License - * for more details. + * Different shapes have different speed and quality characteristics. Every + * shape (besides square) is smaller than square. * - * You should have received a copy of the GNU Lesser General Public License - * along with this program. If not, see . + * PS applies to patches, RS applies to research zones. + * + * 0: square (symmetrical) + * 1: horizontal line (asymmetric) + * 2: vertical line (asymmetric) + * 3: diamond (symmetrical) + * 4: triangle (asymmetric, pointing upward) + * 5: truncated triangle (asymmetric on two axes, last row halved) + * 6: even sized square (asymmetric on two axes) + * 7: plus (symmetrical) + * 8: plus X (symmetrical) */ +#ifdef LUMA_raw +#define RS 3 +#define PS 4 +#else +#define RS 3 +#define PS 3 +#endif -// Description: guided.glsl: Guided by the downscaled image +/* Robust filtering + * + * This setting is dependent on code generation from shader_cfg, so this + * setting can only be enabled via shader_cfg. + * + * Computes weights on a guide, which could be a downscaled image or the output + * of another shader, and applies the weights to the original image + */ +#define RF_LUMA 0 +#define RF 0 -/* The radius can be adjusted with the MEANI stage's downscaling factor. - * Higher numbers give a bigger radius. +/* Rotational/reflectional invariance + * + * Number of rotations/reflections to try for each patch comparison. Can be + * slow, but improves feature preservation. More rotations/reflections give + * diminishing returns. The most similar rotation/reflection will be used. * - * The E variable can be found in the A stage.
+ * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a + * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc. * - * The subsampling (fast guided filter) can be adjusted with the I stage's - * downscaling factor. Higher numbers are faster. + * Consider setting SAMPLE=1 if setting RI to a setting that would require + * sampling between pixels. * - * The guide's subsampling can be adjusted with the PREI stage's downscaling - * factor. Higher numbers downscale more. + * RI: Rotational invariance + * RFI (0 to 2): Reflectional invariance */ +#ifdef LUMA_raw +#define RI 0 +#define RFI 0 +#else +#define RI 0 +#define RFI 0 +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!BIND HOOKED -//!WIDTH HOOKED.w 1.25 / -//!HEIGHT HOOKED.h 1.25 / -//!DESC Guided filter (PREI) -//!SAVE _INJ_PREI +/* Temporal denoising + * + * This setting is dependent on code generation from shader_cfg, so this + * setting can only be enabled via shader_cfg. + * + * Caveats: + * - Slower: + * - Each frame needs to be researched (more samples & more math) + * - Gather optimizations only apply to the current frame + * - Requires vo=gpu-next + * - Luma-only (this is a bug) + * - Buggy + * + * May cause motion blur and may struggle more with noise that persists across + * multiple frames (e.g., from compression or duplicate frames), but can work + * very well on high quality video. + * + * Motion estimation (ME) should improve quality without impacting speed. + * + * T: number of frames used + * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg + * MEF: estimate factor, compensates for ME being one frame behind + * TRF: compare against the denoised frames + */ +#ifdef LUMA_raw +#define T 0 +#define ME 1 +#define MEF 2 +#define TRF 0 +#else +#define T 0 +#define ME 0 +#define MEF 2 +#define TRF 0 +#endif + +/* Spatial kernel + * + * Increasing the spatial denoising factor (SS) reduces the weight of further + * pixels. 
+ * + * Spatial distortion instructs the spatial kernel to view that axis as + * closer/further, for instance SD=(1,1,0.5) would make the temporal axis + * appear closer and increase blur between frames. + * + * The intra-patch variants are supposed to help with larger patch sizes. + * + * SST: enables spatial kernel if R>=SST, 0 fully disables + * SS: spatial sigma + * SD: spatial distortion (X, Y, time) + * PSS: intra-patch spatial sigma + * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables + * PSD: intra-patch spatial distortion (X, Y) + */ +#ifdef LUMA_raw +#define SST 1 +#define SS 0.49764743714339127 +#define SD vec3(1,1,1) +#define PST 0 +#define PSS 0.0 +#define PSD vec2(1,1) +#else +#define SST 1 +#define SS 0.32091162692066677 +#define SD vec3(1,1,1) +#define PST 0 +#define PSS 0.0 +#define PSD vec2(1,1) +#endif + +/* Kernels + * + * SK: spatial kernel + * RK: range kernel (takes patch differences) + * PSK: intra-patch spatial kernel + * WDK: weight discard kernel + * WD1TK (WD=1 only): weight discard tolerance kernel + * + * List of available kernels: + * + * bicubic + * cos + * gaussian + * lanczos + * quadratic_ (unclamped) + * sinc + * sinc_ (unclamped) + * sinc3 + * sphinx + * sphinx_ (unclamped) + * triangle_ (unclamped) + * triangle + */ +#ifdef LUMA_raw +#define SK gaussian +#define RK gaussian +#define PSK gaussian +#define WDK is_zero +#define WD1TK gaussian +#else +#define SK gaussian +#define RK gaussian +#define PSK gaussian +#define WDK is_zero +#define WD1TK gaussian +#endif + +/* Sampling method + * + * In most cases this shouldn't make any difference, only set to bilinear if + * it's necessary to sample between pixels (e.g., RI=2). + * + * 0: nearest neighbor + * 1: bilinear + */ +#ifdef LUMA_raw +#define SAMPLE 0 +#else +#define SAMPLE 0 +#endif + +/* Research scaling factor + * + * Higher numbers sample more sparsely as the distance from the POI grows.
+ */ +#ifdef LUMA_raw +#define RSF 0.0 +#else +#define RSF 0.0 +#endif + +// Scaling factor (should match WIDTH/HEIGHT) +#ifdef LUMA_raw +#define SF 1 +#else +#define SF 1 +#endif + +// Use the guide image as the input image +#ifdef LUMA_raw +#define GUIDE_INPUT 0 +#else +#define GUIDE_INPUT 0 +#endif + +/* Visualization + * + * 0: off + * 1: absolute difference between input/output to the power of 0.25 + * 2: difference between input/output centered on 0.5 + * 3: post-WD weight map + * 4: pre-WD weight map + * 5: unsharp mask + * 6: EP + */ +#ifdef LUMA_raw +#define V 0 +#else +#define V 0 +#endif + +// Blur factor (0.0 returns the input image, 1.0 returns the output image) +#ifdef LUMA_raw +#define BF 1.0 +#else +#define BF 1.0 +#endif + +// Force disable textureGather +#ifdef LUMA_raw +#define NG 0 +#else +#define NG 0 +#endif + +// Patch donut (probably useless) +#ifdef LUMA_raw +#define PD 0 +#else +#define PD 0 +#endif + +// Duplicate 1st weight (for luma-guided-chroma) +#ifdef LUMA_raw +#define D1W 0 +#else +#define D1W 0 +#endif + +// Skip patch comparison +#ifdef LUMA_raw +#define SKIP_PATCH 0 +#else +#define SKIP_PATCH 0 +#endif + +// Shader code + +#define EPSILON 1.2e-38 +#define M_PI 3.14159265358979323846 +#define POW2(x) ((x)*(x)) +#define POW3(x) ((x)*(x)*(x)) +#define bicubic_(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0)))) +#define bicubic(x) bicubic_(clamp((x), 0.0, 2.0)) +#define gaussian(x) exp(-1 * POW2(x)) +#define quadratic_(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5)) +#define quadratic(x) quadratic_(clamp((x), 0.0, 1.5)) +#define sinc_(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI)) +#define sinc(x) sinc_(clamp((x), 0.0, 1.0)) +#define sinc3(x) sinc_(clamp((x), 0.0, 3.0)) +#define lanczos(x) (sinc3(x) * sinc(x)) +#define sphinx_(x) ((x) < 1e-8 ? 
1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI)) +#define sphinx(x) sphinx_(clamp((x), 0.0, 1.4302966531242027)) +#define triangle_(x) (1 - (x)) +#define triangle(x) triangle_(clamp((x), 0.0, 1.0)) +#define is_zero(x) int(x == 0) + +// XXX could maybe be better optimized on LGC +#if defined(LUMA_raw) +#define val float +#define val_swizz(v) (v.x) +#define unval(v) vec4(v.x, 0, 0, poi_.a) +#define val_packed val +#define val_pack(v) (v) +#define val_unpack(v) (v) +#elif defined(CHROMA_raw) +#define val vec2 +#define val_swizz(v) (v.xy) +#define unval(v) vec4(v.x, v.y, 0, poi_.a) +#define val_packed uint +#define val_pack(v) packUnorm2x16(v) +#define val_unpack(v) unpackUnorm2x16(v) +#else +#define val vec3 +#define val_swizz(v) (v.xyz) +#define unval(v) vec4(v.x, v.y, v.z, poi_.a) +#define val_packed val +#define val_pack(v) (v) +#define val_unpack(v) (v) +#endif + +#if PS == 6 +const int hp = P/2; +#else +const float hp = int(P/2) - 0.5*(1-(P%2)); // sample between pixels for even patch sizes +#endif + +#if RS == 6 +const int hr = R/2; +#else +const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes +#endif + +// patch/research shapes +// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R) +// dots (.) represent samples (pixels) and X represents the pixel-of-interest + +// Z ..... +// Z ..... +// Z ..X.. +// Z ..... +// Z ..... +#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr) + +// (in this instance Z=4) +// Z .... +// Z .... +// Z ..X. +// Z .... +#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr) + +// Z-4 . +// Z-2 ... +// Z ..X.. +#define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr) + +// Z-4 . +// Z-2 ... 
+// hz+1 ..X +#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr) +#define S_TRIANGLE_A(hz,Z) int(hz*hz+Z) + +// Z-4 . +// Z-2 ... +// Z ..X.. +// Z-2 ... +// Z-4 . +#define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr) +#define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z) + +// +// Z ..X.. +// +#define S_HORIZONTAL(z,hz,incr) for (z.y = 0; z.y <= 0; z.y++) for (z.x = -hz; z.x <= hz; incr) + +// 90 degree rotation of S_HORIZONTAL +#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr) + +// 1 . +// 1 . +// Z ..X.. +// 1 . +// 1 . +#define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr) +#define S_PLUS_A(hz,Z) (Z*2 - 1) + +// 3 . . . +// 3 ... +// Z ..X.. +// 3 ... +// 3 . . . +#define S_PLUS_X(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(z.x) + -hz * int(z.x == 0); z.y <= abs(z.x) + hz * int(z.x == 0); incr) +#define S_PLUS_X_A(hz,Z) (Z*4 - 3) + +// 1x1 square +#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++) + +#define T1 (T+1) +#define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++) + +#ifdef LUMA_raw +#define RF_ RF_LUMA +#else +#define RF_ RF +#endif + +// donut increment, increments without landing on (0,0,0) +// much faster than a continue statement +#define DINCR(z,c,a) ((z.c += a),(z.c += int(z == vec3(0)))) + +#define R_AREA(a) (a * T1 - 1) + +// research shapes +// XXX would be nice to have the option of temporally-varying research sizes +#if R == 0 || R == 1 +#define FOR_RESEARCH(r) S_1X1(r) +const int r_area = R_AREA(1); +#elif RS == 8 +#define FOR_RESEARCH(r) S_PLUS_X(r,hr,DINCR(r,y,max(1,abs(r.x)))) +const int r_area = R_AREA(S_PLUS_X_A(hr,R)); +#elif RS == 7 +#define FOR_RESEARCH(r) S_PLUS(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(S_PLUS_A(hr,R)); +#elif RS == 6 +#define 
FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(R*R); +#elif RS == 5 +#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,DINCR(r,x,1)) +const int r_area = R_AREA(S_TRIANGLE_A(hr,hr)); +#elif RS == 4 +#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,DINCR(r,x,1)) +const int r_area = R_AREA(S_TRIANGLE_A(hr,R)); +#elif RS == 3 +#define FOR_RESEARCH(r) S_DIAMOND(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(S_DIAMOND_A(hr,R)); +#elif RS == 2 +#define FOR_RESEARCH(r) S_VERTICAL(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(R); +#elif RS == 1 +#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,DINCR(r,x,1)) +const int r_area = R_AREA(R); +#elif RS == 0 +#define FOR_RESEARCH(r) S_SQUARE(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(R*R); +#endif + +#define RI1 (RI+1) +#define RFI1 (RFI+1) + +#if RI +#define FOR_ROTATION for (float ri = 0; ri < 360; ri+=360.0/RI1) +#else +#define FOR_ROTATION +#endif + +#if RFI +#define FOR_REFLECTION for (int rfi = 0; rfi < RFI1; rfi++) +#else +#define FOR_REFLECTION +#endif + +#if PD +#define PINCR DINCR +#else +#define PINCR(z,c,a) (z.c += a) +#endif + +#define P_AREA(a) (a - PD) + +// patch shapes +#if P == 0 || P == 1 +#define FOR_PATCH(p) S_1X1(p) +const int p_area = P_AREA(1); +#elif PS == 8 +#define FOR_PATCH(p) S_PLUS_X(p,hp,PINCR(p,y,max(1,abs(p.x)))) +const int p_area = P_AREA(S_PLUS_X_A(hp,P)); +#elif PS == 7 +#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y,1)) +const int p_area = P_AREA(S_PLUS_A(hp,P)); +#elif PS == 6 +#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y,1)) +const int p_area = P_AREA(P*P); +#elif PS == 5 +#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x,1)) +const int p_area = P_AREA(S_TRIANGLE_A(hp,hp)); +#elif PS == 4 +#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x,1)) +const int p_area = P_AREA(S_TRIANGLE_A(hp,P)); +#elif PS == 3 +#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y,1)) +const int p_area = P_AREA(S_DIAMOND_A(hp,P)); +#elif PS == 2 +#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y,1)) +const int 
p_area = P_AREA(P); +#elif PS == 1 +#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x,1)) +const int p_area = P_AREA(P); +#elif PS == 0 +#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y,1)) +const int p_area = P_AREA(P*P); +#endif + +const float r_scale = 1.0/r_area; +const float p_scale = 1.0/p_area; + +#if SAMPLE == 0 +#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * (vec2(off) + 0.5 - fract((pos) * (size)))) +#else +#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * vec2(off)) +#endif + +#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off) + +#if RF_ && defined(LUMA_raw) +#define load2_(off) sample(_INJ_RF_LUMA_tex, _INJ_RF_LUMA_pos, _INJ_RF_LUMA_size, _INJ_RF_LUMA_pt, off) +#define gather_offs(off, off_arr) (_INJ_RF_LUMA_mul * vec4(textureGatherOffsets(_INJ_RF_LUMA_raw, _INJ_RF_LUMA_pos + vec2(off) * _INJ_RF_LUMA_pt, off_arr))) +#define gather(off) _INJ_RF_LUMA_gather(_INJ_RF_LUMA_pos + (off) * _INJ_RF_LUMA_pt, 0) +#elif RF_ && D1W +#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off) +#define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr))) +#define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0) +#elif RF_ +#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off) +#else +#define load2_(off) load_(off) +#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr))) +#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0) +#endif + +#if T +val load(vec3 off) +{ + switch (min(int(off.z), frame)) { + case 0: return val_swizz(load_(off)); + + } +} +val load2(vec3 off) +{ + return off.z == 0 ? 
val_swizz(load2_(off)) : load(off); +} +#else +#define load(off) val_swizz(load_(off)) +#define load2(off) val_swizz(load2_(off)) +#endif + +val poi2 = load2(vec3(0)); // guide pixel-of-interest +#if GUIDE_INPUT +#define poi poi2 +#else +vec4 poi_ = load_(vec3(0)); +val poi = val_swizz(poi_); // pixel-of-interest +#endif + +#if RI // rotation: turns offset p by d degrees about the origin +vec2 rot(vec2 p, float d) +{ + return vec2( + p.x * cos(radians(d)) - p.y * sin(radians(d)), /* x' = x*cos(d) - y*sin(d) */ + p.x * sin(radians(d)) + p.y * cos(radians(d)) /* y' = x*sin(d) + y*cos(d) */ + ); +} +#else +#define rot(p, d) (p) +#endif + +#if RFI // reflection +vec2 ref(vec2 p, int d) +{ + switch (d) { + case 0: return p; + case 1: return p * vec2(1, -1); + case 2: return p * vec2(-1, 1); + } +} +#else +#define ref(p, d) (p) +#endif + +#if SST && R >= SST +float spatial_r(vec3 v) +{ + v.xy += 0.5 - fract(HOOKED_pos*HOOKED_size); + return SK(length(v*SD)*SS); +} +#else +#define spatial_r(v) (1) +#endif + +#if PST && P >= PST +#define spatial_p(v) PSK(length(v*PSD)*PSS) +#else +#define spatial_p(v) (1) +#endif + +val range(val pdiff_sq) +{ + const float h = max(S, 0.0) * 0.013; + const float pdiff_scale = 1.0/(h*h); + pdiff_sq = sqrt(pdiff_sq * pdiff_scale); +#if defined(LUMA_raw) + return RK(pdiff_sq); +#elif defined(CHROMA_raw) + return vec2(RK(pdiff_sq.x), RK(pdiff_sq.y)); +#else + return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z)); +#endif +} + +val patch_comparison(vec3 r, vec3 r2) +{ + vec3 p; + val min_rot = val(p_area); + + FOR_ROTATION FOR_REFLECTION { + val pdiff_sq = val(0); + FOR_PATCH(p) { + vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z); + val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF); + diff_sq *= diff_sq; + diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy); + pdiff_sq += diff_sq; + } + min_rot = min(min_rot, pdiff_sq); + } + + return min_rot * p_scale; +} + +#define NO_GATHER (PD == 0 && NG == 0 && SAMPLE == 0) // never textureGather if any of these conditions are false +#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3 || 
RI == 7) + +#if (defined(LUMA_gather) || D1W) && ((PS == 0 || ((PS == 3 || PS == 7) && RI != 7) || PS == 8) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER +// 3x3 diamond/plus patch_comparison_gather +// XXX extend to support arbitrary sizes (probably requires code generation) +// XXX support PSS +const ivec2 offsets_adj[4] = { ivec2(0,-1), ivec2(1,0), ivec2(0,1), ivec2(-1,0) }; +const ivec2 offsets_adj_sf[4] = { ivec2(0,-1) * SF, ivec2(1,0) * SF, ivec2(0,1) * SF, ivec2(-1,0) * SF }; +vec4 poi_patch_adj = gather_offs(0, offsets_adj); +#if PS == 0 || PS == 8 +const ivec2 offsets_diag[4] = { ivec2(-1,-1), ivec2(1,-1), ivec2(1,1), ivec2(-1,1) }; +const ivec2 offsets_diag_sf[4] = { ivec2(-1,-1) * SF, ivec2(1,-1) * SF, ivec2(1,1) * SF, ivec2(-1,1) * SF }; +vec4 poi_patch_diag = gather_offs(0, offsets_diag); +#endif +float patch_comparison_gather(vec3 r, vec3 r2) +{ + float min_rot = p_area - 1; + vec4 transformer_adj = gather_offs(r, offsets_adj_sf); +#if PS == 0 || PS == 8 + vec4 transformer_diag = gather_offs(r, offsets_diag_sf); +#endif + FOR_ROTATION { + FOR_REFLECTION { +#if RFI + /* xxy + * w y + * wzz + */ + switch(rfi) { + case 1: + transformer_adj = transformer_adj.zyxw; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.zyxw; +#endif + break; + case 2: + transformer_adj = transformer_adj.xwzy; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.xwzy; +#endif + break; + } +#endif + + vec4 diff = poi_patch_adj - transformer_adj; +#if PS == 0 || PS == 8 + diff += poi_patch_diag - transformer_diag; +#endif + float diff_sq = dot(diff * diff, vec4(1)); + min_rot = min(diff_sq, min_rot); + +// un-reflect +#if RFI + switch(rfi) { + case 1: + transformer_adj = transformer_adj.zyxw; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.zyxw; +#endif + break; + case 2: + transformer_adj = transformer_adj.xwzy; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.xwzy; +#endif + break; + } +#endif + } // FOR_REFLECTION +#if 
RI == 7 + transformer_adj = transformer_adj.wxyz; + // swap adjacents for diagonals + transformer_adj += transformer_diag; + transformer_diag = transformer_adj - transformer_diag; + transformer_adj -= transformer_diag; +#elif RI == 3 + transformer_adj = transformer_adj.wxyz; +#elif RI == 1 + transformer_adj = transformer_adj.zwxy; +#endif +#if RI == 3 && (PS == 0 || PS == 8) + transformer_diag = transformer_diag.wxyz; +#elif RI == 1 && (PS == 0 || PS == 8) + transformer_diag = transformer_diag.zwxy; +#endif + } // FOR_ROTATION + float center_diff = poi2.x - load2(r).x; + return (center_diff * center_diff + min_rot) * p_scale; +} +#elif (defined(LUMA_gather) || D1W) && PS == 4 && P == 3 && RI == 0 && RFI == 0 && NO_GATHER +const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,0), ivec2(1,0) }; +const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,0) * SF, ivec2(1,0) * SF }; +vec4 poi_patch = gather_offs(0, offsets); +float patch_comparison_gather(vec3 r, vec3 r2) +{ + vec4 pdiff = poi_patch - gather_offs(r, offsets_sf); + return dot(pdiff * pdiff, vec4(1)) * p_scale; +} +#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER +// tiled even square patch_comparison_gather +// XXX extend to support odd square? 
+float patch_comparison_gather(vec3 r, vec3 r2) +{ + vec2 tile; + float min_rot = p_area; + + /* gather order: + * w z + * x y + */ + float pdiff_sq = 0; + for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) { + vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy); + diff_sq *= diff_sq; + diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)), + spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0))); + pdiff_sq += dot(diff_sq, vec4(1)); + } + min_rot = min(min_rot, pdiff_sq); + + return min_rot * p_scale; +} +#else +#define patch_comparison_gather patch_comparison +#endif + +vec4 hook() +{ + val total_weight = val(0); + val sum = val(0); + val result = val(0); + + vec3 r = vec3(0); + vec3 p = vec3(0); + vec3 me = vec3(0); + +#if T && ME == 1 // temporal & motion estimation + vec3 me_tmp = vec3(0); + float maxweight = 0; +#elif T && ME == 2 // temporal & motion estimation + vec3 me_sum = vec3(0); + float me_weight = 0; +#endif + +#if AS + val total_weight_s = val(0); + val sum_s = val(0); +#endif + +#if WD == 2 // weight discard (mean) + int r_index = 0; + val_packed all_weights[r_area]; + val_packed all_pixels[r_area]; +#elif WD == 1 // weight discard (moving cumulative average) + int r_iter = 1; + val wd_total_weight = val(0); + val wd_sum = val(0); +#endif + + FOR_FRAME(r) { + // XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired) +#if T && ME == 1 // temporal & motion estimation max weight + if (r.z > 0) { + me += me_tmp * MEF; + me_tmp = vec3(0); + maxweight = 0; + } +#elif T && ME == 2 // temporal & motion estimation weighted average + if (r.z > 0) { + me += round(me_sum / me_weight * MEF); + me_sum = vec3(0); + me_weight = 0; + } +#endif + FOR_RESEARCH(r) { + // r coords with appropriate transformations applied + vec3 tr = vec3(r.xy + floor(r.xy * RSF), r.z); + float spatial_weight = 
spatial_r(tr); + tr.xy += me.xy; + + val px = load(tr); + +#if SKIP_PATCH + val weight = val(1); +#else + val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(tr, vec3(0))) : patch_comparison(tr, vec3(0)); + val weight = range(pdiff_sq); +#endif + +#if T && ME == 1 // temporal & motion estimation max weight + me_tmp = vec3(tr.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x)); + maxweight = max(maxweight, weight.x); +#elif T && ME == 2 // temporal & motion estimation weighted average + me_sum += vec3(tr.xy,0) * weight.x; + me_weight += weight.x; +#endif + +#if D1W + weight = val(weight.x); +#endif + + weight *= spatial_weight; + +#if AS + spatial_weight *= int(r.z == 0); // ignore temporal + sum_s += px * spatial_weight; + total_weight_s += spatial_weight; +#endif + +#if WD == 2 // weight discard (mean) + all_weights[r_index] = val_pack(weight); + all_pixels[r_index] = val_pack(px); + r_index++; +#elif WD == 1 // weight discard (moving cumulative average) + val wd_scale = val(1.0/r_iter); + val below_threshold = WDS * abs(min(val(0.0), weight - (total_weight * wd_scale * WDT * WD1TK(sqrt(wd_scale*WDP))))); +#if defined(LUMA_raw) + val wdkf = WDK(below_threshold); +#elif defined(CHROMA_raw) + val wdkf = vec2(WDK(below_threshold.x), WDK(below_threshold.y)); +#else + val wdkf = vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); /* one discard factor per channel (x, y, z), matching the WD == 2 path */ +#endif + wd_sum += px * weight * wdkf; + wd_total_weight += weight * wdkf; + r_iter++; +#endif + + sum += px * weight; + total_weight += weight; + } // FOR_RESEARCH + } // FOR_FRAME + + val avg_weight = total_weight * r_scale; + val old_avg_weight = avg_weight; + +#if WD == 2 // weight discard (mean) + total_weight = val(0); + sum = val(0); + + for (int i = 0; i < r_area; i++) { + val weight = val_unpack(all_weights[i]); + val px = val_unpack(all_pixels[i]); + + val below_threshold = WDS * abs(min(val(0.0), weight - (avg_weight * WDT))); +#if defined(LUMA_raw) + weight *= 
WDK(below_threshold); +#elif defined(CHROMA_raw) + weight *= vec2(WDK(below_threshold.x), WDK(below_threshold.y)); +#else + weight *= vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); +#endif + + sum += px * weight; + total_weight += weight; + } +#elif WD == 1 // weight discard (moving cumulative average) + total_weight = wd_total_weight; + sum = wd_sum; +#endif +#if WD // weight discard + avg_weight = total_weight * r_scale; +#endif + + total_weight += SW * spatial_r(vec3(0)); + sum += poi * SW * spatial_r(vec3(0)); + result = val(sum / total_weight); + + // store frames for temporal +#if T > 1 + +#endif +#if T && TRF + imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result)); +#elif T + imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2)); +#endif + +#if AS == 1 // sharpen+denoise +#define AS_base result +#elif AS == 2 // sharpen only +#define AS_base poi +#endif +#if AS + val usm = result - sum_s/total_weight_s; + usm = exp(log(abs(usm))*ASP) * sign(usm); // avoiding pow() since it's buggy on nvidia + usm *= gaussian(abs((AS_base + usm - 0.5) / 1.5) * ASA); + usm *= ASF; + result = AS_base + usm; +#endif + +#if EP // extremes preserve + float luminance = EP_texOff(0).x; + // EPSILON is needed since pow(0,0) is undefined + float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? 
DP : BP)); + result = mix(poi, result, ep_weight); +#else + float ep_weight = 0; +#endif + +#if V == 1 + result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0); +#elif V == 2 + result = (poi - result) * 0.5 + 0.5; +#elif V == 3 // post-WD weight map + result = avg_weight; +#elif V == 4 // pre-WD edge map + result = old_avg_weight; +#elif V == 5 + result = 0.5 + usm; +#elif V == 6 + result = val(1 - ep_weight); +#endif + +// XXX visualize chroma for these +#if defined(CHROMA_raw) && (V == 3 || V == 4 || V == 6) + return vec4(0.5); +#endif + + return unval(mix(poi, result, BF)); +} + +// End of source code injected from ../LQ/nlmeans.glsl + +//!HOOK LUMA +//!HOOK CHROMA +//!BIND _INJ_RF_LUMA +//!WIDTH _INJ_RF_LUMA.w +//!HEIGHT _INJ_RF_LUMA.h +//!DESC Non-local means (RF, share) +//!SAVE _INJ_RF + +vec4 hook() +{ +return _INJ_RF_LUMA_texOff(0); +} + +//!HOOK LUMA +//!HOOK CHROMA +//!BIND HOOKED +//!BIND _INJ_RF_LUMA +//!BIND _INJ_RF +//!DESC Non-local means (nlmeans.glsl) +//!SAVE RF_LUMA + +// User variables + +// It is generally preferable to denoise luma and chroma differently, so the +// user variables for luma and chroma are split. + +// Denoising factor (level of blur, higher means more blur) +#ifdef LUMA_raw +#define S 2.0522687499802097 +#else +#define S 2.5168955531436197 +#endif + +/* Adaptive sharpening + * + * Performs an unsharp mask by subtracting the spatial kernel's blur from the + * NLM blur. For sharpen+denoise the sharpening is limited to edge areas and + * denoising is done everywhere else. + * + * Use V=4 to visualize which areas are sharpened (black means sharpen). 
+ * + * AS: + * - 0: disable + * - 1: sharpen+denoise + * - 2: sharpen only + * ASF: Higher numbers make a sharper image + * ASA: Anti-ringing, higher numbers increase strength + * ASP: Power, lower numbers increase sharpening on lower frequency detail + */ +#ifdef LUMA_raw +#define AS 0 +#define ASF 0.1625 +#define ASA 5.0 +#define ASP 0.5 +#else +#define AS 0 +#define ASF 0.1625 +#define ASA 5.0 +#define ASP 0.5 +#endif + +/* Starting weight + * + * Also known as the center weight. This represents the weight of the + * pixel-of-interest. Lower numbers may help handle heavy noise & ringing. + * + * EPSILON should be used instead of zero to avoid divide-by-zero errors. + */ +#ifdef LUMA_raw +#define SW 1.3011446081346498 +#else +#define SW 1.2219854377433914 +#endif + +/* Weight discard + * + * Reduces weights that fall below a fraction of the average weight. This culls + * the most dissimilar samples from the blur, which can yield a better result, + * especially around edges. + * + * WD: + * - 2: Mean. Better quality, but slower and requires GLSL 4.0 or later + * - 1: Moving cumulative average. Fast but inaccurate, blurs directionally. + * - 0: Disable + * + * WDT: Threshold coefficient, higher numbers discard more + * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes + * WDS (not for WDK=is_zero): Higher numbers are more eager to reduce weights + */ +#ifdef LUMA_raw +#define WD 2 +#define WDT 0.11671341022864548 +#define WDP 5.381278367349288 +#define WDS 1.0 +#else +#define WD 0 +#define WDT 0.002713346103131793 +#define WDP 5.832936323930807 +#define WDS 1.0 +#endif + +/* Extremes preserve + * + * Reduce denoising in very bright/dark areas. + * + * Disabled by default now. If you want to reenable this, set EP=3/ in + * Makefile.nlm and rebuild. + * + * The downscaling factor of the EP shader stage affects what is considered a + * bright/dark area. + * + * This is incompatible with RGB. 
If you have RGB hooks enabled then you will + * have to delete the EP shader stage or specify EP=0 through shader_cfg. + * + * EP: 1 to enable, 0 to disable + * DP: EP strength on dark areas, 0 to fully denoise + * BP: EP strength on bright areas, 0 to fully denoise + */ +#ifdef LUMA_raw +#define EP 0 +#define BP 0.75 +#define DP 0.25 +#else +#define EP 0 +#define BP 0.0 +#define DP 0.0 +#endif + +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ + +/* textureGather applicable configurations: + * + * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2} + * - PS={0,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2} + * - PS=6:RI=0:RFI=0 + * - Currently the only scalable variant + * + * Options which always disable textureGather: + * - NG + * - SAMPLE + * - PD + * + * Running without textureGather may be much slower. + */ + +/* Patch & research sizes + * + * P should be an odd number. Higher values are slower and not always better. + * + * R should be an odd number greater than or equal to 3. Higher values are + * generally better, but slower, blurrier, and give diminishing returns. + */ +#ifdef LUMA_raw +#define P 3 +#define R 5 +#else +#define P 3 +#define R 5 +#endif + +/* Patch and research shapes + * + * Different shapes have different speed and quality characteristics. Every + * shape (besides square) is smaller than square. + * + * PS applies to patches, RS applies to research zones. 
+ * + * 0: square (symmetrical) + * 1: horizontal line (asymmetric) + * 2: vertical line (asymmetric) + * 3: diamond (symmetrical) + * 4: triangle (asymmetric, pointing upward) + * 5: truncated triangle (asymmetric on two axis, last row halved) + * 6: even sized square (asymmetric on two axis) + * 7: plus (symmetrical) + * 8: plus X (symmetrical) + */ +#ifdef LUMA_raw +#define RS 3 +#define PS 3 +#else +#define RS 3 +#define PS 3 +#endif + +/* Robust filtering + * + * This setting is dependent on code generation from shader_cfg, so this + * setting can only be enabled via shader_cfg. + * + * Computes weights on a guide, which could be a downscaled image or the output + * of another shader, and applies the weights to the original image + */ +#define _INJ_RF_LUMA 1 +#define RF 1 + +/* Rotational/reflectional invariance + * + * Number of rotations/reflections to try for each patch comparison. Can be + * slow, but improves feature preservation. More rotations/reflections gives + * diminishing returns. The most similar rotation/reflection will be used. + * + * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a + * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc. + * + * Consider setting SAMPLE=1 if setting RI to a setting that would require + * sampling between pixels. + * + * RI: Rotational invariance + * RFI (0 to 2): Reflectional invariance + */ +#ifdef LUMA_raw +#define RI 0 +#define RFI 2 +#else +#define RI 0 +#define RFI 0 +#endif + +/* Temporal denoising + * + * This setting is dependent on code generation from shader_cfg, so this + * setting can only be enabled via shader_cfg. 
+ * + * Caveats: + * - Slower: + * - Each frame needs to be researched (more samples & more math) + * - Gather optimizations only apply to the current frame + * - Requires vo=gpu-next + * - Luma-only (this is a bug) + * - Buggy + * + * May cause motion blur and may struggle more with noise that persists across + * multiple frames (e.g., from compression or duplicate frames), but can work + * very well on high quality video. + * + * Motion estimation (ME) should improve quality without impacting speed. + * + * T: number of frames used + * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg + * MEF: estimate factor, compensates for ME being one frame behind + * TRF: compare against the denoised frames + */ +#ifdef LUMA_raw +#define T 0 +#define ME 1 +#define MEF 2 +#define TRF 0 +#else +#define T 0 +#define ME 0 +#define MEF 2 +#define TRF 0 +#endif + +/* Spatial kernel + * + * Increasing the spatial denoising factor (SS) reduces the weight of further + * pixels. + * + * Spatial distortion instructs the spatial kernel to view that axis as + * closer/further, for instance SD=(1,1,0.5) would make the temporal axis + * appear closer and increase blur between frames. + * + * The intra-patch variants are supposed to help with larger patch sizes. 
+ * + * SST: enables spatial kernel if R>=SST, 0 fully disables + * SS: spatial sigma + * SD: spatial distortion (X, Y, time) + * PSS: intra-patch spatial sigma + * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables + * PSD: intra-patch spatial distortion (X, Y) + */ +#ifdef LUMA_raw +#define SST 1 +#define SS 0.5296176863733414 +#define SD vec3(1,1,1) +#define PST 0 +#define PSS 0.0 +#define PSD vec2(1,1) +#else +#define SST 1 +#define SS 0.26295970436981203 +#define SD vec3(1,1,1) +#define PST 0 +#define PSS 0.0 +#define PSD vec2(1,1) +#endif + +/* Kernels + * + * SK: spatial kernel + * RK: range kernel (takes patch differences) + * PSK: intra-patch spatial kernel + * WDK: weight discard kernel + * WD1TK (WD=1 only): weight discard tolerance kernel + * + * List of available kernels: + * + * bicubic + * cos + * gaussian + * lanczos + * quadratic_ (unclamped) + * sinc + * sinc_ (unclamped) + * sinc3 + * sphinx + * sphinx_ (unclamped) + * triangle_ (unclamped) + * triangle + */ +#ifdef LUMA_raw +#define SK gaussian +#define RK gaussian +#define PSK gaussian +#define WDK is_zero +#define WD1TK gaussian +#else +#define SK gaussian +#define RK gaussian +#define PSK gaussian +#define WDK is_zero +#define WD1TK gaussian +#endif + +/* Sampling method + * + * In most cases this shouldn't make any difference, only set to bilinear if + * it's necessary to sample between pixels (e.g., RI=2). + * + * 0: nearest neighbor + * 1: bilinear + */ +#ifdef LUMA_raw +#define SAMPLE 0 +#else +#define SAMPLE 0 +#endif + +/* Research scaling factor + * + * Higher numbers sample more sparsely as the distance from the POI grows. 
+ */ +#ifdef LUMA_raw +#define RSF 0.0 +#else +#define RSF 0.0 +#endif + +// Scaling factor (should match WIDTH/HEIGHT) +#ifdef LUMA_raw +#define SF 1 +#else +#define SF 1 +#endif + +// Use the guide image as the input image +#ifdef LUMA_raw +#define GUIDE_INPUT 0 +#else +#define GUIDE_INPUT 0 +#endif + +/* Visualization + * + * 0: off + * 1: absolute difference between input/output to the power of 0.25 + * 2: difference between input/output centered on 0.5 + * 3: post-WD weight map + * 4: pre-WD weight map + * 5: unsharp mask + * 6: EP + */ +#ifdef LUMA_raw +#define V 0 +#else +#define V 0 +#endif + +// Blur factor (0.0 returns the input image, 1.0 returns the output image) +#ifdef LUMA_raw +#define BF 1.0 +#else +#define BF 1.0 +#endif + +// Force disable textureGather +#ifdef LUMA_raw +#define NG 0 +#else +#define NG 0 +#endif + +// Patch donut (probably useless) +#ifdef LUMA_raw +#define PD 0 +#else +#define PD 0 +#endif + +// Duplicate 1st weight (for luma-guided-chroma) +#ifdef LUMA_raw +#define D1W 0 +#else +#define D1W 0 +#endif + +// Skip patch comparison +#ifdef LUMA_raw +#define SKIP_PATCH 0 +#else +#define SKIP_PATCH 0 +#endif + +// Shader code + +#define EPSILON 1.2e-38 +#define M_PI 3.14159265358979323846 +#define POW2(x) ((x)*(x)) +#define POW3(x) ((x)*(x)*(x)) +#define bicubic_(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0)))) +#define bicubic(x) bicubic_(clamp((x), 0.0, 2.0)) +#define gaussian(x) exp(-1 * POW2(x)) +#define quadratic_(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5)) +#define quadratic(x) quadratic_(clamp((x), 0.0, 1.5)) +#define sinc_(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI)) +#define sinc(x) sinc_(clamp((x), 0.0, 1.0)) +#define sinc3(x) sinc_(clamp((x), 0.0, 3.0)) +#define lanczos(x) (sinc3(x) * sinc(x)) +#define sphinx_(x) ((x) < 1e-8 ? 
1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI)) +#define sphinx(x) sphinx_(clamp((x), 0.0, 1.4302966531242027)) +#define triangle_(x) (1 - (x)) +#define triangle(x) triangle_(clamp((x), 0.0, 1.0)) +#define is_zero(x) int(x == 0) + +// XXX could maybe be better optimized on LGC +#if defined(LUMA_raw) +#define val float +#define val_swizz(v) (v.x) +#define unval(v) vec4(v.x, 0, 0, poi_.a) +#define val_packed val +#define val_pack(v) (v) +#define val_unpack(v) (v) +#elif defined(CHROMA_raw) +#define val vec2 +#define val_swizz(v) (v.xy) +#define unval(v) vec4(v.x, v.y, 0, poi_.a) +#define val_packed uint +#define val_pack(v) packUnorm2x16(v) +#define val_unpack(v) unpackUnorm2x16(v) +#else +#define val vec3 +#define val_swizz(v) (v.xyz) +#define unval(v) vec4(v.x, v.y, v.z, poi_.a) +#define val_packed val +#define val_pack(v) (v) +#define val_unpack(v) (v) +#endif + +#if PS == 6 +const int hp = P/2; +#else +const float hp = int(P/2) - 0.5*(1-(P%2)); // sample between pixels for even patch sizes +#endif + +#if RS == 6 +const int hr = R/2; +#else +const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes +#endif + +// patch/research shapes +// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R) +// dots (.) represent samples (pixels) and X represents the pixel-of-interest + +// Z ..... +// Z ..... +// Z ..X.. +// Z ..... +// Z ..... +#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr) + +// (in this instance Z=4) +// Z .... +// Z .... +// Z ..X. +// Z .... +#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr) + +// Z-4 . +// Z-2 ... +// Z ..X.. +#define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr) + +// Z-4 . +// Z-2 ... 
+// hz+1 ..X +#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr) +#define S_TRIANGLE_A(hz,Z) int(hz*hz+Z) + +// Z-4 . +// Z-2 ... +// Z ..X.. +// Z-2 ... +// Z-4 . +#define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr) +#define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z) + +// +// Z ..X.. +// +#define S_HORIZONTAL(z,hz,incr) for (z.y = 0; z.y <= 0; z.y++) for (z.x = -hz; z.x <= hz; incr) + +// 90 degree rotation of S_HORIZONTAL +#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr) + +// 1 . +// 1 . +// Z ..X.. +// 1 . +// 1 . +#define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr) +#define S_PLUS_A(hz,Z) (Z*2 - 1) + +// 3 . . . +// 3 ... +// Z ..X.. +// 3 ... +// 3 . . . +#define S_PLUS_X(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(z.x) + -hz * int(z.x == 0); z.y <= abs(z.x) + hz * int(z.x == 0); incr) +#define S_PLUS_X_A(hz,Z) (Z*4 - 3) + +// 1x1 square +#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++) + +#define T1 (T+1) +#define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++) + +#ifdef LUMA_raw +#define _INJ_RF_ _INJ_RF_LUMA +#else +#define _INJ_RF_ RF +#endif + +// donut increment, increments without landing on (0,0,0) +// much faster than a continue statement +#define DINCR(z,c,a) ((z.c += a),(z.c += int(z == vec3(0)))) + +#define R_AREA(a) (a * T1 - 1) + +// research shapes +// XXX would be nice to have the option of temporally-varying research sizes +#if R == 0 || R == 1 +#define FOR_RESEARCH(r) S_1X1(r) +const int r_area = R_AREA(1); +#elif RS == 8 +#define FOR_RESEARCH(r) S_PLUS_X(r,hr,DINCR(r,y,max(1,abs(r.x)))) +const int r_area = R_AREA(S_PLUS_X_A(hr,R)); +#elif RS == 7 +#define FOR_RESEARCH(r) S_PLUS(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(S_PLUS_A(hr,R)); +#elif RS 
== 6 +#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(R*R); +#elif RS == 5 +#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,DINCR(r,x,1)) +const int r_area = R_AREA(S_TRIANGLE_A(hr,hr)); +#elif RS == 4 +#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,DINCR(r,x,1)) +const int r_area = R_AREA(S_TRIANGLE_A(hr,R)); +#elif RS == 3 +#define FOR_RESEARCH(r) S_DIAMOND(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(S_DIAMOND_A(hr,R)); +#elif RS == 2 +#define FOR_RESEARCH(r) S_VERTICAL(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(R); +#elif RS == 1 +#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,DINCR(r,x,1)) +const int r_area = R_AREA(R); +#elif RS == 0 +#define FOR_RESEARCH(r) S_SQUARE(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(R*R); +#endif + +#define RI1 (RI+1) +#define RFI1 (RFI+1) + +#if RI +#define FOR_ROTATION for (float ri = 0; ri < 360; ri+=360.0/RI1) +#else +#define FOR_ROTATION +#endif + +#if RFI +#define FOR_REFLECTION for (int rfi = 0; rfi < RFI1; rfi++) +#else +#define FOR_REFLECTION +#endif + +#if PD +#define PINCR DINCR +#else +#define PINCR(z,c,a) (z.c += a) +#endif + +#define P_AREA(a) (a - PD) + +// patch shapes +#if P == 0 || P == 1 +#define FOR_PATCH(p) S_1X1(p) +const int p_area = P_AREA(1); +#elif PS == 8 +#define FOR_PATCH(p) S_PLUS_X(p,hp,PINCR(p,y,max(1,abs(p.x)))) +const int p_area = P_AREA(S_PLUS_X_A(hp,P)); +#elif PS == 7 +#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y,1)) +const int p_area = P_AREA(S_PLUS_A(hp,P)); +#elif PS == 6 +#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y,1)) +const int p_area = P_AREA(P*P); +#elif PS == 5 +#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x,1)) +const int p_area = P_AREA(S_TRIANGLE_A(hp,hp)); +#elif PS == 4 +#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x,1)) +const int p_area = P_AREA(S_TRIANGLE_A(hp,P)); +#elif PS == 3 +#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y,1)) +const int p_area = P_AREA(S_DIAMOND_A(hp,P)); +#elif PS == 2 +#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y,1)) 
+const int p_area = P_AREA(P); +#elif PS == 1 +#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x,1)) +const int p_area = P_AREA(P); +#elif PS == 0 +#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y,1)) +const int p_area = P_AREA(P*P); +#endif + +const float r_scale = 1.0/r_area; +const float p_scale = 1.0/p_area; + +#if SAMPLE == 0 +#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * (vec2(off) + 0.5 - fract((pos) * (size)))) +#else +#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * vec2(off)) +#endif + +#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off) + +#if _INJ_RF_ && defined(LUMA_raw) +#define load2_(off) sample(_INJ_RF_LUMA_tex, _INJ_RF_LUMA_pos, _INJ_RF_LUMA_size, _INJ_RF_LUMA_pt, off) +#define gather_offs(off, off_arr) (_INJ_RF_LUMA_mul * vec4(textureGatherOffsets(_INJ_RF_LUMA_raw, _INJ_RF_LUMA_pos + vec2(off) * _INJ_RF_LUMA_pt, off_arr))) +#define gather(off) _INJ_RF_LUMA_gather(_INJ_RF_LUMA_pos + (off) * _INJ_RF_LUMA_pt, 0) +#elif _INJ_RF_ && D1W +#define load2_(off) sample(_INJ_RF_tex, _INJ_RF_pos, _INJ_RF_size, _INJ_RF_pt, off) +#define gather_offs(off, off_arr) (_INJ_RF_mul * vec4(textureGatherOffsets(_INJ_RF_raw, _INJ_RF_pos + vec2(off) * _INJ_RF_pt, off_arr))) +#define gather(off) _INJ_RF_gather(_INJ_RF_pos + (off) * _INJ_RF_pt, 0) +#elif _INJ_RF_ +#define load2_(off) sample(_INJ_RF_tex, _INJ_RF_pos, _INJ_RF_size, _INJ_RF_pt, off) +#else +#define load2_(off) load_(off) +#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr))) +#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0) +#endif + +#if T +val load(vec3 off) +{ + switch (min(int(off.z), frame)) { + case 0: return val_swizz(load_(off)); + + } +} +val load2(vec3 off) +{ + return off.z == 0 ? 
val_swizz(load2_(off)) : load(off); +} +#else +#define load(off) val_swizz(load_(off)) +#define load2(off) val_swizz(load2_(off)) +#endif + +val poi2 = load2(vec3(0)); // guide pixel-of-interest +#if GUIDE_INPUT +#define poi poi2 +#else +vec4 poi_ = load_(vec3(0)); +val poi = val_swizz(poi_); // pixel-of-interest +#endif + +#if RI // rotation +vec2 rot(vec2 p, float d) // rotate p about the origin by d degrees +{ + return vec2( + p.x * cos(radians(d)) - p.y * sin(radians(d)), + p.x * sin(radians(d)) + p.y * cos(radians(d)) // second row of the 2D rotation matrix: sin on x, cos on y + ); +} +#else +#define rot(p, d) (p) +#endif + +#if RFI // reflection +vec2 ref(vec2 p, int d) +{ + switch (d) { + case 0: return p; + case 1: return p * vec2(1, -1); + case 2: return p * vec2(-1, 1); + } +} +#else +#define ref(p, d) (p) +#endif + +#if SST && R >= SST +float spatial_r(vec3 v) +{ + v.xy += 0.5 - fract(HOOKED_pos*HOOKED_size); + return SK(length(v*SD)*SS); +} +#else +#define spatial_r(v) (1) +#endif + +#if PST && P >= PST +#define spatial_p(v) PSK(length(v*PSD)*PSS) +#else +#define spatial_p(v) (1) +#endif + +val range(val pdiff_sq) +{ + const float h = max(S, 0.0) * 0.013; + const float pdiff_scale = 1.0/(h*h); + pdiff_sq = sqrt(pdiff_sq * pdiff_scale); +#if defined(LUMA_raw) + return RK(pdiff_sq); +#elif defined(CHROMA_raw) + return vec2(RK(pdiff_sq.x), RK(pdiff_sq.y)); +#else + return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z)); +#endif +} -vec4 hook() +val patch_comparison(vec3 r, vec3 r2) { - return HOOKED_texOff(0); + vec3 p; + val min_rot = val(p_area); + + FOR_ROTATION FOR_REFLECTION { + val pdiff_sq = val(0); + FOR_PATCH(p) { + vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z); + val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF); + diff_sq *= diff_sq; + diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy); + pdiff_sq += diff_sq; + } + min_rot = min(min_rot, pdiff_sq); + } + + return min_rot * p_scale; } -//!HOOK LUMA -//!HOOK CHROMA -//!BIND _INJ_PREI -//!WIDTH HOOKED.w -//!HEIGHT HOOKED.h -//!DESC Guided filter (I) -//!SAVE _INJ_I +#define
NO_GATHER (PD == 0 && NG == 0 && SAMPLE == 0) // never textureGather if any of these conditions are false +#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3 || RI == 7) -vec4 hook() +#if (defined(LUMA_gather) || D1W) && ((PS == 0 || ((PS == 3 || PS == 7) && RI != 7) || PS == 8) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER +// 3x3 diamond/plus patch_comparison_gather +// XXX extend to support arbitrary sizes (probably requires code generation) +// XXX support PSS +const ivec2 offsets_adj[4] = { ivec2(0,-1), ivec2(1,0), ivec2(0,1), ivec2(-1,0) }; +const ivec2 offsets_adj_sf[4] = { ivec2(0,-1) * SF, ivec2(1,0) * SF, ivec2(0,1) * SF, ivec2(-1,0) * SF }; +vec4 poi_patch_adj = gather_offs(0, offsets_adj); +#if PS == 0 || PS == 8 +const ivec2 offsets_diag[4] = { ivec2(-1,-1), ivec2(1,-1), ivec2(1,1), ivec2(-1,1) }; +const ivec2 offsets_diag_sf[4] = { ivec2(-1,-1) * SF, ivec2(1,-1) * SF, ivec2(1,1) * SF, ivec2(-1,1) * SF }; +vec4 poi_patch_diag = gather_offs(0, offsets_diag); +#endif +float patch_comparison_gather(vec3 r, vec3 r2) { -return _INJ_PREI_texOff(0); -} - + float min_rot = p_area - 1; + vec4 transformer_adj = gather_offs(r, offsets_adj_sf); +#if PS == 0 || PS == 8 + vec4 transformer_diag = gather_offs(r, offsets_diag_sf); +#endif + FOR_ROTATION { + FOR_REFLECTION { +#if RFI + /* xxy + * w y + * wzz + */ + switch(rfi) { + case 1: + transformer_adj = transformer_adj.zyxw; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.zyxw; +#endif + break; + case 2: + transformer_adj = transformer_adj.xwzy; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.xwzy; +#endif + break; + } +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (P) -//!BIND HOOKED -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_P + vec4 diff = poi_patch_adj - transformer_adj; +#if PS == 0 || PS == 8 + diff += poi_patch_diag - transformer_diag; +#endif + float diff_sq = dot(diff * diff, vec4(1)); + min_rot = min(diff_sq, min_rot); -vec4 hook() +// 
un-reflect +#if RFI + switch(rfi) { + case 1: + transformer_adj = transformer_adj.zyxw; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.zyxw; +#endif + break; + case 2: + transformer_adj = transformer_adj.xwzy; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.xwzy; +#endif + break; + } +#endif + } // FOR_REFLECTION +#if RI == 7 + transformer_adj = transformer_adj.wxyz; + // swap adjacents for diagonals + transformer_adj += transformer_diag; + transformer_diag = transformer_adj - transformer_diag; + transformer_adj -= transformer_diag; +#elif RI == 3 + transformer_adj = transformer_adj.wxyz; +#elif RI == 1 + transformer_adj = transformer_adj.zwxy; +#endif +#if RI == 3 && (PS == 0 || PS == 8) + transformer_diag = transformer_diag.wxyz; +#elif RI == 1 && (PS == 0 || PS == 8) + transformer_diag = transformer_diag.zwxy; +#endif + } // FOR_ROTATION + float center_diff = poi2.x - load2(r).x; + return (center_diff * center_diff + min_rot) * p_scale; +} +#elif (defined(LUMA_gather) || D1W) && PS == 4 && P == 3 && RI == 0 && RFI == 0 && NO_GATHER +const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,0), ivec2(1,0) }; +const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,0) * SF, ivec2(1,0) * SF }; +vec4 poi_patch = gather_offs(0, offsets); +float patch_comparison_gather(vec3 r, vec3 r2) { - return HOOKED_texOff(0); + vec4 pdiff = poi_patch - gather_offs(r, offsets_sf); + return dot(pdiff * pdiff, vec4(1)) * p_scale; } - -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (MEANI) -//!BIND _INJ_I -//!WIDTH _INJ_I.w 1.5 / -//!HEIGHT _INJ_I.h 1.5 / -//!SAVE _INJ_MEANI - -vec4 hook() +#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER +// tiled even square patch_comparison_gather +// XXX extend to support odd square? 
+float patch_comparison_gather(vec3 r, vec3 r2) { -return _INJ_I_texOff(0); + vec2 tile; + float min_rot = p_area; + + /* gather order: + * w z + * x y + */ + float pdiff_sq = 0; + for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) { + vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy); + diff_sq *= diff_sq; + diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)), + spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0))); + pdiff_sq += dot(diff_sq, vec4(1)); + } + min_rot = min(min_rot, pdiff_sq); + + return min_rot * p_scale; } - -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (MEANP) -//!BIND _INJ_P -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_MEANP +#else +#define patch_comparison_gather patch_comparison +#endif vec4 hook() { -return _INJ_P_texOff(0); -} + val total_weight = val(0); + val sum = val(0); + val result = val(0); -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (_INJ_I_SQ) -//!BIND _INJ_I -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_I_SQ + vec3 r = vec3(0); + vec3 p = vec3(0); + vec3 me = vec3(0); -vec4 hook() -{ -return _INJ_I_texOff(0) * _INJ_I_texOff(0); -} +#if T && ME == 1 // temporal & motion estimation + vec3 me_tmp = vec3(0); + float maxweight = 0; +#elif T && ME == 2 // temporal & motion estimation + vec3 me_sum = vec3(0); + float me_weight = 0; +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (_INJ_IXP) -//!BIND _INJ_I -//!BIND _INJ_P -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_IXP +#if AS + val total_weight_s = val(0); + val sum_s = val(0); +#endif -vec4 hook() -{ -return _INJ_I_texOff(0) * _INJ_P_texOff(0); -} +#if WD == 2 // weight discard (mean) + int r_index = 0; + val_packed all_weights[r_area]; + val_packed all_pixels[r_area]; +#elif WD == 1 // weight discard (moving cumulative average) + int r_iter = 1; + val wd_total_weight = val(0); + val wd_sum = val(0); +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!DESC 
Guided filter (CORRI) -//!BIND _INJ_I_SQ -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_CORRI + FOR_FRAME(r) { + // XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired) +#if T && ME == 1 // temporal & motion estimation max weight + if (r.z > 0) { + me += me_tmp * MEF; + me_tmp = vec3(0); + maxweight = 0; + } +#elif T && ME == 2 // temporal & motion estimation weighted average + if (r.z > 0) { + me += round(me_sum / me_weight * MEF); + me_sum = vec3(0); + me_weight = 0; + } +#endif + FOR_RESEARCH(r) { + // r coords with appropriate transformations applied + vec3 tr = vec3(r.xy + floor(r.xy * RSF), r.z); + float spatial_weight = spatial_r(tr); + tr.xy += me.xy; -vec4 hook() -{ -return _INJ_I_SQ_texOff(0); -} + val px = load(tr); -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (CORRP) -//!BIND _INJ_IXP -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_CORRP +#if SKIP_PATCH + val weight = val(1); +#else + val pdiff_sq = (r.z == 0) ? 
val(patch_comparison_gather(tr, vec3(0))) : patch_comparison(tr, vec3(0)); + val weight = range(pdiff_sq); +#endif -vec4 hook() -{ -return _INJ_I_SQ_texOff(0); -} +#if T && ME == 1 // temporal & motion estimation max weight + me_tmp = vec3(tr.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x)); + maxweight = max(maxweight, weight.x); +#elif T && ME == 2 // temporal & motion estimation weighted average + me_sum += vec3(tr.xy,0) * weight.x; + me_weight += weight.x; +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (A) -//!BIND _INJ_MEANI -//!BIND _INJ_MEANP -//!BIND _INJ_CORRI -//!BIND _INJ_CORRP -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_A +#if D1W + weight = val(weight.x); +#endif -#define E 0.0013 + weight *= spatial_weight; -vec4 hook() -{ -vec4 var = _INJ_CORRI_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANI_texOff(0); -vec4 cov = _INJ_CORRP_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANP_texOff(0); - return cov / (var + E); -} +#if AS + spatial_weight *= int(r.z == 0); // ignore temporal + sum_s += px * spatial_weight; + total_weight_s += spatial_weight; +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (B) -//!BIND _INJ_A -//!BIND _INJ_MEANI -//!BIND _INJ_MEANP -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_B +#if WD == 2 // weight discard (mean) + all_weights[r_index] = val_pack(weight); + all_pixels[r_index] = val_pack(px); + r_index++; +#elif WD == 1 // weight discard (moving cumulative average) + val wd_scale = val(1.0/r_iter); + val below_threshold = WDS * abs(min(val(0.0), weight - (total_weight * wd_scale * WDT * WD1TK(sqrt(wd_scale*WDP))))); +#if defined(LUMA_raw) + val wdkf = WDK(below_threshold); +#elif defined(CHROMA_raw) + val wdkf = vec2(WDK(below_threshold.x), WDK(below_threshold.y)); +#else + val wdkf = vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); // one kernel factor per channel, matching the WD == 2 path +#endif + wd_sum += px * weight * wdkf; + wd_total_weight += weight * wdkf; + r_iter++; +#endif -vec4 hook()
-{ -return _INJ_MEANP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANI_texOff(0); -} + sum += px * weight; + total_weight += weight; + } // FOR_RESEARCH + } // FOR_FRAME -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (MEANA) -//!BIND _INJ_A -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_MEANA + val avg_weight = total_weight * r_scale; + val old_avg_weight = avg_weight; -vec4 hook() -{ -return _INJ_A_texOff(0); -} +#if WD == 2 // weight discard (mean) + total_weight = val(0); + sum = val(0); -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (MEANB) -//!BIND _INJ_B -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_MEANB + for (int i = 0; i < r_area; i++) { + val weight = val_unpack(all_weights[i]); + val px = val_unpack(all_pixels[i]); -vec4 hook() -{ -return _INJ_B_texOff(0); -} + val below_threshold = WDS * abs(min(val(0.0), weight - (avg_weight * WDT))); +#if defined(LUMA_raw) + weight *= WDK(below_threshold); +#elif defined(CHROMA_raw) + weight *= vec2(WDK(below_threshold.x), WDK(below_threshold.y)); +#else + weight *= vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter -//!BIND HOOKED -//!BIND _INJ_MEANA -//!BIND _INJ_MEANB -//!SAVE RF_LUMA + sum += px * weight; + total_weight += weight; + } +#elif WD == 1 // weight discard (moving cumulative average) + total_weight = wd_total_weight; + sum = wd_sum; +#endif +#if WD // weight discard + avg_weight = total_weight * r_scale; +#endif -vec4 hook() -{ -return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0); + total_weight += SW * spatial_r(vec3(0)); + sum += poi * SW * spatial_r(vec3(0)); + result = val(sum / total_weight); + + // store frames for temporal +#if T > 1 + +#endif +#if T && TRF + imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result)); +#elif T + imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2)); +#endif + +#if AS == 1 // sharpen+denoise +#define 
AS_base result +#elif AS == 2 // sharpen only +#define AS_base poi +#endif +#if AS + val usm = result - sum_s/total_weight_s; + usm = exp(log(abs(usm))*ASP) * sign(usm); // avoiding pow() since it's buggy on nvidia + usm *= gaussian(abs((AS_base + usm - 0.5) / 1.5) * ASA); + usm *= ASF; + result = AS_base + usm; +#endif + +#if EP // extremes preserve + float luminance = EP_texOff(0).x; + // EPSILON is needed since pow(0,0) is undefined + float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP)); + result = mix(poi, result, ep_weight); +#else + float ep_weight = 0; +#endif + +#if V == 1 + result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0); +#elif V == 2 + result = (poi - result) * 0.5 + 0.5; +#elif V == 3 // post-WD weight map + result = avg_weight; +#elif V == 4 // pre-WD edge map + result = old_avg_weight; +#elif V == 5 + result = 0.5 + usm; +#elif V == 6 + result = val(1 - ep_weight); +#endif + +// XXX visualize chroma for these +#if defined(CHROMA_raw) && (V == 3 || V == 4 || V == 6) + return vec4(0.5); +#endif + + return unval(mix(poi, result, BF)); } -// End of source code injected from guided.glsl +// End of source code injected from ../nlmeans.glsl //!HOOK LUMA //!HOOK CHROMA @@ -328,26 +2276,12 @@ vec4 hook() return RF_LUMA_texOff(0); } -//!HOOK LUMA -//!HOOK CHROMA -//!BIND LUMA -//!WIDTH LUMA.w 3 / -//!HEIGHT LUMA.h 3 / -//!DESC Non-local means (EP) -//!SAVE EP - -vec4 hook() -{ - return LUMA_texOff(0); -} - //!HOOK LUMA //!HOOK CHROMA //!BIND HOOKED //!BIND RF_LUMA //!BIND RF -//!BIND EP -//!DESC Non-local means (nlmeans_hqx.glsl) +//!DESC Non-local means (nlmeans.glsl) // User variables @@ -358,47 +2292,35 @@ vec4 hook() #ifdef LUMA_raw #define S 2.25 #else -#define S 5.0 +#define S 2.5168955531436197 #endif /* Adaptive sharpening * - * Uses the blur incurred by denoising to perform an unsharp mask, and uses the - * weight map to restrict the sharpening to edges. 
- * - * If you just want to increase/decrease sharpness then you want to change ASF. + * Performs an unsharp mask by subtracting the spatial kernel's blur from the + * NLM blur. For sharpen+denoise the sharpening is limited to edge areas and + * denoising is done everywhere else. * * Use V=4 to visualize which areas are sharpened (black means sharpen). * * AS: - * - 0 to disable - * - 1 to sharpen+denoise - * - 2 to sharpen only + * - 0: disable + * - 1: sharpen+denoise + * - 2: sharpen only * ASF: Higher numbers make a sharper image - * ASP: Higher numbers use more of the sharp image - * ASW: - * - 0 to use pre-WD weights - * - 1 to use post-WD weights (ASP should be ~2x to compensate) - * ASK: Weight kernel: - * - 0 for power. This is the old method. - * - 1 for sigmoid. This is generally recommended. - * - 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image) - * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map + * ASA: Anti-ringing, higher numbers increase strength + * ASP: Power, lower numbers increase sharpening on lower frequency detail */ #ifdef LUMA_raw #define AS 0 -#define ASF 3.0 -#define ASP 1 -#define ASW 0 -#define ASK 1 -#define ASC 0.0 +#define ASF 0.1625 +#define ASA 5.0 +#define ASP 0.5 #else #define AS 0 -#define ASF 3.0 -#define ASP 1.0 -#define ASW 0 -#define ASK 1 -#define ASC 0.0 +#define ASF 0.1625 +#define ASA 5.0 +#define ASP 0.5 #endif /* Starting weight @@ -409,52 +2331,57 @@ vec4 hook() * EPSILON should be used instead of zero to avoid divide-by-zero errors. */ #ifdef LUMA_raw -#define SW 1.0 +#define SW 1.3011446081346498 #else -#define SW 0.5 +#define SW 1.2219854377433914 #endif /* Weight discard * - * Discard weights that fall below a fraction of the average weight. This culls - * the most dissimilar samples from the blur, yielding a much more pleasant - * result, especially around edges. + * Reduces weights that fall below a fraction of the average weight. 
This culls + * the most dissimilar samples from the blur, which can yield a better result, + * especially around edges. * * WD: - * - 2: True average. Better quality, but slower and requires GLSL 4.0 or later - * - 1: Moving cumulative average. Inaccurate, tends to blur directionally. + * - 2: Mean. Better quality, but slower and requires GLSL 4.0 or later + * - 1: Moving cumulative average. Fast but inaccurate, blurs directionally. * - 0: Disable * * WDT: Threshold coefficient, higher numbers discard more * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes + * WDS (not for WDK=is_zero): Higher numbers are more eager to reduce weights */ #ifdef LUMA_raw #define WD 2 -#define WDT 0.5 -#define WDP 6.0 +#define WDT 0.11671341022864548 +#define WDP 5.381278367349288 +#define WDS 1.0 #else -#define WD 2 -#define WDT 0.75 -#define WDP 6.0 +#define WD 0 +#define WDT 0.002713346103131793 +#define WDP 5.832936323930807 +#define WDS 1.0 #endif /* Extremes preserve * - * Reduces denoising around very bright/dark areas. + * Reduce denoising in very bright/dark areas. + * + * Disabled by default now. If you want to reenable this, set EP=3/ in + * Makefile.nlm and rebuild. * * The downscaling factor of the EP shader stage affects what is considered a - * bright/dark area. The default of 3 should be fine, it's not recommended to - * change this. + * bright/dark area. * * This is incompatible with RGB. If you have RGB hooks enabled then you will * have to delete the EP shader stage or specify EP=0 through shader_cfg. 
* * EP: 1 to enable, 0 to disable - * DP: EP strength on dark patches, 0 to fully denoise - * BP: EP strength on bright patches, 0 to fully denoise + * DP: EP strength on dark areas, 0 to fully denoise + * BP: EP strength on bright areas, 0 to fully denoise */ #ifdef LUMA_raw -#define EP 1 +#define EP 0 #define BP 0.75 #define DP 0.25 #else @@ -469,12 +2396,26 @@ vec4 hook() /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ +/* textureGather applicable configurations: + * + * - PS={0,3,7,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2} + * - PS={0,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2} + * - PS=6:RI=0:RFI=0 + * - Currently the only scalable variant + * + * Options which always disable textureGather: + * - NG + * - SAMPLE + * - PD + * + * Running without textureGather may be much slower. + */ + /* Patch & research sizes * - * Patch size should be an odd number greater than or equal to 3. Higher values - * are slower and not always better. + * P should be an odd number. Higher values are slower and not always better. * - * Research size be an odd number greater than or equal to 3. Higher values are + * R should be an odd number greater than or equal to 3. Higher values are * generally better, but slower, blurrier, and gives diminishing returns. */ #ifdef LUMA_raw @@ -492,8 +2433,6 @@ vec4 hook() * * PS applies applies to patches, RS applies to research zones. 
* - * Be wary of gather optimizations (see the Regarding Speed comment at the top) - * * 0: square (symmetrical) * 1: horizontal line (asymmetric) * 2: vertical line (asymmetric) @@ -502,6 +2441,7 @@ vec4 hook() * 5: truncated triangle (asymmetric on two axis, last row halved) * 6: even sized square (asymmetric on two axis) * 7: plus (symmetrical) + * 8: plus X (symmetrical) */ #ifdef LUMA_raw #define RS 3 @@ -516,8 +2456,8 @@ vec4 hook() * This setting is dependent on code generation from shader_cfg, so this * setting can only be enabled via shader_cfg. * - * Compares the pixel-of-interest against a guide, which could be a downscaled - * image or the output of another shader + * Computes weights on a guide, which could be a downscaled image or the output + * of another shader, and applies the weights to the original image */ #define RF_LUMA 1 #define RF 1 @@ -531,6 +2471,9 @@ vec4 hook() * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc. * + * Consider setting SAMPLE=1 if setting RI to a setting that would require + * sampling between pixels. 
+ * * RI: Rotational invariance * RFI (0 to 2): Reflectional invariance */ @@ -598,14 +2541,14 @@ vec4 hook() */ #ifdef LUMA_raw #define SST 1 -#define SS 0.25 +#define SS 0.5296176863733414 #define SD vec3(1,1,1) #define PST 0 #define PSS 0.0 #define PSD vec2(1,1) #else #define SST 1 -#define SS 0.25 +#define SS 0.26295970436981203 #define SD vec3(1,1,1) #define PST 0 #define PSS 0.0 @@ -617,6 +2560,8 @@ vec4 hook() * SK: spatial kernel * RK: range kernel (takes patch differences) * PSK: intra-patch spatial kernel + * WDK: weight discard kernel + * WD1TK (WD=1 only): weight discard tolerance kernel * * List of available kernels: * @@ -624,18 +2569,51 @@ vec4 hook() * cos * gaussian * lanczos - * quadratic + * quadratic_ (unclamped) * sinc + * sinc_ (unclamped) + * sinc3 * sphinx + * sphinx_ (unclamped) + * triangle_ (unclamped) + * triangle */ #ifdef LUMA_raw #define SK gaussian #define RK gaussian #define PSK gaussian +#define WDK is_zero +#define WD1TK gaussian #else #define SK gaussian #define RK gaussian #define PSK gaussian +#define WDK is_zero +#define WD1TK gaussian +#endif + +/* Sampling method + * + * In most cases this shouldn't make any difference, only set to bilinear if + * it's necessary to sample between pixels (e.g., RI=2). + * + * 0: nearest neighbor + * 1: bilinear + */ +#ifdef LUMA_raw +#define SAMPLE 0 +#else +#define SAMPLE 0 +#endif + +/* Research scaling factor + * + * Higher numbers sample more sparsely as the distance from the POI grows. 
+ */ +#ifdef LUMA_raw +#define RSF 0.0 +#else +#define RSF 0.0 #endif // Scaling factor (should match WIDTH/HEIGHT) @@ -645,13 +2623,22 @@ vec4 hook() #define SF 1 #endif +// Use the guide image as the input image +#ifdef LUMA_raw +#define GUIDE_INPUT 0 +#else +#define GUIDE_INPUT 0 +#endif + /* Visualization * * 0: off * 1: absolute difference between input/output to the power of 0.25 * 2: difference between input/output centered on 0.5 - * 3: avg_weight - * 4: edge map (based on the relevant AS settings) + * 3: post-WD weight map + * 4: pre-WD weight map + * 5: unsharp mask + * 6: EP */ #ifdef LUMA_raw #define V 0 @@ -696,37 +2683,44 @@ vec4 hook() // Shader code -#define EPSILON 0.00000000001 +#define EPSILON 1.2e-38 #define M_PI 3.14159265358979323846 #define POW2(x) ((x)*(x)) #define POW3(x) ((x)*(x)*(x)) -#define bicubic(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0)))) +#define bicubic_(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0)))) +#define bicubic(x) bicubic_(clamp((x), 0.0, 2.0)) #define gaussian(x) exp(-1 * POW2(x)) -#define lanczos(x) POW2(sinc(x)) -#define quadratic(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5)) -#define sinc(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI)) -#define sphinx(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI)) +#define quadratic_(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5)) +#define quadratic(x) quadratic_(clamp((x), 0.0, 1.5)) +#define sinc_(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI)) +#define sinc(x) sinc_(clamp((x), 0.0, 1.0)) +#define sinc3(x) sinc_(clamp((x), 0.0, 3.0)) +#define lanczos(x) (sinc3(x) * sinc(x)) +#define sphinx_(x) ((x) < 1e-8 ? 
1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI)) +#define sphinx(x) sphinx_(clamp((x), 0.0, 1.4302966531242027)) +#define triangle_(x) (1 - (x)) +#define triangle(x) triangle_(clamp((x), 0.0, 1.0)) +#define is_zero(x) int(x == 0) // XXX could maybe be better optimized on LGC -// XXX return original alpha component instead of 1.0 #if defined(LUMA_raw) #define val float #define val_swizz(v) (v.x) -#define unval(v) vec4(v.x, 0, 0, 1.0) +#define unval(v) vec4(v.x, 0, 0, poi_.a) #define val_packed val #define val_pack(v) (v) #define val_unpack(v) (v) #elif defined(CHROMA_raw) #define val vec2 #define val_swizz(v) (v.xy) -#define unval(v) vec4(v.x, v.y, 0, 1.0) +#define unval(v) vec4(v.x, v.y, 0, poi_.a) #define val_packed uint #define val_pack(v) packUnorm2x16(v) #define val_unpack(v) unpackUnorm2x16(v) #else #define val vec3 #define val_swizz(v) (v.xyz) -#define unval(v) vec4(v.x, v.y, v.z, 1.0) +#define unval(v) vec4(v.x, v.y, v.z, poi_.a) #define val_packed val #define val_pack(v) (v) #define val_unpack(v) (v) @@ -744,10 +2738,6 @@ const int hr = R/2; const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes #endif -// donut increment, increments without landing on (0,0,0) -// much faster than a continue statement -#define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0)))) - // patch/research shapes // each shape is depicted in a comment, where Z=5 (Z corresponds to P or R) // dots (.) represent samples (pixels) and X represents the pixel-of-interest @@ -788,7 +2778,7 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res // // Z ..X.. 
// -#define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++) +#define S_HORIZONTAL(z,hz,incr) for (z.y = 0; z.y <= 0; z.y++) for (z.x = -hz; z.x <= hz; incr) // 90 degree rotation of S_HORIZONTAL #define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr) @@ -801,19 +2791,13 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res #define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr) #define S_PLUS_A(hz,Z) (Z*2 - 1) -// XXX implement S_PLUS w/ an X overlayed: // 3 . . . // 3 ... // Z ..X.. // 3 ... // 3 . . . - -// XXX implement an X shape: -// 2 . . -// 2 . . -// 1 X -// 2 . . -// 2 . . +#define S_PLUS_X(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(z.x) + -hz * int(z.x == 0); z.y <= abs(z.x) + hz * int(z.x == 0); incr) +#define S_PLUS_X_A(hz,Z) (Z*4 - 3) // 1x1 square #define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++) @@ -827,43 +2811,43 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res #define RF_ RF #endif -// Skip comparing the pixel-of-interest against itself, unless RF is enabled -#if RF_ -#define RINCR(z,c) (z.c++) -#else -#define RINCR DINCR -#endif +// donut increment, increments without landing on (0,0,0) +// much faster than a continue statement +#define DINCR(z,c,a) ((z.c += a),(z.c += int(z == vec3(0)))) -#define R_AREA(a) (a * T1 + RF_-1) +#define R_AREA(a) (a * T1 - 1) // research shapes // XXX would be nice to have the option of temporally-varying research sizes #if R == 0 || R == 1 #define FOR_RESEARCH(r) S_1X1(r) const int r_area = R_AREA(1); +#elif RS == 8 +#define FOR_RESEARCH(r) S_PLUS_X(r,hr,DINCR(r,y,max(1,abs(r.x)))) +const int r_area = R_AREA(S_PLUS_X_A(hr,R)); #elif RS == 7 -#define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y)) +#define FOR_RESEARCH(r) S_PLUS(r,hr,DINCR(r,y,1)) const int r_area = R_AREA(S_PLUS_A(hr,R)); #elif 
RS == 6 -#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y)) +#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,DINCR(r,y,1)) const int r_area = R_AREA(R*R); #elif RS == 5 -#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x)) +#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,DINCR(r,x,1)) const int r_area = R_AREA(S_TRIANGLE_A(hr,hr)); #elif RS == 4 -#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x)) +#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,DINCR(r,x,1)) const int r_area = R_AREA(S_TRIANGLE_A(hr,R)); #elif RS == 3 -#define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y)) +#define FOR_RESEARCH(r) S_DIAMOND(r,hr,DINCR(r,y,1)) const int r_area = R_AREA(S_DIAMOND_A(hr,R)); #elif RS == 2 -#define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y)) +#define FOR_RESEARCH(r) S_VERTICAL(r,hr,DINCR(r,y,1)) const int r_area = R_AREA(R); #elif RS == 1 -#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x)) +#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,DINCR(r,x,1)) const int r_area = R_AREA(R); #elif RS == 0 -#define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y)) +#define FOR_RESEARCH(r) S_SQUARE(r,hr,DINCR(r,y,1)) const int r_area = R_AREA(R*R); #endif @@ -885,7 +2869,7 @@ const int r_area = R_AREA(R*R); #if PD #define PINCR DINCR #else -#define PINCR(z,c) (z.c++) +#define PINCR(z,c,a) (z.c += a) #endif #define P_AREA(a) (a - PD) @@ -894,36 +2878,44 @@ const int r_area = R_AREA(R*R); #if P == 0 || P == 1 #define FOR_PATCH(p) S_1X1(p) const int p_area = P_AREA(1); +#elif PS == 8 +#define FOR_PATCH(p) S_PLUS_X(p,hp,PINCR(p,y,max(1,abs(p.x)))) +const int p_area = P_AREA(S_PLUS_X_A(hp,P)); #elif PS == 7 -#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y)) +#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y,1)) const int p_area = P_AREA(S_PLUS_A(hp,P)); #elif PS == 6 -#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y)) +#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y,1)) const int p_area = P_AREA(P*P); #elif PS == 5 -#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x)) +#define FOR_PATCH(p) 
S_TRUNC_TRIANGLE(p,hp,PINCR(p,x,1)) const int p_area = P_AREA(S_TRIANGLE_A(hp,hp)); #elif PS == 4 -#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x)) +#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x,1)) const int p_area = P_AREA(S_TRIANGLE_A(hp,P)); #elif PS == 3 -#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y)) +#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y,1)) const int p_area = P_AREA(S_DIAMOND_A(hp,P)); #elif PS == 2 -#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y)) +#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y,1)) const int p_area = P_AREA(P); #elif PS == 1 -#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x)) +#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x,1)) const int p_area = P_AREA(P); #elif PS == 0 -#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y)) +#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y,1)) const int p_area = P_AREA(P*P); #endif const float r_scale = 1.0/r_area; const float p_scale = 1.0/p_area; -#define sample(tex, pos, size, pt, off) tex(pos + pt * (vec2(off) + 0.5 - fract(pos*size))) +#if SAMPLE == 0 +#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * (vec2(off) + 0.5 - fract((pos) * (size)))) +#else +#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * vec2(off)) +#endif + #define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off) #if RF_ && defined(LUMA_raw) @@ -959,8 +2951,13 @@ val load2(vec3 off) #define load2(off) val_swizz(load2_(off)) #endif -val poi = load(vec3(0)); // pixel-of-interest val poi2 = load2(vec3(0)); // guide pixel-of-interest +vec4 poi_ = load_(vec3(0)); // always declared: unval() reads poi_.a even when GUIDE_INPUT is set +#if GUIDE_INPUT +#define poi poi2 +#else +val poi = val_swizz(poi_); // pixel-of-interest +#endif #if RI // rotation vec2 rot(vec2 p, float d) @@ -1005,7 +3002,7 @@ float spatial_r(vec3 v) val range(val pdiff_sq) { - const float h = S*0.013; + const float h = max(S, 0.0) * 0.013; const float pdiff_scale = 1.0/(h*h); pdiff_sq = sqrt(pdiff_sq * pdiff_scale); #if defined(LUMA_raw) @@ -1015,10 +3012,6 @@ val range(val pdiff_sq) #else return 
vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z)); #endif - //return exp(-pdiff_sq * pdiff_scale); - - // weight function from the NLM paper, it's not very good - //return exp(-max(pdiff_sq - 2*S*S, 0.0) * pdiff_scale); } val patch_comparison(vec3 r, vec3 r2) @@ -1041,42 +3034,104 @@ val patch_comparison(vec3 r, vec3 r2) return min_rot * p_scale; } -#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false -#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3) +#define NO_GATHER (PD == 0 && NG == 0 && SAMPLE == 0) // never textureGather if any of these conditions are false +#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3 || RI == 7) -#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER +#if (defined(LUMA_gather) || D1W) && ((PS == 0 || ((PS == 3 || PS == 7) && RI != 7) || PS == 8) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER // 3x3 diamond/plus patch_comparison_gather // XXX extend to support arbitrary sizes (probably requires code generation) -// XXX extend to support 3x3 square // XXX support PSS -const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) }; -const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF }; -vec4 poi_patch = gather_offs(0, offsets); +const ivec2 offsets_adj[4] = { ivec2(0,-1), ivec2(1,0), ivec2(0,1), ivec2(-1,0) }; +const ivec2 offsets_adj_sf[4] = { ivec2(0,-1) * SF, ivec2(1,0) * SF, ivec2(0,1) * SF, ivec2(-1,0) * SF }; +vec4 poi_patch_adj = gather_offs(0, offsets_adj); +#if PS == 0 || PS == 8 +const ivec2 offsets_diag[4] = { ivec2(-1,-1), ivec2(1,-1), ivec2(1,1), ivec2(-1,1) }; +const ivec2 offsets_diag_sf[4] = { ivec2(-1,-1) * SF, ivec2(1,-1) * SF, ivec2(1,1) * SF, ivec2(-1,1) * SF }; +vec4 poi_patch_diag = gather_offs(0, offsets_diag); +#endif float patch_comparison_gather(vec3 r, vec3 r2) { float min_rot = p_area - 1; - vec4 transformer = 
gather_offs(r, offsets_sf); + vec4 transformer_adj = gather_offs(r, offsets_adj_sf); +#if PS == 0 || PS == 8 + vec4 transformer_diag = gather_offs(r, offsets_diag_sf); +#endif FOR_ROTATION { FOR_REFLECTION { - float diff_sq = dot((poi_patch - transformer) * (poi_patch - transformer), vec4(1)); +#if RFI + /* xxy + * w y + * wzz + */ + switch(rfi) { + case 1: + transformer_adj = transformer_adj.zyxw; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.zyxw; +#endif + break; + case 2: + transformer_adj = transformer_adj.xwzy; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.xwzy; +#endif + break; + } +#endif + + vec4 diff = POW2(poi_patch_adj - transformer_adj); // squared per-tap differences (adjacent taps) +#if PS == 0 || PS == 8 + diff += POW2(poi_patch_diag - transformer_diag); // square before summing so adjacent/diagonal errors cannot cancel +#endif + float diff_sq = dot(diff, vec4(1)); min_rot = min(diff_sq, min_rot); + +// un-reflect #if RFI switch(rfi) { - case 0: transformer = transformer.zyxw; break; - case 1: transformer = transformer.zwxy; break; // undoes last mirror, performs another mirror - case 2: transformer = transformer.zyxw; break; // undoes last mirror + case 1: + transformer_adj = transformer_adj.zyxw; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.zyxw; +#endif + break; + case 2: + transformer_adj = transformer_adj.xwzy; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.xwzy; +#endif + break; } #endif - } -#if RI == 3 - transformer = transformer.wxyz; + } // FOR_REFLECTION +#if RI == 7 + transformer_adj = transformer_adj.wxyz; + // swap adjacents for diagonals (exact swap via a temporary, no float rounding) + vec4 transformer_tmp = transformer_adj; + transformer_adj = transformer_diag; + transformer_diag = transformer_tmp; +#elif RI == 3 + transformer_adj = transformer_adj.wxyz; #elif RI == 1 - transformer = transformer.zwxy; + transformer_adj = transformer_adj.zwxy; #endif - } - float center_diff_sq = poi2.x - load2(r).x; - center_diff_sq *= center_diff_sq; - return (min_rot + center_diff_sq) * p_scale; +#if RI == 3 && (PS == 0 || PS == 8) 
+ transformer_diag = transformer_diag.wxyz; +#elif RI == 1 && (PS == 0 || PS == 8) + transformer_diag = transformer_diag.zwxy; +#endif + } // FOR_ROTATION + float center_diff = poi2.x - load2(r).x; + return (center_diff * center_diff + min_rot) * p_scale; +} +#elif (defined(LUMA_gather) || D1W) && PS == 4 && P == 3 && RI == 0 && RFI == 0 && NO_GATHER +const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,0), ivec2(1,0) }; +const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,0) * SF, ivec2(1,0) * SF }; +vec4 poi_patch = gather_offs(0, offsets); +float patch_comparison_gather(vec3 r, vec3 r2) +{ + vec4 pdiff = poi_patch - gather_offs(r, offsets_sf); + return dot(pdiff * pdiff, vec4(1)) * p_scale; } #elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER // tiled even square patch_comparison_gather @@ -1124,18 +3179,23 @@ vec4 hook() float me_weight = 0; #endif -#if WD == 2 // weight discard +#if AS + val total_weight_s = val(0); + val sum_s = val(0); +#endif + +#if WD == 2 // weight discard (mean) int r_index = 0; val_packed all_weights[r_area]; val_packed all_pixels[r_area]; -#elif WD == 1 // weight discard - val no_weights = val(0); - val discard_total_weight = val(0); - val discard_sum = val(0); +#elif WD == 1 // weight discard (moving cumulative average) + int r_iter = 1; + val wd_total_weight = val(0); + val wd_sum = val(0); #endif FOR_FRAME(r) { - // XXX ME is always a frame behind, should have to option to re-research after applying ME (could do it an arbitrary number of times per frame if desired) + // XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired) #if T && ME == 1 // temporal & motion estimation max weight if (r.z > 0) { me += me_tmp * MEF; @@ -1149,19 +3209,26 @@ vec4 hook() me_weight = 0; } #endif - FOR_RESEARCH(r) { // main NLM logic + FOR_RESEARCH(r) { + // r coords with appropriate 
transformations applied + vec3 tr = vec3(r.xy + floor(r.xy * RSF), r.z); + float spatial_weight = spatial_r(tr); + tr.xy += me.xy; + + val px = load(tr); + #if SKIP_PATCH val weight = val(1); #else - val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(r+me, vec3(0))) : patch_comparison(r+me, vec3(0)); + val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(tr, vec3(0))) : patch_comparison(tr, vec3(0)); val weight = range(pdiff_sq); #endif #if T && ME == 1 // temporal & motion estimation max weight - me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x)); + me_tmp = vec3(tr.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x)); maxweight = max(maxweight, weight.x); #elif T && ME == 2 // temporal & motion estimation weighted average - me_sum += vec3(r.xy,0) * weight.x; + me_sum += vec3(tr.xy,0) * weight.x; me_weight += weight.x; #endif @@ -1169,21 +3236,34 @@ vec4 hook() weight = val(weight.x); #endif - weight *= spatial_r(r); + weight *= spatial_weight; -#if WD == 2 // weight discard +#if AS + spatial_weight *= int(r.z == 0); // ignore temporal + sum_s += px * spatial_weight; + total_weight_s += spatial_weight; +#endif + +#if WD == 2 // weight discard (mean) all_weights[r_index] = val_pack(weight); - all_pixels[r_index] = val_pack(load(r+me)); + all_pixels[r_index] = val_pack(px); r_index++; -#elif WD == 1 // weight discard - val wd_scale = 1.0/max(no_weights, 1); - val keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight); - discard_sum += load(r+me) * weight * (1 - keeps); - discard_total_weight += weight * (1 - keeps); - no_weights += keeps; +#elif WD == 1 // weight discard (moving cumulative average) + val wd_scale = val(1.0/r_iter); + val below_threshold = WDS * abs(min(val(0.0), weight - (total_weight * wd_scale * WDT * WD1TK(sqrt(wd_scale*WDP))))); +#if defined(LUMA_raw) + val wdkf = WDK(below_threshold); +#elif defined(CHROMA_raw) + val wdkf = vec2(WDK(below_threshold.x), 
WDK(below_threshold.y)); +#else + val wdkf = vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); // third channel uses .z, matching the WD == 2 path +#endif + wd_sum += px * weight * wdkf; + wd_total_weight += weight * wdkf; + r_iter++; #endif - sum += load(r+me) * weight; + sum += px * weight; total_weight += weight; } // FOR_RESEARCH } // FOR_FRAME @@ -1191,37 +3271,37 @@ vec4 hook() val avg_weight = total_weight * r_scale; val old_avg_weight = avg_weight; -#if WD == 2 // true average +#if WD == 2 // weight discard (mean) total_weight = val(0); sum = val(0); - val no_weights = val(0); for (int i = 0; i < r_area; i++) { - val w = val_unpack(all_weights[i]); + val weight = val_unpack(all_weights[i]); val px = val_unpack(all_pixels[i]); - val keeps = step(avg_weight*WDT, w); - w *= keeps; - sum += px * w; - total_weight += w; - no_weights += keeps; + val below_threshold = WDS * abs(min(val(0.0), weight - (avg_weight * WDT))); +#if defined(LUMA_raw) + weight *= WDK(below_threshold); +#elif defined(CHROMA_raw) + weight *= vec2(WDK(below_threshold.x), WDK(below_threshold.y)); +#else + weight *= vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); +#endif + + sum += px * weight; + total_weight += weight; } -#elif WD == 1 // moving cumulative average - total_weight -= discard_total_weight; - sum -= discard_sum; +#elif WD == 1 // weight discard (moving cumulative average) + total_weight = wd_total_weight; + sum = wd_sum; #endif #if WD // weight discard - avg_weight = total_weight / no_weights; + avg_weight = total_weight * r_scale; #endif total_weight += SW * spatial_r(vec3(0)); sum += poi * SW * spatial_r(vec3(0)); - -#if V == 3 // weight map - result = val(avg_weight); -#else // mean result = val(sum / total_weight); -#endif // store frames for temporal #if T > 1 @@ -1233,27 +3313,17 @@ vec4 hook() imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2)); #endif -#if ASW == 0 // pre-WD weights -#define AS_weight old_avg_weight -#elif ASW == 1 // post-WD weights 
-#define AS_weight avg_weight -#endif - -#if ASK == 0 - val sharpening_strength = pow(AS_weight, val(ASP)); -#elif ASK == 1 - val sharpening_strength = mix( - pow(smoothstep(0.0, 1.0, AS_weight), val(ASP)), - AS_weight, ASC); - // XXX normalize the result to account for a negative ASC? -#elif ASK == 2 - val sharpening_strength = val(ASP); -#endif - #if AS == 1 // sharpen+denoise - val sharpened = result + (poi - result) * ASF; +#define AS_base result #elif AS == 2 // sharpen only - val sharpened = poi + (poi - result) * ASF; +#define AS_base poi +#endif +#if AS + val usm = result - sum_s/total_weight_s; + usm = exp(log(abs(usm))*ASP) * sign(usm); // avoiding pow() since it's buggy on nvidia + usm *= gaussian(abs((AS_base + usm - 0.5) / 1.5) * ASA); + usm *= ASF; + result = AS_base + usm; #endif #if EP // extremes preserve @@ -1261,26 +3331,27 @@ vec4 hook() // EPSILON is needed since pow(0,0) is undefined float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? 
DP : BP)); result = mix(poi, result, ep_weight); -#endif - -#if AS == 1 // sharpen+denoise - result = mix(sharpened, result, sharpening_strength); -#elif AS == 2 // sharpen only - result = mix(sharpened, poi, sharpening_strength); -#endif - -#if V == 4 // edge map - result = sharpening_strength; -#endif - -#if (V == 3 || V == 4) && defined(CHROMA_raw) // drop chroma for these visualizations - return vec4(0.5); +#else + float ep_weight = 0; #endif #if V == 1 result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0); #elif V == 2 result = (poi - result) * 0.5 + 0.5; +#elif V == 3 // post-WD weight map + result = avg_weight; +#elif V == 4 // pre-WD weight map + result = old_avg_weight; +#elif V == 5 && AS // unsharp mask; usm is only declared when AS is enabled + result = 0.5 + usm; +#elif V == 6 + result = val(1 - ep_weight); +#endif + +// XXX visualize chroma for these +#if defined(CHROMA_raw) && (V == 3 || V == 4 || V == 6) + return vec4(0.5); #endif return unval(mix(poi, result, BF)); diff --git a/portable_config/shaders/nlmeans_lgc.glsl b/portable_config/shaders/nlmeans_lgc.glsl deleted file mode 100644 index 384d3a88..00000000 --- a/portable_config/shaders/nlmeans_lgc.glsl +++ /dev/null @@ -1,1043 +0,0 @@ -/* vi: ft=c - * - * Based on vf_nlmeans.c from FFmpeg. - * - * Copyright (c) 2022 an3223 - * Copyright (c) 2016 Clément Bœsch - * - * This program is free software: you can redistribute it and/or modify it - * under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 2.1 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License - * for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program. If not, see . 
- */ - -// Description: nlmeans_lgc.glsl: Experimental luma-guided chroma denoising, kinda similar to KrigBilateral - -/* The recommended usage of this shader and its variant profiles is to add them - * to input.conf and then dispatch the appropriate shader via a keybind during - * media playback. Here is an example input.conf entry: - * - * F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl"; show-text "Non-local means (LUMA only)" - * - * These shaders can also be enabled by default in mpv.conf, for example: - * - * glsl-shaders='~~/shaders/nlmeans.glsl' - * - * Both of the examples above assume the shaders are located in a subdirectory - * named "shaders" within mpv's config directory. Refer to the mpv - * documentation for more details. - * - * This shader is highly configurable via user variables below. Although the - * default settings should offer good quality at a reasonable speed, you are - * encouraged to tweak them to your preferences. Be mindful that certain - * settings may greatly affect speed. - * - * Denoising is most useful for noisy content. If there is no perceptible - * noise, you probably won't see a positive difference. - * - * The default settings are generally tuned for low noise and high detail - * preservation. The "medium" and "heavy" profiles are tuned for higher levels - * of noise. - * - * The denoiser will not work properly if the content has been upscaled - * beforehand (whether it was done by you or not). In such cases, consider - * issuing a command to downscale in the mpv console (backtick ` key): - * - * vf toggle scale=-2:720 - * - * ...replacing 720 with whatever resolution seems appropriate. Rerun the - * command to undo the downscale. It may take some trial-and-error to find the - * proper resolution. - */ - -/* Regarding speed - * - * Speed may vary wildly for different vo and gpu-api settings. 
Generally - * vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this - * may be different for your system. - * - * If your GPU doesn't support textureGather, or if you are on a version of mpv - * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ profile - * - * If you plan on tinkering with NLM's settings, read below: - * - * textureGather only applies to luma and limited to the these configurations: - * - * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2} - * - Default, very fast, rotations and reflections should be free - * - If this is unusually slow then try changing gpu-api and vo - * - If it's still slow, try setting RI/RFI to 0. - * - * - PS=6:RI={0,1,3}:RFI={0,1,2} - * - Currently the only scalable variant - * - Patch shape is asymmetric on two axis - * - Rotations should have very little speed impact - * - Reflections may have a significant speed impact - * - * Options which always disable textureGather: - * - PD - * - NG - */ - -//!HOOK CHROMA -//!BIND LUMA -//!WIDTH LUMA.w -//!HEIGHT LUMA.h -//!DESC Non-local means (RF, share) -//!SAVE RF - -vec4 hook() -{ - return LUMA_texOff(0); -} - -//!HOOK CHROMA -//!BIND HOOKED -//!BIND RF -//!DESC Non-local means (nlmeans_lgc.glsl) - -// User variables - -// It is generally preferable to denoise luma and chroma differently, so the -// user variables for luma and chroma are split. - -// Denoising factor (level of blur, higher means more blur) -#ifdef LUMA_raw -#define S 11.66 -#else -#define S 11.66 -#endif - -/* Adaptive sharpening - * - * Uses the blur incurred by denoising to perform an unsharp mask, and uses the - * weight map to restrict the sharpening to edges. - * - * If you just want to increase/decrease sharpness then you want to change ASF. - * - * Use V=4 to visualize which areas are sharpened (black means sharpen). 
- * - * AS: - * - 0 to disable - * - 1 to sharpen+denoise - * - 2 to sharpen only - * ASF: Higher numbers make a sharper image - * ASP: Higher numbers use more of the sharp image - * ASW: - * - 0 to use pre-WD weights - * - 1 to use post-WD weights (ASP should be ~2x to compensate) - * ASK: Weight kernel: - * - 0 for power. This is the old method. - * - 1 for sigmoid. This is generally recommended. - * - 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image) - * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map - */ -#ifdef LUMA_raw -#define AS 0 -#define ASF 3.0 -#define ASP 1.0 -#define ASW 0 -#define ASK 1 -#define ASC 0.0 -#else -#define AS 0 -#define ASF 3.0 -#define ASP 1.0 -#define ASW 0 -#define ASK 1 -#define ASC 0.0 -#endif - -/* Starting weight - * - * Also known as the center weight. This represents the weight of the - * pixel-of-interest. Lower numbers may help handle heavy noise & ringing. - * - * EPSILON should be used instead of zero to avoid divide-by-zero errors. - */ -#ifdef LUMA_raw -#define SW 0.75 -#else -#define SW 0.75 -#endif - -/* Weight discard - * - * Discard weights that fall below a fraction of the average weight. This culls - * the most dissimilar samples from the blur, yielding a much more pleasant - * result, especially around edges. - * - * WD: - * - 2: True average. Better quality, but slower and requires GLSL 4.0 or later - * - 1: Moving cumulative average. Inaccurate, tends to blur directionally. - * - 0: Disable - * - * WDT: Threshold coefficient, higher numbers discard more - * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes - */ -#ifdef LUMA_raw -#define WD 0 -#define WDT 0.5 -#define WDP 6.0 -#else -#define WD 0 -#define WDT 0.75 -#define WDP 6.0 -#endif - -/* Extremes preserve - * - * Reduces denoising around very bright/dark areas. - * - * The downscaling factor of the EP shader stage affects what is considered a - * bright/dark area. 
The default of 3 should be fine, it's not recommended to - * change this. - * - * This is incompatible with RGB. If you have RGB hooks enabled then you will - * have to delete the EP shader stage or specify EP=0 through shader_cfg. - * - * EP: 1 to enable, 0 to disable - * DP: EP strength on dark patches, 0 to fully denoise - * BP: EP strength on bright patches, 0 to fully denoise - */ -#ifdef LUMA_raw -#define EP 0 -#define BP 0.75 -#define DP 0.25 -#else -#define EP 0 -#define BP 0.0 -#define DP 0.0 -#endif - -/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ -/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ -/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ -/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ -/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ - -/* Patch & research sizes - * - * Patch size should be an odd number greater than or equal to 3. Higher values - * are slower and not always better. - * - * Research size be an odd number greater than or equal to 3. Higher values are - * generally better, but slower, blurrier, and gives diminishing returns. - */ -#ifdef LUMA_raw -#define P 3 -#define R 5 -#else -#define P 3 -#define R 5 -#endif - -/* Patch and research shapes - * - * Different shapes have different speed and quality characteristics. Every - * shape (besides square) is smaller than square. - * - * PS applies applies to patches, RS applies to research zones. 
- * - * Be wary of gather optimizations (see the Regarding Speed comment at the top) - * - * 0: square (symmetrical) - * 1: horizontal line (asymmetric) - * 2: vertical line (asymmetric) - * 3: diamond (symmetrical) - * 4: triangle (asymmetric, pointing upward) - * 5: truncated triangle (asymmetric on two axis, last row halved) - * 6: even sized square (asymmetric on two axis) - * 7: plus (symmetrical) - */ -#ifdef LUMA_raw -#define RS 3 -#define PS 3 -#else -#define RS 3 -#define PS 3 -#endif - -/* Robust filtering - * - * This setting is dependent on code generation from shader_cfg, so this - * setting can only be enabled via shader_cfg. - * - * Compares the pixel-of-interest against a guide, which could be a downscaled - * image or the output of another shader - */ -#define RF_LUMA 0 -#define RF 1 - -/* Rotational/reflectional invariance - * - * Number of rotations/reflections to try for each patch comparison. Can be - * slow, but improves feature preservation. More rotations/reflections gives - * diminishing returns. The most similar rotation/reflection will be used. - * - * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a - * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc. - * - * RI: Rotational invariance - * RFI (0 to 2): Reflectional invariance - */ -#ifdef LUMA_raw -#define RI 3 -#define RFI 2 -#else -#define RI 3 -#define RFI 2 -#endif - -/* Temporal denoising - * - * This setting is dependent on code generation from shader_cfg, so this - * setting can only be enabled via shader_cfg. - * - * Caveats: - * - Slower: - * - Each frame needs to be researched (more samples & more math) - * - Gather optimizations only apply to the current frame - * - Requires vo=gpu-next - * - Luma-only (this is a bug) - * - Buggy - * - * May cause motion blur and may struggle more with noise that persists across - * multiple frames (e.g., from compression or duplicate frames), but can work - * very well on high quality video. 
- * - * Motion estimation (ME) should improve quality without impacting speed. - * - * T: number of frames used - * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg - * MEF: estimate factor, compensates for ME being one frame behind - * TRF: compare against the denoised frames - */ -#ifdef LUMA_raw -#define T 0 -#define ME 1 -#define MEF 2 -#define TRF 0 -#else -#define T 0 -#define ME 0 -#define MEF 2 -#define TRF 0 -#endif - -/* Spatial kernel - * - * Increasing the spatial denoising factor (SS) reduces the weight of further - * pixels. - * - * Spatial distortion instructs the spatial kernel to view that axis as - * closer/further, for instance SD=(1,1,0.5) would make the temporal axis - * appear closer and increase blur between frames. - * - * The intra-patch variants are supposed to help with larger patch sizes. - * - * SST: enables spatial kernel if R>=PST, 0 fully disables - * SS: spatial sigma - * SD: spatial distortion (X, Y, time) - * PSS: intra-patch spatial sigma - * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables - * PSD: intra-patch spatial distortion (X, Y) - */ -#ifdef LUMA_raw -#define SST 1 -#define SS 0.25 -#define SD vec3(1,1,1) -#define PST 0 -#define PSS 0.0 -#define PSD vec2(1,1) -#else -#define SST 1 -#define SS 0.25 -#define SD vec3(1,1,1) -#define PST 0 -#define PSS 0.0 -#define PSD vec2(1,1) -#endif - -/* Kernels - * - * SK: spatial kernel - * RK: range kernel (takes patch differences) - * PSK: intra-patch spatial kernel - * - * List of available kernels: - * - * bicubic - * cos - * gaussian - * lanczos - * quadratic - * sinc - * sphinx - */ -#ifdef LUMA_raw -#define SK gaussian -#define RK gaussian -#define PSK gaussian -#else -#define SK gaussian -#define RK gaussian -#define PSK gaussian -#endif - -// Scaling factor (should match WIDTH/HEIGHT) -#ifdef LUMA_raw -#define SF 1 -#else -#define SF 1 -#endif - -/* Visualization - * - * 0: off - * 1: absolute difference between input/output to the 
power of 0.25 - * 2: difference between input/output centered on 0.5 - * 3: avg_weight - * 4: edge map (based on the relevant AS settings) - */ -#ifdef LUMA_raw -#define V 0 -#else -#define V 0 -#endif - -// Blur factor (0.0 returns the input image, 1.0 returns the output image) -#ifdef LUMA_raw -#define BF 1.0 -#else -#define BF 1.0 -#endif - -// Force disable textureGather -#ifdef LUMA_raw -#define NG 0 -#else -#define NG 0 -#endif - -// Patch donut (probably useless) -#ifdef LUMA_raw -#define PD 0 -#else -#define PD 0 -#endif - -// Duplicate 1st weight (for luma-guided-chroma) -#ifdef LUMA_raw -#define D1W 1 -#else -#define D1W 1 -#endif - -// Skip patch comparison -#ifdef LUMA_raw -#define SKIP_PATCH 0 -#else -#define SKIP_PATCH 0 -#endif - -// Shader code - -#define EPSILON 0.00000000001 -#define M_PI 3.14159265358979323846 -#define POW2(x) ((x)*(x)) -#define POW3(x) ((x)*(x)*(x)) -#define bicubic(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0)))) -#define gaussian(x) exp(-1 * POW2(x)) -#define lanczos(x) POW2(sinc(x)) -#define quadratic(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5)) -#define sinc(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI)) -#define sphinx(x) ((x) < 1e-8 ? 
1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI)) - -// XXX could maybe be better optimized on LGC -// XXX return original alpha component instead of 1.0 -#if defined(LUMA_raw) -#define val float -#define val_swizz(v) (v.x) -#define unval(v) vec4(v.x, 0, 0, 1.0) -#define val_packed val -#define val_pack(v) (v) -#define val_unpack(v) (v) -#elif defined(CHROMA_raw) -#define val vec2 -#define val_swizz(v) (v.xy) -#define unval(v) vec4(v.x, v.y, 0, 1.0) -#define val_packed uint -#define val_pack(v) packUnorm2x16(v) -#define val_unpack(v) unpackUnorm2x16(v) -#else -#define val vec3 -#define val_swizz(v) (v.xyz) -#define unval(v) vec4(v.x, v.y, v.z, 1.0) -#define val_packed val -#define val_pack(v) (v) -#define val_unpack(v) (v) -#endif - -#if PS == 6 -const int hp = P/2; -#else -const float hp = int(P/2) - 0.5*(1-(P%2)); // sample between pixels for even patch sizes -#endif - -#if RS == 6 -const int hr = R/2; -#else -const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes -#endif - -// donut increment, increments without landing on (0,0,0) -// much faster than a continue statement -#define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0)))) - -// patch/research shapes -// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R) -// dots (.) represent samples (pixels) and X represents the pixel-of-interest - -// Z ..... -// Z ..... -// Z ..X.. -// Z ..... -// Z ..... -#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr) - -// (in this instance Z=4) -// Z .... -// Z .... -// Z ..X. -// Z .... -#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr) - -// Z-4 . -// Z-2 ... -// Z ..X.. -#define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr) - -// Z-4 . -// Z-2 ... 
-// hz+1 ..X -#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr) -#define S_TRIANGLE_A(hz,Z) int(hz*hz+Z) - -// Z-4 . -// Z-2 ... -// Z ..X.. -// Z-2 ... -// Z-4 . -#define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr) -#define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z) - -// -// Z ..X.. -// -#define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++) - -// 90 degree rotation of S_HORIZONTAL -#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr) - -// 1 . -// 1 . -// Z ..X.. -// 1 . -// 1 . -#define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr) -#define S_PLUS_A(hz,Z) (Z*2 - 1) - -// XXX implement S_PLUS w/ an X overlayed: -// 3 . . . -// 3 ... -// Z ..X.. -// 3 ... -// 3 . . . - -// XXX implement an X shape: -// 2 . . -// 2 . . -// 1 X -// 2 . . -// 2 . . 
- -// 1x1 square -#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++) - -#define T1 (T+1) -#define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++) - -#ifdef LUMA_raw -#define RF_ RF_LUMA -#else -#define RF_ RF -#endif - -// Skip comparing the pixel-of-interest against itself, unless RF is enabled -#if RF_ -#define RINCR(z,c) (z.c++) -#else -#define RINCR DINCR -#endif - -#define R_AREA(a) (a * T1 + RF_-1) - -// research shapes -// XXX would be nice to have the option of temporally-varying research sizes -#if R == 0 || R == 1 -#define FOR_RESEARCH(r) S_1X1(r) -const int r_area = R_AREA(1); -#elif RS == 7 -#define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y)) -const int r_area = R_AREA(S_PLUS_A(hr,R)); -#elif RS == 6 -#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y)) -const int r_area = R_AREA(R*R); -#elif RS == 5 -#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x)) -const int r_area = R_AREA(S_TRIANGLE_A(hr,hr)); -#elif RS == 4 -#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x)) -const int r_area = R_AREA(S_TRIANGLE_A(hr,R)); -#elif RS == 3 -#define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y)) -const int r_area = R_AREA(S_DIAMOND_A(hr,R)); -#elif RS == 2 -#define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y)) -const int r_area = R_AREA(R); -#elif RS == 1 -#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x)) -const int r_area = R_AREA(R); -#elif RS == 0 -#define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y)) -const int r_area = R_AREA(R*R); -#endif - -#define RI1 (RI+1) -#define RFI1 (RFI+1) - -#if RI -#define FOR_ROTATION for (float ri = 0; ri < 360; ri+=360.0/RI1) -#else -#define FOR_ROTATION -#endif - -#if RFI -#define FOR_REFLECTION for (int rfi = 0; rfi < RFI1; rfi++) -#else -#define FOR_REFLECTION -#endif - -#if PD -#define PINCR DINCR -#else -#define PINCR(z,c) (z.c++) -#endif - -#define P_AREA(a) (a - PD) - -// patch shapes -#if P == 0 || P == 1 -#define FOR_PATCH(p) S_1X1(p) -const int p_area = P_AREA(1); -#elif PS == 7 -#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y)) 
-const int p_area = P_AREA(S_PLUS_A(hp,P)); -#elif PS == 6 -#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y)) -const int p_area = P_AREA(P*P); -#elif PS == 5 -#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x)) -const int p_area = P_AREA(S_TRIANGLE_A(hp,hp)); -#elif PS == 4 -#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x)) -const int p_area = P_AREA(S_TRIANGLE_A(hp,P)); -#elif PS == 3 -#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y)) -const int p_area = P_AREA(S_DIAMOND_A(hp,P)); -#elif PS == 2 -#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y)) -const int p_area = P_AREA(P); -#elif PS == 1 -#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x)) -const int p_area = P_AREA(P); -#elif PS == 0 -#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y)) -const int p_area = P_AREA(P*P); -#endif - -const float r_scale = 1.0/r_area; -const float p_scale = 1.0/p_area; - -#define sample(tex, pos, size, pt, off) tex(pos + pt * (vec2(off) + 0.5 - fract(pos*size))) -#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off) - -#if RF_ && defined(LUMA_raw) -#define load2_(off) sample(RF_LUMA_tex, RF_LUMA_pos, RF_LUMA_size, RF_LUMA_pt, off) -#define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr))) -#define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0) -#elif RF_ && D1W -#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off) -#define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr))) -#define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0) -#elif RF_ -#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off) -#else -#define load2_(off) load_(off) -#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr))) -#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0) -#endif - -#if T -val load(vec3 off) -{ - switch 
(min(int(off.z), frame)) { - case 0: return val_swizz(load_(off)); - - } -} -val load2(vec3 off) -{ - return off.z == 0 ? val_swizz(load2_(off)) : load(off); -} -#else -#define load(off) val_swizz(load_(off)) -#define load2(off) val_swizz(load2_(off)) -#endif - -val poi = load(vec3(0)); // pixel-of-interest -val poi2 = load2(vec3(0)); // guide pixel-of-interest - -#if RI // rotation -vec2 rot(vec2 p, float d) -{ - return vec2( - p.x * cos(radians(d)) - p.y * sin(radians(d)), - p.y * sin(radians(d)) + p.x * cos(radians(d)) - ); -} -#else -#define rot(p, d) (p) -#endif - -#if RFI // reflection -vec2 ref(vec2 p, int d) -{ - switch (d) { - case 0: return p; - case 1: return p * vec2(1, -1); - case 2: return p * vec2(-1, 1); - } -} -#else -#define ref(p, d) (p) -#endif - -#if SST && R >= SST -float spatial_r(vec3 v) -{ - v.xy += 0.5 - fract(HOOKED_pos*HOOKED_size); - return SK(length(v*SD)*SS); -} -#else -#define spatial_r(v) (1) -#endif - -#if PST && P >= PST -#define spatial_p(v) PSK(length(v*PSD)*PSS) -#else -#define spatial_p(v) (1) -#endif - -val range(val pdiff_sq) -{ - const float h = S*0.013; - const float pdiff_scale = 1.0/(h*h); - pdiff_sq = sqrt(pdiff_sq * pdiff_scale); -#if defined(LUMA_raw) - return RK(pdiff_sq); -#elif defined(CHROMA_raw) - return vec2(RK(pdiff_sq.x), RK(pdiff_sq.y)); -#else - return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z)); -#endif - //return exp(-pdiff_sq * pdiff_scale); - - // weight function from the NLM paper, it's not very good - //return exp(-max(pdiff_sq - 2*S*S, 0.0) * pdiff_scale); -} - -val patch_comparison(vec3 r, vec3 r2) -{ - vec3 p; - val min_rot = val(p_area); - - FOR_ROTATION FOR_REFLECTION { - val pdiff_sq = val(0); - FOR_PATCH(p) { - vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z); - val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF); - diff_sq *= diff_sq; - diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy); - pdiff_sq += diff_sq; - } - min_rot = min(min_rot, pdiff_sq); - } - - return min_rot * 
p_scale; -} - -#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false -#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3) - -#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER -// 3x3 diamond/plus patch_comparison_gather -// XXX extend to support arbitrary sizes (probably requires code generation) -// XXX extend to support 3x3 square -// XXX support PSS -const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) }; -const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF }; -vec4 poi_patch = gather_offs(0, offsets); -float patch_comparison_gather(vec3 r, vec3 r2) -{ - float min_rot = p_area - 1; - vec4 transformer = gather_offs(r, offsets_sf); - FOR_ROTATION { - FOR_REFLECTION { - float diff_sq = dot((poi_patch - transformer) * (poi_patch - transformer), vec4(1)); - min_rot = min(diff_sq, min_rot); -#if RFI - switch(rfi) { - case 0: transformer = transformer.zyxw; break; - case 1: transformer = transformer.zwxy; break; // undoes last mirror, performs another mirror - case 2: transformer = transformer.zyxw; break; // undoes last mirror - } -#endif - } -#if RI == 3 - transformer = transformer.wxyz; -#elif RI == 1 - transformer = transformer.zwxy; -#endif - } - float center_diff_sq = poi2.x - load2(r).x; - center_diff_sq *= center_diff_sq; - return (min_rot + center_diff_sq) * p_scale; -} -#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER -// tiled even square patch_comparison_gather -// XXX extend to support odd square? 
-float patch_comparison_gather(vec3 r, vec3 r2) -{ - vec2 tile; - float min_rot = p_area; - - /* gather order: - * w z - * x y - */ - float pdiff_sq = 0; - for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) { - vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy); - diff_sq *= diff_sq; - diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)), - spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0))); - pdiff_sq += dot(diff_sq, vec4(1)); - } - min_rot = min(min_rot, pdiff_sq); - - return min_rot * p_scale; -} -#else -#define patch_comparison_gather patch_comparison -#endif - -vec4 hook() -{ - val total_weight = val(0); - val sum = val(0); - val result = val(0); - - vec3 r = vec3(0); - vec3 p = vec3(0); - vec3 me = vec3(0); - -#if T && ME == 1 // temporal & motion estimation - vec3 me_tmp = vec3(0); - float maxweight = 0; -#elif T && ME == 2 // temporal & motion estimation - vec3 me_sum = vec3(0); - float me_weight = 0; -#endif - -#if WD == 2 // weight discard - int r_index = 0; - val_packed all_weights[r_area]; - val_packed all_pixels[r_area]; -#elif WD == 1 // weight discard - val no_weights = val(0); - val discard_total_weight = val(0); - val discard_sum = val(0); -#endif - - FOR_FRAME(r) { - // XXX ME is always a frame behind, should have to option to re-research after applying ME (could do it an arbitrary number of times per frame if desired) -#if T && ME == 1 // temporal & motion estimation max weight - if (r.z > 0) { - me += me_tmp * MEF; - me_tmp = vec3(0); - maxweight = 0; - } -#elif T && ME == 2 // temporal & motion estimation weighted average - if (r.z > 0) { - me += round(me_sum / me_weight * MEF); - me_sum = vec3(0); - me_weight = 0; - } -#endif - FOR_RESEARCH(r) { // main NLM logic -#if SKIP_PATCH - val weight = val(1); -#else - val pdiff_sq = (r.z == 0) ? 
val(patch_comparison_gather(r+me, vec3(0))) : patch_comparison(r+me, vec3(0)); - val weight = range(pdiff_sq); -#endif - -#if T && ME == 1 // temporal & motion estimation max weight - me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x)); - maxweight = max(maxweight, weight.x); -#elif T && ME == 2 // temporal & motion estimation weighted average - me_sum += vec3(r.xy,0) * weight.x; - me_weight += weight.x; -#endif - -#if D1W - weight = val(weight.x); -#endif - - weight *= spatial_r(r); - -#if WD == 2 // weight discard - all_weights[r_index] = val_pack(weight); - all_pixels[r_index] = val_pack(load(r+me)); - r_index++; -#elif WD == 1 // weight discard - val wd_scale = 1.0/max(no_weights, 1); - val keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight); - discard_sum += load(r+me) * weight * (1 - keeps); - discard_total_weight += weight * (1 - keeps); - no_weights += keeps; -#endif - - sum += load(r+me) * weight; - total_weight += weight; - } // FOR_RESEARCH - } // FOR_FRAME - - val avg_weight = total_weight * r_scale; - val old_avg_weight = avg_weight; - -#if WD == 2 // true average - total_weight = val(0); - sum = val(0); - val no_weights = val(0); - - for (int i = 0; i < r_area; i++) { - val w = val_unpack(all_weights[i]); - val px = val_unpack(all_pixels[i]); - val keeps = step(avg_weight*WDT, w); - - w *= keeps; - sum += px * w; - total_weight += w; - no_weights += keeps; - } -#elif WD == 1 // moving cumulative average - total_weight -= discard_total_weight; - sum -= discard_sum; -#endif -#if WD // weight discard - avg_weight = total_weight / no_weights; -#endif - - total_weight += SW * spatial_r(vec3(0)); - sum += poi * SW * spatial_r(vec3(0)); - -#if V == 3 // weight map - result = val(avg_weight); -#else // mean - result = val(sum / total_weight); -#endif - - // store frames for temporal -#if T > 1 - -#endif -#if T && TRF - imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result)); -#elif T - 
imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2)); -#endif - -#if ASW == 0 // pre-WD weights -#define AS_weight old_avg_weight -#elif ASW == 1 // post-WD weights -#define AS_weight avg_weight -#endif - -#if ASK == 0 - val sharpening_strength = pow(AS_weight, val(ASP)); -#elif ASK == 1 - val sharpening_strength = mix( - pow(smoothstep(0.0, 1.0, AS_weight), val(ASP)), - AS_weight, ASC); - // XXX normalize the result to account for a negative ASC? -#elif ASK == 2 - val sharpening_strength = val(ASP); -#endif - -#if AS == 1 // sharpen+denoise - val sharpened = result + (poi - result) * ASF; -#elif AS == 2 // sharpen only - val sharpened = poi + (poi - result) * ASF; -#endif - -#if EP // extremes preserve - float luminance = EP_texOff(0).x; - // EPSILON is needed since pow(0,0) is undefined - float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP)); - result = mix(poi, result, ep_weight); -#endif - -#if AS == 1 // sharpen+denoise - result = mix(sharpened, result, sharpening_strength); -#elif AS == 2 // sharpen only - result = mix(sharpened, poi, sharpening_strength); -#endif - -#if V == 4 // edge map - result = sharpening_strength; -#endif - -#if (V == 3 || V == 4) && defined(CHROMA_raw) // drop chroma for these visualizations - return vec4(0.5); -#endif - -#if V == 1 - result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0); -#elif V == 2 - result = (poi - result) * 0.5 + 0.5; -#endif - - return unval(mix(poi, result, BF)); -} - diff --git a/portable_config/shaders/nlmeans_lq.glsl b/portable_config/shaders/nlmeans_lq.glsl deleted file mode 100644 index 80eaf745..00000000 --- a/portable_config/shaders/nlmeans_lq.glsl +++ /dev/null @@ -1,1086 +0,0 @@ -/* vi: ft=c - * - * Based on vf_nlmeans.c from FFmpeg. 
- * - * Copyright (c) 2022 an3223 - * Copyright (c) 2016 Clément Bœsch - * - * This program is free software: you can redistribute it and/or modify it - * under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 2.1 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License - * for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program. If not, see . - */ - -// Description: nlmeans_lq.glsl: Faster, but lower quality. - -/* The recommended usage of this shader and its variant profiles is to add them - * to input.conf and then dispatch the appropriate shader via a keybind during - * media playback. Here is an example input.conf entry: - * - * F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl"; show-text "Non-local means (LUMA only)" - * - * These shaders can also be enabled by default in mpv.conf, for example: - * - * glsl-shaders='~~/shaders/nlmeans.glsl' - * - * Both of the examples above assume the shaders are located in a subdirectory - * named "shaders" within mpv's config directory. Refer to the mpv - * documentation for more details. - * - * This shader is highly configurable via user variables below. Although the - * default settings should offer good quality at a reasonable speed, you are - * encouraged to tweak them to your preferences. Be mindful that certain - * settings may greatly affect speed. - * - * Denoising is most useful for noisy content. If there is no perceptible - * noise, you probably won't see a positive difference. - * - * The default settings are generally tuned for low noise and high detail - * preservation. 
The "medium" and "heavy" profiles are tuned for higher levels - * of noise. - * - * The denoiser will not work properly if the content has been upscaled - * beforehand (whether it was done by you or not). In such cases, consider - * issuing a command to downscale in the mpv console (backtick ` key): - * - * vf toggle scale=-2:720 - * - * ...replacing 720 with whatever resolution seems appropriate. Rerun the - * command to undo the downscale. It may take some trial-and-error to find the - * proper resolution. - */ - -/* Regarding speed - * - * Speed may vary wildly for different vo and gpu-api settings. Generally - * vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this - * may be different for your system. - * - * If your GPU doesn't support textureGather, or if you are on a version of mpv - * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ profile - * - * If you plan on tinkering with NLM's settings, read below: - * - * textureGather only applies to luma and limited to the these configurations: - * - * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2} - * - Default, very fast, rotations and reflections should be free - * - If this is unusually slow then try changing gpu-api and vo - * - If it's still slow, try setting RI/RFI to 0. 
- * - * - PS=6:RI={0,1,3}:RFI={0,1,2} - * - Currently the only scalable variant - * - Patch shape is asymmetric on two axis - * - Rotations should have very little speed impact - * - Reflections may have a significant speed impact - * - * Options which always disable textureGather: - * - PD - * - NG - */ - -//!HOOK LUMA -//!HOOK CHROMA -//!BIND HOOKED -//!WIDTH HOOKED.w 1.25 / -//!HEIGHT HOOKED.h 1.25 / -//!DESC Non-local means (PRERF) -//!SAVE PRERF_LUMA - -vec4 hook() -{ - return HOOKED_texOff(0); -} - -//!HOOK LUMA -//!HOOK CHROMA -//!BIND PRERF_LUMA -//!WIDTH HOOKED.w -//!HEIGHT HOOKED.h -//!DESC Non-local means (RF) -//!SAVE RF_LUMA - -vec4 hook() -{ - return PRERF_LUMA_texOff(0); -} - -//!HOOK LUMA -//!HOOK CHROMA -//!BIND RF_LUMA -//!WIDTH RF_LUMA.w -//!HEIGHT RF_LUMA.h -//!DESC Non-local means (RF, share) -//!SAVE RF - -vec4 hook() -{ - return RF_LUMA_texOff(0); -} - -//!HOOK LUMA -//!HOOK CHROMA -//!BIND LUMA -//!WIDTH LUMA.w 3 / -//!HEIGHT LUMA.h 3 / -//!DESC Non-local means (EP) -//!SAVE EP - -vec4 hook() -{ - return LUMA_texOff(0); -} - -//!HOOK LUMA -//!HOOK CHROMA -//!BIND HOOKED -//!BIND RF_LUMA -//!BIND RF -//!BIND EP -//!DESC Non-local means (nlmeans_lq.glsl) - -// User variables - -// It is generally preferable to denoise luma and chroma differently, so the -// user variables for luma and chroma are split. - -// Denoising factor (level of blur, higher means more blur) -#ifdef LUMA_raw -#define S 1.25 -#else -#define S 5.0 -#endif - -/* Adaptive sharpening - * - * Uses the blur incurred by denoising to perform an unsharp mask, and uses the - * weight map to restrict the sharpening to edges. - * - * If you just want to increase/decrease sharpness then you want to change ASF. - * - * Use V=4 to visualize which areas are sharpened (black means sharpen). 
- * - * AS: - * - 0 to disable - * - 1 to sharpen+denoise - * - 2 to sharpen only - * ASF: Higher numbers make a sharper image - * ASP: Higher numbers use more of the sharp image - * ASW: - * - 0 to use pre-WD weights - * - 1 to use post-WD weights (ASP should be ~2x to compensate) - * ASK: Weight kernel: - * - 0 for power. This is the old method. - * - 1 for sigmoid. This is generally recommended. - * - 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image) - * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map - */ -#ifdef LUMA_raw -#define AS 0 -#define ASF 3.0 -#define ASP 1.0 -#define ASW 0 -#define ASK 1 -#define ASC 0.0 -#else -#define AS 0 -#define ASF 3.0 -#define ASP 1.0 -#define ASW 0 -#define ASK 1 -#define ASC 0.0 -#endif - -/* Starting weight - * - * Also known as the center weight. This represents the weight of the - * pixel-of-interest. Lower numbers may help handle heavy noise & ringing. - * - * EPSILON should be used instead of zero to avoid divide-by-zero errors. - */ -#ifdef LUMA_raw -#define SW 1.0 -#else -#define SW 0.5 -#endif - -/* Weight discard - * - * Discard weights that fall below a fraction of the average weight. This culls - * the most dissimilar samples from the blur, yielding a much more pleasant - * result, especially around edges. - * - * WD: - * - 2: True average. Better quality, but slower and requires GLSL 4.0 or later - * - 1: Moving cumulative average. Inaccurate, tends to blur directionally. - * - 0: Disable - * - * WDT: Threshold coefficient, higher numbers discard more - * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes - */ -#ifdef LUMA_raw -#define WD 1 -#define WDT 0.5 -#define WDP 6.0 -#else -#define WD 1 -#define WDT 0.75 -#define WDP 6.0 -#endif - -/* Extremes preserve - * - * Reduces denoising around very bright/dark areas. - * - * The downscaling factor of the EP shader stage affects what is considered a - * bright/dark area. 
The default of 3 should be fine, it's not recommended to - * change this. - * - * This is incompatible with RGB. If you have RGB hooks enabled then you will - * have to delete the EP shader stage or specify EP=0 through shader_cfg. - * - * EP: 1 to enable, 0 to disable - * DP: EP strength on dark patches, 0 to fully denoise - * BP: EP strength on bright patches, 0 to fully denoise - */ -#ifdef LUMA_raw -#define EP 1 -#define BP 0.75 -#define DP 0.25 -#else -#define EP 0 -#define BP 0.0 -#define DP 0.0 -#endif - -/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ -/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ -/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ -/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ -/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ - -/* Patch & research sizes - * - * Patch size should be an odd number greater than or equal to 3. Higher values - * are slower and not always better. - * - * Research size be an odd number greater than or equal to 3. Higher values are - * generally better, but slower, blurrier, and gives diminishing returns. - */ -#ifdef LUMA_raw -#define P 3 -#define R 3 -#else -#define P 3 -#define R 5 -#endif - -/* Patch and research shapes - * - * Different shapes have different speed and quality characteristics. Every - * shape (besides square) is smaller than square. - * - * PS applies applies to patches, RS applies to research zones. 
- * - * Be wary of gather optimizations (see the Regarding Speed comment at the top) - * - * 0: square (symmetrical) - * 1: horizontal line (asymmetric) - * 2: vertical line (asymmetric) - * 3: diamond (symmetrical) - * 4: triangle (asymmetric, pointing upward) - * 5: truncated triangle (asymmetric on two axis, last row halved) - * 6: even sized square (asymmetric on two axis) - * 7: plus (symmetrical) - */ -#ifdef LUMA_raw -#define RS 3 -#define PS 3 -#else -#define RS 3 -#define PS 3 -#endif - -/* Robust filtering - * - * This setting is dependent on code generation from shader_cfg, so this - * setting can only be enabled via shader_cfg. - * - * Compares the pixel-of-interest against a guide, which could be a downscaled - * image or the output of another shader - */ -#define RF_LUMA 1 -#define RF 1 - -/* Rotational/reflectional invariance - * - * Number of rotations/reflections to try for each patch comparison. Can be - * slow, but improves feature preservation. More rotations/reflections gives - * diminishing returns. The most similar rotation/reflection will be used. - * - * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a - * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc. - * - * RI: Rotational invariance - * RFI (0 to 2): Reflectional invariance - */ -#ifdef LUMA_raw -#define RI 0 -#define RFI 0 -#else -#define RI 0 -#define RFI 0 -#endif - -/* Temporal denoising - * - * This setting is dependent on code generation from shader_cfg, so this - * setting can only be enabled via shader_cfg. - * - * Caveats: - * - Slower: - * - Each frame needs to be researched (more samples & more math) - * - Gather optimizations only apply to the current frame - * - Requires vo=gpu-next - * - Luma-only (this is a bug) - * - Buggy - * - * May cause motion blur and may struggle more with noise that persists across - * multiple frames (e.g., from compression or duplicate frames), but can work - * very well on high quality video. 
- * - * Motion estimation (ME) should improve quality without impacting speed. - * - * T: number of frames used - * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg - * MEF: estimate factor, compensates for ME being one frame behind - * TRF: compare against the denoised frames - */ -#ifdef LUMA_raw -#define T 0 -#define ME 1 -#define MEF 2 -#define TRF 0 -#else -#define T 0 -#define ME 0 -#define MEF 2 -#define TRF 0 -#endif - -/* Spatial kernel - * - * Increasing the spatial denoising factor (SS) reduces the weight of further - * pixels. - * - * Spatial distortion instructs the spatial kernel to view that axis as - * closer/further, for instance SD=(1,1,0.5) would make the temporal axis - * appear closer and increase blur between frames. - * - * The intra-patch variants are supposed to help with larger patch sizes. - * - * SST: enables spatial kernel if R>=PST, 0 fully disables - * SS: spatial sigma - * SD: spatial distortion (X, Y, time) - * PSS: intra-patch spatial sigma - * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables - * PSD: intra-patch spatial distortion (X, Y) - */ -#ifdef LUMA_raw -#define SST 1 -#define SS 0.25 -#define SD vec3(1,1,1) -#define PST 0 -#define PSS 0.0 -#define PSD vec2(1,1) -#else -#define SST 1 -#define SS 0.25 -#define SD vec3(1,1,1) -#define PST 0 -#define PSS 0.0 -#define PSD vec2(1,1) -#endif - -/* Kernels - * - * SK: spatial kernel - * RK: range kernel (takes patch differences) - * PSK: intra-patch spatial kernel - * - * List of available kernels: - * - * bicubic - * cos - * gaussian - * lanczos - * quadratic - * sinc - * sphinx - */ -#ifdef LUMA_raw -#define SK gaussian -#define RK gaussian -#define PSK gaussian -#else -#define SK gaussian -#define RK gaussian -#define PSK gaussian -#endif - -// Scaling factor (should match WIDTH/HEIGHT) -#ifdef LUMA_raw -#define SF 1 -#else -#define SF 1 -#endif - -/* Visualization - * - * 0: off - * 1: absolute difference between input/output to the 
power of 0.25 - * 2: difference between input/output centered on 0.5 - * 3: avg_weight - * 4: edge map (based on the relevant AS settings) - */ -#ifdef LUMA_raw -#define V 0 -#else -#define V 0 -#endif - -// Blur factor (0.0 returns the input image, 1.0 returns the output image) -#ifdef LUMA_raw -#define BF 1.0 -#else -#define BF 1.0 -#endif - -// Force disable textureGather -#ifdef LUMA_raw -#define NG 0 -#else -#define NG 0 -#endif - -// Patch donut (probably useless) -#ifdef LUMA_raw -#define PD 0 -#else -#define PD 0 -#endif - -// Duplicate 1st weight (for luma-guided-chroma) -#ifdef LUMA_raw -#define D1W 0 -#else -#define D1W 0 -#endif - -// Skip patch comparison -#ifdef LUMA_raw -#define SKIP_PATCH 0 -#else -#define SKIP_PATCH 0 -#endif - -// Shader code - -#define EPSILON 0.00000000001 -#define M_PI 3.14159265358979323846 -#define POW2(x) ((x)*(x)) -#define POW3(x) ((x)*(x)*(x)) -#define bicubic(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0)))) -#define gaussian(x) exp(-1 * POW2(x)) -#define lanczos(x) POW2(sinc(x)) -#define quadratic(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5)) -#define sinc(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI)) -#define sphinx(x) ((x) < 1e-8 ? 
1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI)) - -// XXX could maybe be better optimized on LGC -// XXX return original alpha component instead of 1.0 -#if defined(LUMA_raw) -#define val float -#define val_swizz(v) (v.x) -#define unval(v) vec4(v.x, 0, 0, 1.0) -#define val_packed val -#define val_pack(v) (v) -#define val_unpack(v) (v) -#elif defined(CHROMA_raw) -#define val vec2 -#define val_swizz(v) (v.xy) -#define unval(v) vec4(v.x, v.y, 0, 1.0) -#define val_packed uint -#define val_pack(v) packUnorm2x16(v) -#define val_unpack(v) unpackUnorm2x16(v) -#else -#define val vec3 -#define val_swizz(v) (v.xyz) -#define unval(v) vec4(v.x, v.y, v.z, 1.0) -#define val_packed val -#define val_pack(v) (v) -#define val_unpack(v) (v) -#endif - -#if PS == 6 -const int hp = P/2; -#else -const float hp = int(P/2) - 0.5*(1-(P%2)); // sample between pixels for even patch sizes -#endif - -#if RS == 6 -const int hr = R/2; -#else -const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes -#endif - -// donut increment, increments without landing on (0,0,0) -// much faster than a continue statement -#define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0)))) - -// patch/research shapes -// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R) -// dots (.) represent samples (pixels) and X represents the pixel-of-interest - -// Z ..... -// Z ..... -// Z ..X.. -// Z ..... -// Z ..... -#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr) - -// (in this instance Z=4) -// Z .... -// Z .... -// Z ..X. -// Z .... -#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr) - -// Z-4 . -// Z-2 ... -// Z ..X.. -#define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr) - -// Z-4 . -// Z-2 ... 
-// hz+1 ..X -#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr) -#define S_TRIANGLE_A(hz,Z) int(hz*hz+Z) - -// Z-4 . -// Z-2 ... -// Z ..X.. -// Z-2 ... -// Z-4 . -#define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr) -#define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z) - -// -// Z ..X.. -// -#define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++) - -// 90 degree rotation of S_HORIZONTAL -#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr) - -// 1 . -// 1 . -// Z ..X.. -// 1 . -// 1 . -#define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr) -#define S_PLUS_A(hz,Z) (Z*2 - 1) - -// XXX implement S_PLUS w/ an X overlayed: -// 3 . . . -// 3 ... -// Z ..X.. -// 3 ... -// 3 . . . - -// XXX implement an X shape: -// 2 . . -// 2 . . -// 1 X -// 2 . . -// 2 . . 
- -// 1x1 square -#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++) - -#define T1 (T+1) -#define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++) - -#ifdef LUMA_raw -#define RF_ RF_LUMA -#else -#define RF_ RF -#endif - -// Skip comparing the pixel-of-interest against itself, unless RF is enabled -#if RF_ -#define RINCR(z,c) (z.c++) -#else -#define RINCR DINCR -#endif - -#define R_AREA(a) (a * T1 + RF_-1) - -// research shapes -// XXX would be nice to have the option of temporally-varying research sizes -#if R == 0 || R == 1 -#define FOR_RESEARCH(r) S_1X1(r) -const int r_area = R_AREA(1); -#elif RS == 7 -#define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y)) -const int r_area = R_AREA(S_PLUS_A(hr,R)); -#elif RS == 6 -#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y)) -const int r_area = R_AREA(R*R); -#elif RS == 5 -#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x)) -const int r_area = R_AREA(S_TRIANGLE_A(hr,hr)); -#elif RS == 4 -#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x)) -const int r_area = R_AREA(S_TRIANGLE_A(hr,R)); -#elif RS == 3 -#define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y)) -const int r_area = R_AREA(S_DIAMOND_A(hr,R)); -#elif RS == 2 -#define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y)) -const int r_area = R_AREA(R); -#elif RS == 1 -#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x)) -const int r_area = R_AREA(R); -#elif RS == 0 -#define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y)) -const int r_area = R_AREA(R*R); -#endif - -#define RI1 (RI+1) -#define RFI1 (RFI+1) - -#if RI -#define FOR_ROTATION for (float ri = 0; ri < 360; ri+=360.0/RI1) -#else -#define FOR_ROTATION -#endif - -#if RFI -#define FOR_REFLECTION for (int rfi = 0; rfi < RFI1; rfi++) -#else -#define FOR_REFLECTION -#endif - -#if PD -#define PINCR DINCR -#else -#define PINCR(z,c) (z.c++) -#endif - -#define P_AREA(a) (a - PD) - -// patch shapes -#if P == 0 || P == 1 -#define FOR_PATCH(p) S_1X1(p) -const int p_area = P_AREA(1); -#elif PS == 7 -#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y)) 
-const int p_area = P_AREA(S_PLUS_A(hp,P)); -#elif PS == 6 -#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y)) -const int p_area = P_AREA(P*P); -#elif PS == 5 -#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x)) -const int p_area = P_AREA(S_TRIANGLE_A(hp,hp)); -#elif PS == 4 -#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x)) -const int p_area = P_AREA(S_TRIANGLE_A(hp,P)); -#elif PS == 3 -#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y)) -const int p_area = P_AREA(S_DIAMOND_A(hp,P)); -#elif PS == 2 -#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y)) -const int p_area = P_AREA(P); -#elif PS == 1 -#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x)) -const int p_area = P_AREA(P); -#elif PS == 0 -#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y)) -const int p_area = P_AREA(P*P); -#endif - -const float r_scale = 1.0/r_area; -const float p_scale = 1.0/p_area; - -#define sample(tex, pos, size, pt, off) tex(pos + pt * (vec2(off) + 0.5 - fract(pos*size))) -#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off) - -#if RF_ && defined(LUMA_raw) -#define load2_(off) sample(RF_LUMA_tex, RF_LUMA_pos, RF_LUMA_size, RF_LUMA_pt, off) -#define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr))) -#define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0) -#elif RF_ && D1W -#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off) -#define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr))) -#define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0) -#elif RF_ -#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off) -#else -#define load2_(off) load_(off) -#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr))) -#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0) -#endif - -#if T -val load(vec3 off) -{ - switch 
(min(int(off.z), frame)) { - case 0: return val_swizz(load_(off)); - - } -} -val load2(vec3 off) -{ - return off.z == 0 ? val_swizz(load2_(off)) : load(off); -} -#else -#define load(off) val_swizz(load_(off)) -#define load2(off) val_swizz(load2_(off)) -#endif - -val poi = load(vec3(0)); // pixel-of-interest -val poi2 = load2(vec3(0)); // guide pixel-of-interest - -#if RI // rotation -vec2 rot(vec2 p, float d) -{ - return vec2( - p.x * cos(radians(d)) - p.y * sin(radians(d)), - p.y * sin(radians(d)) + p.x * cos(radians(d)) - ); -} -#else -#define rot(p, d) (p) -#endif - -#if RFI // reflection -vec2 ref(vec2 p, int d) -{ - switch (d) { - case 0: return p; - case 1: return p * vec2(1, -1); - case 2: return p * vec2(-1, 1); - } -} -#else -#define ref(p, d) (p) -#endif - -#if SST && R >= SST -float spatial_r(vec3 v) -{ - v.xy += 0.5 - fract(HOOKED_pos*HOOKED_size); - return SK(length(v*SD)*SS); -} -#else -#define spatial_r(v) (1) -#endif - -#if PST && P >= PST -#define spatial_p(v) PSK(length(v*PSD)*PSS) -#else -#define spatial_p(v) (1) -#endif - -val range(val pdiff_sq) -{ - const float h = S*0.013; - const float pdiff_scale = 1.0/(h*h); - pdiff_sq = sqrt(pdiff_sq * pdiff_scale); -#if defined(LUMA_raw) - return RK(pdiff_sq); -#elif defined(CHROMA_raw) - return vec2(RK(pdiff_sq.x), RK(pdiff_sq.y)); -#else - return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z)); -#endif - //return exp(-pdiff_sq * pdiff_scale); - - // weight function from the NLM paper, it's not very good - //return exp(-max(pdiff_sq - 2*S*S, 0.0) * pdiff_scale); -} - -val patch_comparison(vec3 r, vec3 r2) -{ - vec3 p; - val min_rot = val(p_area); - - FOR_ROTATION FOR_REFLECTION { - val pdiff_sq = val(0); - FOR_PATCH(p) { - vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z); - val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF); - diff_sq *= diff_sq; - diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy); - pdiff_sq += diff_sq; - } - min_rot = min(min_rot, pdiff_sq); - } - - return min_rot * 
p_scale; -} - -#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false -#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3) - -#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER -// 3x3 diamond/plus patch_comparison_gather -// XXX extend to support arbitrary sizes (probably requires code generation) -// XXX extend to support 3x3 square -// XXX support PSS -const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) }; -const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF }; -vec4 poi_patch = gather_offs(0, offsets); -float patch_comparison_gather(vec3 r, vec3 r2) -{ - float min_rot = p_area - 1; - vec4 transformer = gather_offs(r, offsets_sf); - FOR_ROTATION { - FOR_REFLECTION { - float diff_sq = dot((poi_patch - transformer) * (poi_patch - transformer), vec4(1)); - min_rot = min(diff_sq, min_rot); -#if RFI - switch(rfi) { - case 0: transformer = transformer.zyxw; break; - case 1: transformer = transformer.zwxy; break; // undoes last mirror, performs another mirror - case 2: transformer = transformer.zyxw; break; // undoes last mirror - } -#endif - } -#if RI == 3 - transformer = transformer.wxyz; -#elif RI == 1 - transformer = transformer.zwxy; -#endif - } - float center_diff_sq = poi2.x - load2(r).x; - center_diff_sq *= center_diff_sq; - return (min_rot + center_diff_sq) * p_scale; -} -#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER -// tiled even square patch_comparison_gather -// XXX extend to support odd square? 
-float patch_comparison_gather(vec3 r, vec3 r2) -{ - vec2 tile; - float min_rot = p_area; - - /* gather order: - * w z - * x y - */ - float pdiff_sq = 0; - for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) { - vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy); - diff_sq *= diff_sq; - diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)), - spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0))); - pdiff_sq += dot(diff_sq, vec4(1)); - } - min_rot = min(min_rot, pdiff_sq); - - return min_rot * p_scale; -} -#else -#define patch_comparison_gather patch_comparison -#endif - -vec4 hook() -{ - val total_weight = val(0); - val sum = val(0); - val result = val(0); - - vec3 r = vec3(0); - vec3 p = vec3(0); - vec3 me = vec3(0); - -#if T && ME == 1 // temporal & motion estimation - vec3 me_tmp = vec3(0); - float maxweight = 0; -#elif T && ME == 2 // temporal & motion estimation - vec3 me_sum = vec3(0); - float me_weight = 0; -#endif - -#if WD == 2 // weight discard - int r_index = 0; - val_packed all_weights[r_area]; - val_packed all_pixels[r_area]; -#elif WD == 1 // weight discard - val no_weights = val(0); - val discard_total_weight = val(0); - val discard_sum = val(0); -#endif - - FOR_FRAME(r) { - // XXX ME is always a frame behind, should have to option to re-research after applying ME (could do it an arbitrary number of times per frame if desired) -#if T && ME == 1 // temporal & motion estimation max weight - if (r.z > 0) { - me += me_tmp * MEF; - me_tmp = vec3(0); - maxweight = 0; - } -#elif T && ME == 2 // temporal & motion estimation weighted average - if (r.z > 0) { - me += round(me_sum / me_weight * MEF); - me_sum = vec3(0); - me_weight = 0; - } -#endif - FOR_RESEARCH(r) { // main NLM logic -#if SKIP_PATCH - val weight = val(1); -#else - val pdiff_sq = (r.z == 0) ? 
val(patch_comparison_gather(r+me, vec3(0))) : patch_comparison(r+me, vec3(0)); - val weight = range(pdiff_sq); -#endif - -#if T && ME == 1 // temporal & motion estimation max weight - me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x)); - maxweight = max(maxweight, weight.x); -#elif T && ME == 2 // temporal & motion estimation weighted average - me_sum += vec3(r.xy,0) * weight.x; - me_weight += weight.x; -#endif - -#if D1W - weight = val(weight.x); -#endif - - weight *= spatial_r(r); - -#if WD == 2 // weight discard - all_weights[r_index] = val_pack(weight); - all_pixels[r_index] = val_pack(load(r+me)); - r_index++; -#elif WD == 1 // weight discard - val wd_scale = 1.0/max(no_weights, 1); - val keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight); - discard_sum += load(r+me) * weight * (1 - keeps); - discard_total_weight += weight * (1 - keeps); - no_weights += keeps; -#endif - - sum += load(r+me) * weight; - total_weight += weight; - } // FOR_RESEARCH - } // FOR_FRAME - - val avg_weight = total_weight * r_scale; - val old_avg_weight = avg_weight; - -#if WD == 2 // true average - total_weight = val(0); - sum = val(0); - val no_weights = val(0); - - for (int i = 0; i < r_area; i++) { - val w = val_unpack(all_weights[i]); - val px = val_unpack(all_pixels[i]); - val keeps = step(avg_weight*WDT, w); - - w *= keeps; - sum += px * w; - total_weight += w; - no_weights += keeps; - } -#elif WD == 1 // moving cumulative average - total_weight -= discard_total_weight; - sum -= discard_sum; -#endif -#if WD // weight discard - avg_weight = total_weight / no_weights; -#endif - - total_weight += SW * spatial_r(vec3(0)); - sum += poi * SW * spatial_r(vec3(0)); - -#if V == 3 // weight map - result = val(avg_weight); -#else // mean - result = val(sum / total_weight); -#endif - - // store frames for temporal -#if T > 1 - -#endif -#if T && TRF - imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result)); -#elif T - 
imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2)); -#endif - -#if ASW == 0 // pre-WD weights -#define AS_weight old_avg_weight -#elif ASW == 1 // post-WD weights -#define AS_weight avg_weight -#endif - -#if ASK == 0 - val sharpening_strength = pow(AS_weight, val(ASP)); -#elif ASK == 1 - val sharpening_strength = mix( - pow(smoothstep(0.0, 1.0, AS_weight), val(ASP)), - AS_weight, ASC); - // XXX normalize the result to account for a negative ASC? -#elif ASK == 2 - val sharpening_strength = val(ASP); -#endif - -#if AS == 1 // sharpen+denoise - val sharpened = result + (poi - result) * ASF; -#elif AS == 2 // sharpen only - val sharpened = poi + (poi - result) * ASF; -#endif - -#if EP // extremes preserve - float luminance = EP_texOff(0).x; - // EPSILON is needed since pow(0,0) is undefined - float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP)); - result = mix(poi, result, ep_weight); -#endif - -#if AS == 1 // sharpen+denoise - result = mix(sharpened, result, sharpening_strength); -#elif AS == 2 // sharpen only - result = mix(sharpened, poi, sharpening_strength); -#endif - -#if V == 4 // edge map - result = sharpening_strength; -#endif - -#if (V == 3 || V == 4) && defined(CHROMA_raw) // drop chroma for these visualizations - return vec4(0.5); -#endif - -#if V == 1 - result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0); -#elif V == 2 - result = (poi - result) * 0.5 + 0.5; -#endif - - return unval(mix(poi, result, BF)); -} - diff --git a/portable_config/shaders/nlmeans_temporal.glsl b/portable_config/shaders/nlmeans_temporal.glsl index a3bf340d..c3d16f66 100644 --- a/portable_config/shaders/nlmeans_temporal.glsl +++ b/portable_config/shaders/nlmeans_temporal.glsl @@ -21,299 +21,1121 @@ // Description: nlmeans_temporal.glsl: Very experimental and buggy, limited to vo=gpu-next. 
-/* The recommended usage of this shader and its variant profiles is to add them - * to input.conf and then dispatch the appropriate shader via a keybind during - * media playback. Here is an example input.conf entry: +/* This shader is highly configurable via user variables below. Although the + * default settings should offer good quality at a reasonable speed, you are + * encouraged to tweak them to your preferences. + */ + +// The following is shader code injected from ../LQ/nlmeans.glsl +/* vi: ft=c * - * F4 no-osd change-list glsl-shaders toggle "~~/shaders/nlmeans_luma.glsl"; show-text "Non-local means (LUMA only)" + * Based on vf_nlmeans.c from FFmpeg. * - * These shaders can also be enabled by default in mpv.conf, for example: + * Copyright (c) 2022 an3223 + * Copyright (c) 2016 Clément Bœsch * - * glsl-shaders='~~/shaders/nlmeans.glsl' + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 2.1 of the License, or (at + * your option) any later version. * - * Both of the examples above assume the shaders are located in a subdirectory - * named "shaders" within mpv's config directory. Refer to the mpv - * documentation for more details. + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License + * for more details. * - * This shader is highly configurable via user variables below. Although the + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see . + */ + +// Description: nlmeans.glsl: Faster, but lower quality. + +/* This shader is highly configurable via user variables below. 
Although the * default settings should offer good quality at a reasonable speed, you are - * encouraged to tweak them to your preferences. Be mindful that certain - * settings may greatly affect speed. + * encouraged to tweak them to your preferences. + */ + +//!HOOK LUMA +//!HOOK CHROMA +//!BIND HOOKED +//!DESC Non-local means (nlmeans.glsl) +//!SAVE RF_LUMA + +// User variables + +// It is generally preferable to denoise luma and chroma differently, so the +// user variables for luma and chroma are split. + +// Denoising factor (level of blur, higher means more blur) +#ifdef LUMA_raw +#define S 3.5968056672833097 +#else +#define S 5.191526541606411 +#endif + +/* Adaptive sharpening + * + * Performs an unsharp mask by subtracting the spatial kernel's blur from the + * NLM blur. For sharpen+denoise the sharpening is limited to edge areas and + * denoising is done everywhere else. + * + * Use V=4 to visualize which areas are sharpened (black means sharpen). * - * Denoising is most useful for noisy content. If there is no perceptible - * noise, you probably won't see a positive difference. + * AS: + * - 0: disable + * - 1: sharpen+denoise + * - 2: sharpen only + * ASF: Higher numbers make a sharper image + * ASA: Anti-ringing, higher numbers increase strength + * ASP: Power, lower numbers increase sharpening on lower frequency detail + */ +#ifdef LUMA_raw +#define AS 0 +#define ASF 0.1625 +#define ASA 5.0 +#define ASP 0.5 +#else +#define AS 0 +#define ASF 0.1625 +#define ASA 5.0 +#define ASP 0.5 +#endif + +/* Starting weight * - * The default settings are generally tuned for low noise and high detail - * preservation. The "medium" and "heavy" profiles are tuned for higher levels - * of noise. + * Also known as the center weight. This represents the weight of the + * pixel-of-interest. Lower numbers may help handle heavy noise & ringing. * - * The denoiser will not work properly if the content has been upscaled - * beforehand (whether it was done by you or not). 
In such cases, consider - * issuing a command to downscale in the mpv console (backtick ` key): + * EPSILON should be used instead of zero to avoid divide-by-zero errors. + */ +#ifdef LUMA_raw +#define SW 0.7392620481427672 +#else +#define SW 0.6448288408806067 +#endif + +/* Weight discard * - * vf toggle scale=-2:720 + * Reduces weights that fall below a fraction of the average weight. This culls + * the most dissimilar samples from the blur, which can yield a better result, + * especially around edges. + * + * WD: + * - 2: Mean. Better quality, but slower and requires GLSL 4.0 or later + * - 1: Moving cumulative average. Fast but inaccurate, blurs directionally. + * - 0: Disable * - * ...replacing 720 with whatever resolution seems appropriate. Rerun the - * command to undo the downscale. It may take some trial-and-error to find the - * proper resolution. + * WDT: Threshold coefficient, higher numbers discard more + * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes + * WDS (not for WDK=is_zero): Higher numbers are more eager to reduce weights */ +#ifdef LUMA_raw +#define WD 1 +#define WDT 0.580415381682815 +#define WDP 5.381278367349288 +#define WDS 1.0 +#else +#define WD 1 +#define WDT 0.913447511792627 +#define WDP 5.832936323930807 +#define WDS 1.0 +#endif -/* Regarding speed +/* Extremes preserve * - * Speed may vary wildly for different vo and gpu-api settings. Generally - * vo=gpu-next and gpu-api=vulkan are recommended for the best speed, but this - * may be different for your system. + * Reduce denoising in very bright/dark areas. * - * If your GPU doesn't support textureGather, or if you are on a version of mpv - * prior to 0.35.0, then consider setting RI/RFI to 0, or try the LQ profile + * Disabled by default now. If you want to reenable this, set EP=3/ in + * Makefile.nlm and rebuild. 
* - * If you plan on tinkering with NLM's settings, read below: + * The downscaling factor of the EP shader stage affects what is considered a + * bright/dark area. * - * textureGather only applies to luma and limited to the these configurations: + * This is incompatible with RGB. If you have RGB hooks enabled then you will + * have to delete the EP shader stage or specify EP=0 through shader_cfg. * - * - PS={3,7}:P=3:PST=0:RI={0,1,3}:RFI={0,1,2} - * - Default, very fast, rotations and reflections should be free - * - If this is unusually slow then try changing gpu-api and vo - * - If it's still slow, try setting RI/RFI to 0. + * EP: 1 to enable, 0 to disable + * DP: EP strength on dark areas, 0 to fully denoise + * BP: EP strength on bright areas, 0 to fully denoise + */ +#ifdef LUMA_raw +#define EP 0 +#define BP 0.75 +#define DP 0.25 +#else +#define EP 0 +#define BP 0.0 +#define DP 0.0 +#endif + +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ +/* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ + +/* textureGather applicable configurations: * - * - PS=6:RI={0,1,3}:RFI={0,1,2} + * - PS={0,3,7,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2} + * - PS={0,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2} + * - PS=6:RI=0:RFI=0 * - Currently the only scalable variant - * - Patch shape is asymmetric on two axis - * - Rotations should have very little speed impact - * - Reflections may have a significant speed impact * * Options which always disable textureGather: - * - PD - * - NG + * - NG + * - SAMPLE + * - PD + * + * Running without textureGather may be much slower. 
*/ -// The following is shader code injected from guided.glsl -/* vi: ft=c +/* Patch & research sizes * - * Copyright (c) 2022 an3223 + * P should be an odd number. Higher values are slower and not always better. * - * This program is free software: you can redistribute it and/or modify it - * under the terms of the GNU Lesser General Public License as published by - * the Free Software Foundation, either version 2.1 of the License, or (at - * your option) any later version. + * R should be an odd number greater than or equal to 3. Higher values are + * generally better, but slower, blurrier, and gives diminishing returns. + */ +#ifdef LUMA_raw +#define P 3 +#define R 5 +#else +#define P 3 +#define R 5 +#endif + +/* Patch and research shapes * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License - * for more details. + * Different shapes have different speed and quality characteristics. Every + * shape (besides square) is smaller than square. * - * You should have received a copy of the GNU Lesser General Public License - * along with this program. If not, see . + * PS applies to patches, RS applies to research zones. + * + * 0: square (symmetrical) + * 1: horizontal line (asymmetric) + * 2: vertical line (asymmetric) + * 3: diamond (symmetrical) + * 4: triangle (asymmetric, pointing upward) + * 5: truncated triangle (asymmetric on two axes, last row halved) + * 6: even sized square (asymmetric on two axes) + * 7: plus (symmetrical) + * 8: plus X (symmetrical) */ +#ifdef LUMA_raw +#define RS 3 +#define PS 4 +#else +#define RS 3 +#define PS 3 +#endif -// Description: guided.glsl: Guided by the downscaled image +/* Robust filtering + * + * This setting is dependent on code generation from shader_cfg, so this + * setting can only be enabled via shader_cfg.
+ * + * Computes weights on a guide, which could be a downscaled image or the output + * of another shader, and applies the weights to the original image + */ +#define RF_LUMA 0 +#define RF 0 -/* The radius can be adjusted with the MEANI stage's downscaling factor. - * Higher numbers give a bigger radius. +/* Rotational/reflectional invariance + * + * Number of rotations/reflections to try for each patch comparison. Can be + * slow, but improves feature preservation. More rotations/reflections gives + * diminishing returns. The most similar rotation/reflection will be used. * - * The E variable can be found in the A stage. + * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a + * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc. * - * The subsampling (fast guided filter) can be adjusted with the I stage's - * downscaling factor. Higher numbers are faster. + * Consider setting SAMPLE=1 if setting RI to a setting that would require + * sampling between pixels. * - * The guide's subsampling can be adjusted with the PREI stage's downscaling - * factor. Higher numbers downscale more. + * RI: Rotational invariance + * RFI (0 to 2): Reflectional invariance */ +#ifdef LUMA_raw +#define RI 0 +#define RFI 0 +#else +#define RI 0 +#define RFI 0 +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!BIND HOOKED -//!WIDTH HOOKED.w 1.25 / -//!HEIGHT HOOKED.h 1.25 / -//!DESC Guided filter (PREI) -//!SAVE _INJ_PREI +/* Temporal denoising + * + * This setting is dependent on code generation from shader_cfg, so this + * setting can only be enabled via shader_cfg. 
+ * + * Caveats: + * - Slower: + * - Each frame needs to be researched (more samples & more math) + * - Gather optimizations only apply to the current frame + * - Requires vo=gpu-next + * - Luma-only (this is a bug) + * - Buggy + * + * May cause motion blur and may struggle more with noise that persists across + * multiple frames (e.g., from compression or duplicate frames), but can work + * very well on high quality video. + * + * Motion estimation (ME) should improve quality without impacting speed. + * + * T: number of frames used + * ME: motion estimation, 0 for none, 1 for max weight, 2 for weighted avg + * MEF: estimate factor, compensates for ME being one frame behind + * TRF: compare against the denoised frames + */ +#ifdef LUMA_raw +#define T 0 +#define ME 1 +#define MEF 2 +#define TRF 0 +#else +#define T 0 +#define ME 0 +#define MEF 2 +#define TRF 0 +#endif + +/* Spatial kernel + * + * Increasing the spatial denoising factor (SS) reduces the weight of further + * pixels. + * + * Spatial distortion instructs the spatial kernel to view that axis as + * closer/further, for instance SD=(1,1,0.5) would make the temporal axis + * appear closer and increase blur between frames. + * + * The intra-patch variants are supposed to help with larger patch sizes. 
+ * + * SST: enables spatial kernel if R>=SST, 0 fully disables + * SS: spatial sigma + * SD: spatial distortion (X, Y, time) + * PSS: intra-patch spatial sigma + * PST: enables intra-patch spatial kernel if P>=PST, 0 fully disables + * PSD: intra-patch spatial distortion (X, Y) + */ +#ifdef LUMA_raw +#define SST 1 +#define SS 0.49764743714339127 +#define SD vec3(1,1,1) +#define PST 0 +#define PSS 0.0 +#define PSD vec2(1,1) +#else +#define SST 1 +#define SS 0.32091162692066677 +#define SD vec3(1,1,1) +#define PST 0 +#define PSS 0.0 +#define PSD vec2(1,1) +#endif + +/* Kernels + * + * SK: spatial kernel + * RK: range kernel (takes patch differences) + * PSK: intra-patch spatial kernel + * WDK: weight discard kernel + * WD1TK (WD=1 only): weight discard tolerance kernel + * + * List of available kernels: + * + * bicubic + * cos + * gaussian + * lanczos + * quadratic_ (unclamped) + * sinc + * sinc_ (unclamped) + * sinc3 + * sphinx + * sphinx_ (unclamped) + * triangle_ (unclamped) + * triangle + */ +#ifdef LUMA_raw +#define SK gaussian +#define RK gaussian +#define PSK gaussian +#define WDK is_zero +#define WD1TK gaussian +#else +#define SK gaussian +#define RK gaussian +#define PSK gaussian +#define WDK is_zero +#define WD1TK gaussian +#endif + +/* Sampling method + * + * In most cases this shouldn't make any difference, only set to bilinear if + * it's necessary to sample between pixels (e.g., RI=2). + * + * 0: nearest neighbor + * 1: bilinear + */ +#ifdef LUMA_raw +#define SAMPLE 0 +#else +#define SAMPLE 0 +#endif + +/* Research scaling factor + * + * Higher numbers sample more sparsely as the distance from the POI grows.
+ */ +#ifdef LUMA_raw +#define RSF 0.0 +#else +#define RSF 0.0 +#endif + +// Scaling factor (should match WIDTH/HEIGHT) +#ifdef LUMA_raw +#define SF 1 +#else +#define SF 1 +#endif + +// Use the guide image as the input image +#ifdef LUMA_raw +#define GUIDE_INPUT 0 +#else +#define GUIDE_INPUT 0 +#endif + +/* Visualization + * + * 0: off + * 1: absolute difference between input/output to the power of 0.25 + * 2: difference between input/output centered on 0.5 + * 3: post-WD weight map + * 4: pre-WD weight map + * 5: unsharp mask + * 6: EP + */ +#ifdef LUMA_raw +#define V 0 +#else +#define V 0 +#endif + +// Blur factor (0.0 returns the input image, 1.0 returns the output image) +#ifdef LUMA_raw +#define BF 1.0 +#else +#define BF 1.0 +#endif + +// Force disable textureGather +#ifdef LUMA_raw +#define NG 0 +#else +#define NG 0 +#endif + +// Patch donut (probably useless) +#ifdef LUMA_raw +#define PD 0 +#else +#define PD 0 +#endif + +// Duplicate 1st weight (for luma-guided-chroma) +#ifdef LUMA_raw +#define D1W 0 +#else +#define D1W 0 +#endif + +// Skip patch comparison +#ifdef LUMA_raw +#define SKIP_PATCH 0 +#else +#define SKIP_PATCH 0 +#endif + +// Shader code + +#define EPSILON 1.2e-38 +#define M_PI 3.14159265358979323846 +#define POW2(x) ((x)*(x)) +#define POW3(x) ((x)*(x)*(x)) +#define bicubic_(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0)))) +#define bicubic(x) bicubic_(clamp((x), 0.0, 2.0)) +#define gaussian(x) exp(-1 * POW2(x)) +#define quadratic_(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5)) +#define quadratic(x) quadratic_(clamp((x), 0.0, 1.5)) +#define sinc_(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI)) +#define sinc(x) sinc_(clamp((x), 0.0, 1.0)) +#define sinc3(x) sinc_(clamp((x), 0.0, 3.0)) +#define lanczos(x) (sinc3(x) * sinc(x)) +#define sphinx_(x) ((x) < 1e-8 ? 
1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI)) +#define sphinx(x) sphinx_(clamp((x), 0.0, 1.4302966531242027)) +#define triangle_(x) (1 - (x)) +#define triangle(x) triangle_(clamp((x), 0.0, 1.0)) +#define is_zero(x) int(x == 0) + +// XXX could maybe be better optimized on LGC +#if defined(LUMA_raw) +#define val float +#define val_swizz(v) (v.x) +#define unval(v) vec4(v.x, 0, 0, poi_.a) +#define val_packed val +#define val_pack(v) (v) +#define val_unpack(v) (v) +#elif defined(CHROMA_raw) +#define val vec2 +#define val_swizz(v) (v.xy) +#define unval(v) vec4(v.x, v.y, 0, poi_.a) +#define val_packed uint +#define val_pack(v) packUnorm2x16(v) +#define val_unpack(v) unpackUnorm2x16(v) +#else +#define val vec3 +#define val_swizz(v) (v.xyz) +#define unval(v) vec4(v.x, v.y, v.z, poi_.a) +#define val_packed val +#define val_pack(v) (v) +#define val_unpack(v) (v) +#endif + +#if PS == 6 +const int hp = P/2; +#else +const float hp = int(P/2) - 0.5*(1-(P%2)); // sample between pixels for even patch sizes +#endif + +#if RS == 6 +const int hr = R/2; +#else +const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes +#endif + +// patch/research shapes +// each shape is depicted in a comment, where Z=5 (Z corresponds to P or R) +// dots (.) represent samples (pixels) and X represents the pixel-of-interest + +// Z ..... +// Z ..... +// Z ..X.. +// Z ..... +// Z ..... +#define S_SQUARE(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz; z.y <= hz; incr) + +// (in this instance Z=4) +// Z .... +// Z .... +// Z ..X. +// Z .... +#define S_SQUARE_EVEN(z,hz,incr) for (z.x = -hz; z.x < hz; z.x++) for (z.y = -hz; z.y < hz; incr) + +// Z-4 . +// Z-2 ... +// Z ..X.. +#define S_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz); incr) + +// Z-4 . +// Z-2 ... 
+// hz+1 ..X +#define S_TRUNC_TRIANGLE(z,hz,incr) for (z.y = -hz; z.y <= 0; z.y++) for (z.x = -abs(abs(z.y) - hz); z.x <= abs(abs(z.y) - hz)*int(z.y!=0); incr) +#define S_TRIANGLE_A(hz,Z) int(hz*hz+Z) + +// Z-4 . +// Z-2 ... +// Z ..X.. +// Z-2 ... +// Z-4 . +#define S_DIAMOND(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(abs(z.x) - hz); z.y <= abs(abs(z.x) - hz); incr) +#define S_DIAMOND_A(hz,Z) int(hz*hz*2+Z) + +// +// Z ..X.. +// +#define S_HORIZONTAL(z,hz,incr) for (z.y = 0; z.y <= 0; z.y++) for (z.x = -hz; z.x <= hz; incr) + +// 90 degree rotation of S_HORIZONTAL +#define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr) + +// 1 . +// 1 . +// Z ..X.. +// 1 . +// 1 . +#define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr) +#define S_PLUS_A(hz,Z) (Z*2 - 1) + +// 3 . . . +// 3 ... +// Z ..X.. +// 3 ... +// 3 . . . +#define S_PLUS_X(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(z.x) + -hz * int(z.x == 0); z.y <= abs(z.x) + hz * int(z.x == 0); incr) +#define S_PLUS_X_A(hz,Z) (Z*4 - 3) + +// 1x1 square +#define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++) + +#define T1 (T+1) +#define FOR_FRAME(r) for (r.z = 0; r.z < T1; r.z++) + +#ifdef LUMA_raw +#define RF_ RF_LUMA +#else +#define RF_ RF +#endif + +// donut increment, increments without landing on (0,0,0) +// much faster than a continue statement +#define DINCR(z,c,a) ((z.c += a),(z.c += int(z == vec3(0)))) + +#define R_AREA(a) (a * T1 - 1) + +// research shapes +// XXX would be nice to have the option of temporally-varying research sizes +#if R == 0 || R == 1 +#define FOR_RESEARCH(r) S_1X1(r) +const int r_area = R_AREA(1); +#elif RS == 8 +#define FOR_RESEARCH(r) S_PLUS_X(r,hr,DINCR(r,y,max(1,abs(r.x)))) +const int r_area = R_AREA(S_PLUS_X_A(hr,R)); +#elif RS == 7 +#define FOR_RESEARCH(r) S_PLUS(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(S_PLUS_A(hr,R)); +#elif RS == 6 +#define 
FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(R*R); +#elif RS == 5 +#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,DINCR(r,x,1)) +const int r_area = R_AREA(S_TRIANGLE_A(hr,hr)); +#elif RS == 4 +#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,DINCR(r,x,1)) +const int r_area = R_AREA(S_TRIANGLE_A(hr,R)); +#elif RS == 3 +#define FOR_RESEARCH(r) S_DIAMOND(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(S_DIAMOND_A(hr,R)); +#elif RS == 2 +#define FOR_RESEARCH(r) S_VERTICAL(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(R); +#elif RS == 1 +#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,DINCR(r,x,1)) +const int r_area = R_AREA(R); +#elif RS == 0 +#define FOR_RESEARCH(r) S_SQUARE(r,hr,DINCR(r,y,1)) +const int r_area = R_AREA(R*R); +#endif + +#define RI1 (RI+1) +#define RFI1 (RFI+1) + +#if RI +#define FOR_ROTATION for (float ri = 0; ri < 360; ri+=360.0/RI1) +#else +#define FOR_ROTATION +#endif + +#if RFI +#define FOR_REFLECTION for (int rfi = 0; rfi < RFI1; rfi++) +#else +#define FOR_REFLECTION +#endif + +#if PD +#define PINCR DINCR +#else +#define PINCR(z,c,a) (z.c += a) +#endif + +#define P_AREA(a) (a - PD) + +// patch shapes +#if P == 0 || P == 1 +#define FOR_PATCH(p) S_1X1(p) +const int p_area = P_AREA(1); +#elif PS == 8 +#define FOR_PATCH(p) S_PLUS_X(p,hp,PINCR(p,y,max(1,abs(p.x)))) +const int p_area = P_AREA(S_PLUS_X_A(hp,P)); +#elif PS == 7 +#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y,1)) +const int p_area = P_AREA(S_PLUS_A(hp,P)); +#elif PS == 6 +#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y,1)) +const int p_area = P_AREA(P*P); +#elif PS == 5 +#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x,1)) +const int p_area = P_AREA(S_TRIANGLE_A(hp,hp)); +#elif PS == 4 +#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x,1)) +const int p_area = P_AREA(S_TRIANGLE_A(hp,P)); +#elif PS == 3 +#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y,1)) +const int p_area = P_AREA(S_DIAMOND_A(hp,P)); +#elif PS == 2 +#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y,1)) +const int 
p_area = P_AREA(P); +#elif PS == 1 +#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x,1)) +const int p_area = P_AREA(P); +#elif PS == 0 +#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y,1)) +const int p_area = P_AREA(P*P); +#endif + +const float r_scale = 1.0/r_area; +const float p_scale = 1.0/p_area; + +#if SAMPLE == 0 +#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * (vec2(off) + 0.5 - fract((pos) * (size)))) +#else +#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * vec2(off)) +#endif + +#define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off) + +#if RF_ && defined(LUMA_raw) +#define load2_(off) sample(RF_LUMA_tex, RF_LUMA_pos, RF_LUMA_size, RF_LUMA_pt, off) +#define gather_offs(off, off_arr) (RF_LUMA_mul * vec4(textureGatherOffsets(RF_LUMA_raw, RF_LUMA_pos + vec2(off) * RF_LUMA_pt, off_arr))) +#define gather(off) RF_LUMA_gather(RF_LUMA_pos + (off) * RF_LUMA_pt, 0) +#elif RF_ && D1W +#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off) +#define gather_offs(off, off_arr) (RF_mul * vec4(textureGatherOffsets(RF_raw, RF_pos + vec2(off) * RF_pt, off_arr))) +#define gather(off) RF_gather(RF_pos + (off) * RF_pt, 0) +#elif RF_ +#define load2_(off) sample(RF_tex, RF_pos, RF_size, RF_pt, off) +#else +#define load2_(off) load_(off) +#define gather_offs(off, off_arr) (HOOKED_mul * vec4(textureGatherOffsets(HOOKED_raw, HOOKED_pos + vec2(off) * HOOKED_pt, off_arr))) +#define gather(off) HOOKED_gather(HOOKED_pos + (off)*HOOKED_pt, 0) +#endif + +#if T +val load(vec3 off) +{ + switch (min(int(off.z), frame)) { + case 0: return val_swizz(load_(off)); + + } +} +val load2(vec3 off) +{ + return off.z == 0 ? 
val_swizz(load2_(off)) : load(off); +} +#else +#define load(off) val_swizz(load_(off)) +#define load2(off) val_swizz(load2_(off)) +#endif + +val poi2 = load2(vec3(0)); // guide pixel-of-interest +#if GUIDE_INPUT +#define poi poi2 +#else +vec4 poi_ = load_(vec3(0)); +val poi = val_swizz(poi_); // pixel-of-interest +#endif + +#if RI // rotation +vec2 rot(vec2 p, float d) +{ + return vec2( + p.x * cos(radians(d)) - p.y * sin(radians(d)), + p.x * sin(radians(d)) + p.y * cos(radians(d)) /* standard 2D rotation by d degrees: y' = x*sin + y*cos, so d == 0 is the identity */ + ); +} +#else +#define rot(p, d) (p) +#endif + +#if RFI // reflection +vec2 ref(vec2 p, int d) +{ + switch (d) { + case 0: return p; + case 1: return p * vec2(1, -1); + case 2: return p * vec2(-1, 1); + } +} +#else +#define ref(p, d) (p) +#endif + +#if SST && R >= SST +float spatial_r(vec3 v) +{ + v.xy += 0.5 - fract(HOOKED_pos*HOOKED_size); + return SK(length(v*SD)*SS); +} +#else +#define spatial_r(v) (1) +#endif + +#if PST && P >= PST +#define spatial_p(v) PSK(length(v*PSD)*PSS) +#else +#define spatial_p(v) (1) +#endif + +val range(val pdiff_sq) +{ + const float h = max(S, 0.0) * 0.013; + const float pdiff_scale = 1.0/(h*h); + pdiff_sq = sqrt(pdiff_sq * pdiff_scale); +#if defined(LUMA_raw) + return RK(pdiff_sq); +#elif defined(CHROMA_raw) + return vec2(RK(pdiff_sq.x), RK(pdiff_sq.y)); +#else + return vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z)); +#endif +} -vec4 hook() +val patch_comparison(vec3 r, vec3 r2) { - return HOOKED_texOff(0); + vec3 p; + val min_rot = val(p_area); + + FOR_ROTATION FOR_REFLECTION { + val pdiff_sq = val(0); + FOR_PATCH(p) { + vec3 transformed_p = vec3(ref(rot(p.xy, ri), rfi), p.z); + val diff_sq = load2(p + r2) - load2((transformed_p + r) * SF); + diff_sq *= diff_sq; + diff_sq = 1 - (1 - diff_sq) * spatial_p(p.xy); + pdiff_sq += diff_sq; + } + min_rot = min(min_rot, pdiff_sq); + } + + return min_rot * p_scale; } -//!HOOK LUMA -//!HOOK CHROMA -//!BIND _INJ_PREI -//!WIDTH HOOKED.w -//!HEIGHT HOOKED.h -//!DESC Guided filter (I) -//!SAVE _INJ_I +#define
NO_GATHER (PD == 0 && NG == 0 && SAMPLE == 0) // never textureGather if any of these conditions are false +#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3 || RI == 7) -vec4 hook() +#if (defined(LUMA_gather) || D1W) && ((PS == 0 || ((PS == 3 || PS == 7) && RI != 7) || PS == 8) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER +// 3x3 diamond/plus patch_comparison_gather +// XXX extend to support arbitrary sizes (probably requires code generation) +// XXX support PSS +const ivec2 offsets_adj[4] = { ivec2(0,-1), ivec2(1,0), ivec2(0,1), ivec2(-1,0) }; +const ivec2 offsets_adj_sf[4] = { ivec2(0,-1) * SF, ivec2(1,0) * SF, ivec2(0,1) * SF, ivec2(-1,0) * SF }; +vec4 poi_patch_adj = gather_offs(0, offsets_adj); +#if PS == 0 || PS == 8 +const ivec2 offsets_diag[4] = { ivec2(-1,-1), ivec2(1,-1), ivec2(1,1), ivec2(-1,1) }; +const ivec2 offsets_diag_sf[4] = { ivec2(-1,-1) * SF, ivec2(1,-1) * SF, ivec2(1,1) * SF, ivec2(-1,1) * SF }; +vec4 poi_patch_diag = gather_offs(0, offsets_diag); +#endif +float patch_comparison_gather(vec3 r, vec3 r2) { -return _INJ_PREI_texOff(0); -} - + float min_rot = p_area - 1; + vec4 transformer_adj = gather_offs(r, offsets_adj_sf); +#if PS == 0 || PS == 8 + vec4 transformer_diag = gather_offs(r, offsets_diag_sf); +#endif + FOR_ROTATION { + FOR_REFLECTION { +#if RFI + /* xxy + * w y + * wzz + */ + switch(rfi) { + case 1: + transformer_adj = transformer_adj.zyxw; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.zyxw; +#endif + break; + case 2: + transformer_adj = transformer_adj.xwzy; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.xwzy; +#endif + break; + } +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (P) -//!BIND HOOKED -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_P + vec4 diff = poi_patch_adj - transformer_adj; +#if PS == 0 || PS == 8 + diff += poi_patch_diag - transformer_diag; +#endif + float diff_sq = dot(diff * diff, vec4(1)); + min_rot = min(diff_sq, min_rot); -vec4 hook() +// 
un-reflect +#if RFI + switch(rfi) { + case 1: + transformer_adj = transformer_adj.zyxw; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.zyxw; +#endif + break; + case 2: + transformer_adj = transformer_adj.xwzy; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.xwzy; +#endif + break; + } +#endif + } // FOR_REFLECTION +#if RI == 7 + transformer_adj = transformer_adj.wxyz; + // swap adjacents for diagonals + transformer_adj += transformer_diag; + transformer_diag = transformer_adj - transformer_diag; + transformer_adj -= transformer_diag; +#elif RI == 3 + transformer_adj = transformer_adj.wxyz; +#elif RI == 1 + transformer_adj = transformer_adj.zwxy; +#endif +#if RI == 3 && (PS == 0 || PS == 8) + transformer_diag = transformer_diag.wxyz; +#elif RI == 1 && (PS == 0 || PS == 8) + transformer_diag = transformer_diag.zwxy; +#endif + } // FOR_ROTATION + float center_diff = poi2.x - load2(r).x; + return (center_diff * center_diff + min_rot) * p_scale; +} +#elif (defined(LUMA_gather) || D1W) && PS == 4 && P == 3 && RI == 0 && RFI == 0 && NO_GATHER +const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,0), ivec2(1,0) }; +const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,0) * SF, ivec2(1,0) * SF }; +vec4 poi_patch = gather_offs(0, offsets); +float patch_comparison_gather(vec3 r, vec3 r2) { - return HOOKED_texOff(0); + vec4 pdiff = poi_patch - gather_offs(r, offsets_sf); + return dot(pdiff * pdiff, vec4(1)) * p_scale; } - -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (MEANI) -//!BIND _INJ_I -//!WIDTH _INJ_I.w 1.5 / -//!HEIGHT _INJ_I.h 1.5 / -//!SAVE _INJ_MEANI - -vec4 hook() +#elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER +// tiled even square patch_comparison_gather +// XXX extend to support odd square? 
+float patch_comparison_gather(vec3 r, vec3 r2) { -return _INJ_I_texOff(0); + vec2 tile; + float min_rot = p_area; + + /* gather order: + * w z + * x y + */ + float pdiff_sq = 0; + for (tile.x = -hp; tile.x < hp; tile.x+=2) for (tile.y = -hp; tile.y < hp; tile.y+=2) { + vec4 diff_sq = gather(tile + r.xy) - gather(tile + r2.xy); + diff_sq *= diff_sq; + diff_sq = 1 - (1 - diff_sq) * vec4(spatial_p(tile+vec2(0,1)), spatial_p(tile+vec2(1,1)), + spatial_p(tile+vec2(1,0)), spatial_p(tile+vec2(0,0))); + pdiff_sq += dot(diff_sq, vec4(1)); + } + min_rot = min(min_rot, pdiff_sq); + + return min_rot * p_scale; } - -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (MEANP) -//!BIND _INJ_P -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_MEANP +#else +#define patch_comparison_gather patch_comparison +#endif vec4 hook() { -return _INJ_P_texOff(0); -} + val total_weight = val(0); + val sum = val(0); + val result = val(0); -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (_INJ_I_SQ) -//!BIND _INJ_I -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_I_SQ + vec3 r = vec3(0); + vec3 p = vec3(0); + vec3 me = vec3(0); -vec4 hook() -{ -return _INJ_I_texOff(0) * _INJ_I_texOff(0); -} +#if T && ME == 1 // temporal & motion estimation + vec3 me_tmp = vec3(0); + float maxweight = 0; +#elif T && ME == 2 // temporal & motion estimation + vec3 me_sum = vec3(0); + float me_weight = 0; +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (_INJ_IXP) -//!BIND _INJ_I -//!BIND _INJ_P -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_IXP +#if AS + val total_weight_s = val(0); + val sum_s = val(0); +#endif -vec4 hook() -{ -return _INJ_I_texOff(0) * _INJ_P_texOff(0); -} +#if WD == 2 // weight discard (mean) + int r_index = 0; + val_packed all_weights[r_area]; + val_packed all_pixels[r_area]; +#elif WD == 1 // weight discard (moving cumulative average) + int r_iter = 1; + val wd_total_weight = val(0); + val wd_sum = val(0); +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!DESC 
Guided filter (CORRI) -//!BIND _INJ_I_SQ -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_CORRI + FOR_FRAME(r) { + // XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired) +#if T && ME == 1 // temporal & motion estimation max weight + if (r.z > 0) { + me += me_tmp * MEF; + me_tmp = vec3(0); + maxweight = 0; + } +#elif T && ME == 2 // temporal & motion estimation weighted average + if (r.z > 0) { + me += round(me_sum / me_weight * MEF); + me_sum = vec3(0); + me_weight = 0; + } +#endif + FOR_RESEARCH(r) { + // r coords with appropriate transformations applied + vec3 tr = vec3(r.xy + floor(r.xy * RSF), r.z); + float spatial_weight = spatial_r(tr); + tr.xy += me.xy; -vec4 hook() -{ -return _INJ_I_SQ_texOff(0); -} + val px = load(tr); -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (CORRP) -//!BIND _INJ_IXP -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_CORRP +#if SKIP_PATCH + val weight = val(1); +#else + val pdiff_sq = (r.z == 0) ? 
val(patch_comparison_gather(tr, vec3(0))) : patch_comparison(tr, vec3(0)); + val weight = range(pdiff_sq); +#endif -vec4 hook() -{ -return _INJ_IXP_texOff(0); -} +#if T && ME == 1 // temporal & motion estimation max weight + me_tmp = vec3(tr.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x)); + maxweight = max(maxweight, weight.x); +#elif T && ME == 2 // temporal & motion estimation weighted average + me_sum += vec3(tr.xy,0) * weight.x; + me_weight += weight.x; +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (A) -//!BIND _INJ_MEANI -//!BIND _INJ_MEANP -//!BIND _INJ_CORRI -//!BIND _INJ_CORRP -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_A +#if D1W + weight = val(weight.x); +#endif -#define E 0.0013 + weight *= spatial_weight; -vec4 hook() -{ -vec4 var = _INJ_CORRI_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANI_texOff(0); -vec4 cov = _INJ_CORRP_texOff(0) - _INJ_MEANI_texOff(0) * _INJ_MEANP_texOff(0); - return cov / (var + E); -} +#if AS + spatial_weight *= int(r.z == 0); // ignore temporal + sum_s += px * spatial_weight; + total_weight_s += spatial_weight; +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (B) -//!BIND _INJ_A -//!BIND _INJ_MEANI -//!BIND _INJ_MEANP -//!WIDTH _INJ_I.w -//!HEIGHT _INJ_I.h -//!SAVE _INJ_B +#if WD == 2 // weight discard (mean) + all_weights[r_index] = val_pack(weight); + all_pixels[r_index] = val_pack(px); + r_index++; +#elif WD == 1 // weight discard (moving cumulative average) + val wd_scale = val(1.0/r_iter); + val below_threshold = WDS * abs(min(val(0.0), weight - (total_weight * wd_scale * WDT * WD1TK(sqrt(wd_scale*WDP))))); +#if defined(LUMA_raw) + val wdkf = WDK(below_threshold); +#elif defined(CHROMA_raw) + val wdkf = vec2(WDK(below_threshold.x), WDK(below_threshold.y)); +#else + val wdkf = vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); /* each channel uses its own threshold, matching the WD == 2 path */ +#endif + wd_sum += px * weight * wdkf; + wd_total_weight += weight * wdkf; + r_iter++; +#endif -vec4 hook()
-{ -return _INJ_MEANP_texOff(0) - _INJ_A_texOff(0) * _INJ_MEANI_texOff(0); -} + sum += px * weight; + total_weight += weight; + } // FOR_RESEARCH + } // FOR_FRAME -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (MEANA) -//!BIND _INJ_A -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_MEANA + val avg_weight = total_weight * r_scale; + val old_avg_weight = avg_weight; -vec4 hook() -{ -return _INJ_A_texOff(0); -} +#if WD == 2 // weight discard (mean) + total_weight = val(0); + sum = val(0); -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter (MEANB) -//!BIND _INJ_B -//!WIDTH _INJ_MEANI.w -//!HEIGHT _INJ_MEANI.h -//!SAVE _INJ_MEANB + for (int i = 0; i < r_area; i++) { + val weight = val_unpack(all_weights[i]); + val px = val_unpack(all_pixels[i]); -vec4 hook() -{ -return _INJ_B_texOff(0); -} + val below_threshold = WDS * abs(min(val(0.0), weight - (avg_weight * WDT))); +#if defined(LUMA_raw) + weight *= WDK(below_threshold); +#elif defined(CHROMA_raw) + weight *= vec2(WDK(below_threshold.x), WDK(below_threshold.y)); +#else + weight *= vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); +#endif -//!HOOK LUMA -//!HOOK CHROMA -//!DESC Guided filter -//!BIND HOOKED -//!BIND _INJ_MEANA -//!BIND _INJ_MEANB -//!SAVE RF_LUMA + sum += px * weight; + total_weight += weight; + } +#elif WD == 1 // weight discard (moving cumulative average) + total_weight = wd_total_weight; + sum = wd_sum; +#endif +#if WD // weight discard + avg_weight = total_weight * r_scale; +#endif -vec4 hook() -{ -return _INJ_MEANA_texOff(0) * HOOKED_texOff(0) + _INJ_MEANB_texOff(0); + total_weight += SW * spatial_r(vec3(0)); + sum += poi * SW * spatial_r(vec3(0)); + result = val(sum / total_weight); + + // store frames for temporal +#if T > 1 + +#endif +#if T && TRF + imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(result)); +#elif T + imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2)); +#endif + +#if AS == 1 // sharpen+denoise +#define 
AS_base result +#elif AS == 2 // sharpen only +#define AS_base poi +#endif +#if AS + val usm = result - sum_s/total_weight_s; + usm = exp(log(abs(usm))*ASP) * sign(usm); // avoiding pow() since it's buggy on nvidia + usm *= gaussian(abs((AS_base + usm - 0.5) / 1.5) * ASA); + usm *= ASF; + result = AS_base + usm; +#endif + +#if EP // extremes preserve + float luminance = EP_texOff(0).x; + // EPSILON is needed since pow(0,0) is undefined + float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? DP : BP)); + result = mix(poi, result, ep_weight); +#else + float ep_weight = 0; +#endif + +#if V == 1 + result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0); +#elif V == 2 + result = (poi - result) * 0.5 + 0.5; +#elif V == 3 // post-WD weight map + result = avg_weight; +#elif V == 4 // pre-WD edge map + result = old_avg_weight; +#elif V == 5 + result = 0.5 + usm; +#elif V == 6 + result = val(1 - ep_weight); +#endif + +// XXX visualize chroma for these +#if defined(CHROMA_raw) && (V == 3 || V == 4 || V == 6) + return vec4(0.5); +#endif + + return unval(mix(poi, result, BF)); } -// End of source code injected from guided.glsl +// End of source code injected from ../LQ/nlmeans.glsl //!HOOK LUMA //!HOOK CHROMA @@ -328,25 +1150,11 @@ vec4 hook() return RF_LUMA_texOff(0); } -//!HOOK LUMA -//!HOOK CHROMA -//!BIND LUMA -//!WIDTH LUMA.w 3 / -//!HEIGHT LUMA.h 3 / -//!DESC Non-local means (EP) -//!SAVE EP - -vec4 hook() -{ - return LUMA_texOff(0); -} - //!HOOK LUMA //!HOOK CHROMA //!BIND HOOKED //!BIND RF_LUMA //!BIND RF -//!BIND EP //!BIND PREV1 //!BIND PREV2 //!DESC Non-local means (nlmeans_temporal.glsl) @@ -358,49 +1166,37 @@ vec4 hook() // Denoising factor (level of blur, higher means more blur) #ifdef LUMA_raw -#define S 2.0 +#define S 2.0522687499802097 #else -#define S 5.0 +#define S 2.5168955531436197 #endif /* Adaptive sharpening * - * Uses the blur incurred by denoising to perform an unsharp mask, and uses the - * weight map to restrict 
the sharpening to edges. - * - * If you just want to increase/decrease sharpness then you want to change ASF. + * Performs an unsharp mask by subtracting the spatial kernel's blur from the + * NLM blur. For sharpen+denoise the sharpening is limited to edge areas and + * denoising is done everywhere else. * * Use V=4 to visualize which areas are sharpened (black means sharpen). * * AS: - * - 0 to disable - * - 1 to sharpen+denoise - * - 2 to sharpen only + * - 0: disable + * - 1: sharpen+denoise + * - 2: sharpen only * ASF: Higher numbers make a sharper image - * ASP: Higher numbers use more of the sharp image - * ASW: - * - 0 to use pre-WD weights - * - 1 to use post-WD weights (ASP should be ~2x to compensate) - * ASK: Weight kernel: - * - 0 for power. This is the old method. - * - 1 for sigmoid. This is generally recommended. - * - 2 for constant (non-adaptive, w/ ASP=0 this sharpens the entire image) - * ASC (only for ASK=1, range 0-1): Reduces the contrast of the edge map + * ASA: Anti-ringing, higher numbers increase strength + * ASP: Power, lower numbers increase sharpening on lower frequency detail */ #ifdef LUMA_raw #define AS 0 -#define ASF 3.0 -#define ASP 1.0 -#define ASW 0 -#define ASK 1 -#define ASC 0.0 +#define ASF 0.1625 +#define ASA 5.0 +#define ASP 0.5 #else #define AS 0 -#define ASF 3.0 -#define ASP 1.0 -#define ASW 0 -#define ASK 1 -#define ASC 0.0 +#define ASF 0.1625 +#define ASA 5.0 +#define ASP 0.5 #endif /* Starting weight @@ -411,52 +1207,57 @@ vec4 hook() * EPSILON should be used instead of zero to avoid divide-by-zero errors. */ #ifdef LUMA_raw -#define SW 1.0 +#define SW 1.3011446081346498 #else -#define SW 0.5 +#define SW 1.2219854377433914 #endif /* Weight discard * - * Discard weights that fall below a fraction of the average weight. This culls - * the most dissimilar samples from the blur, yielding a much more pleasant - * result, especially around edges. + * Reduces weights that fall below a fraction of the average weight. 
This culls + * the most dissimilar samples from the blur, which can yield a better result, + * especially around edges. * * WD: - * - 2: True average. Better quality, but slower and requires GLSL 4.0 or later - * - 1: Moving cumulative average. Inaccurate, tends to blur directionally. + * - 2: Mean. Better quality, but slower and requires GLSL 4.0 or later + * - 1: Moving cumulative average. Fast but inaccurate, blurs directionally. * - 0: Disable * * WDT: Threshold coefficient, higher numbers discard more * WDP (only for WD=1): Increasing reduces the threshold for small sample sizes + * WDS (not for WDK=is_zero): Higher numbers are more eager to reduce weights */ #ifdef LUMA_raw #define WD 1 -#define WDT 0.5 -#define WDP 6.0 +#define WDT 0.11671341022864548 +#define WDP 5.381278367349288 +#define WDS 1.0 #else -#define WD 2 -#define WDT 0.75 -#define WDP 6.0 +#define WD 0 +#define WDT 0.002713346103131793 +#define WDP 5.832936323930807 +#define WDS 1.0 #endif /* Extremes preserve * - * Reduces denoising around very bright/dark areas. + * Reduce denoising in very bright/dark areas. + * + * Disabled by default now. If you want to reenable this, set EP=3/ in + * Makefile.nlm and rebuild. * * The downscaling factor of the EP shader stage affects what is considered a - * bright/dark area. The default of 3 should be fine, it's not recommended to - * change this. + * bright/dark area. * * This is incompatible with RGB. If you have RGB hooks enabled then you will * have to delete the EP shader stage or specify EP=0 through shader_cfg. 
* * EP: 1 to enable, 0 to disable - * DP: EP strength on dark patches, 0 to fully denoise - * BP: EP strength on bright patches, 0 to fully denoise + * DP: EP strength on dark areas, 0 to fully denoise + * BP: EP strength on bright areas, 0 to fully denoise */ #ifdef LUMA_raw -#define EP 1 +#define EP 0 #define BP 0.75 #define DP 0.25 #else @@ -471,12 +1272,26 @@ vec4 hook() /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ /* ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS * ADVANCED OPTIONS */ +/* textureGather applicable configurations: + * + * - PS={0,3,7,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2} + * - PS={0,8}:P=3:PST=0:RI={0,1,3,7}:RFI={0,1,2} + * - PS=6:RI=0:RFI=0 + * - Currently the only scalable variant + * + * Options which always disable textureGather: + * - NG + * - SAMPLE + * - PD + * + * Running without textureGather may be much slower. + */ + /* Patch & research sizes * - * Patch size should be an odd number greater than or equal to 3. Higher values - * are slower and not always better. + * P should be an odd number. Higher values are slower and not always better. * - * Research size be an odd number greater than or equal to 3. Higher values are + * R should be an odd number greater than or equal to 3. Higher values are * generally better, but slower, blurrier, and gives diminishing returns. */ #ifdef LUMA_raw @@ -494,8 +1309,6 @@ vec4 hook() * * PS applies applies to patches, RS applies to research zones. 
* - * Be wary of gather optimizations (see the Regarding Speed comment at the top) - * * 0: square (symmetrical) * 1: horizontal line (asymmetric) * 2: vertical line (asymmetric) @@ -504,6 +1317,7 @@ vec4 hook() * 5: truncated triangle (asymmetric on two axis, last row halved) * 6: even sized square (asymmetric on two axis) * 7: plus (symmetrical) + * 8: plus X (symmetrical) */ #ifdef LUMA_raw #define RS 3 @@ -518,8 +1332,8 @@ vec4 hook() * This setting is dependent on code generation from shader_cfg, so this * setting can only be enabled via shader_cfg. * - * Compares the pixel-of-interest against a guide, which could be a downscaled - * image or the output of another shader + * Computes weights on a guide, which could be a downscaled image or the output + * of another shader, and applies the weights to the original image */ #define RF_LUMA 1 #define RF 1 @@ -533,6 +1347,9 @@ vec4 hook() * The angle in degrees of each rotation is 360/(RI+1), so RI=1 will do a * single 180 degree rotation, RI=3 will do three 90 degree rotations, etc. * + * Consider setting SAMPLE=1 if setting RI to a setting that would require + * sampling between pixels. 
+ * * RI: Rotational invariance * RFI (0 to 2): Reflectional invariance */ @@ -600,14 +1417,14 @@ vec4 hook() */ #ifdef LUMA_raw #define SST 1 -#define SS 0.25 +#define SS 0.5296176863733414 #define SD vec3(1,1,1) #define PST 0 #define PSS 0.0 #define PSD vec2(1,1) #else #define SST 1 -#define SS 0.25 +#define SS 0.26295970436981203 #define SD vec3(1,1,1) #define PST 0 #define PSS 0.0 @@ -619,6 +1436,8 @@ vec4 hook() * SK: spatial kernel * RK: range kernel (takes patch differences) * PSK: intra-patch spatial kernel + * WDK: weight discard kernel + * WD1TK (WD=1 only): weight discard tolerance kernel * * List of available kernels: * @@ -626,18 +1445,51 @@ vec4 hook() * cos * gaussian * lanczos - * quadratic + * quadratic_ (unclamped) * sinc + * sinc_ (unclamped) + * sinc3 * sphinx + * sphinx_ (unclamped) + * triangle_ (unclamped) + * triangle */ #ifdef LUMA_raw #define SK gaussian #define RK gaussian #define PSK gaussian +#define WDK is_zero +#define WD1TK gaussian #else #define SK gaussian #define RK gaussian #define PSK gaussian +#define WDK is_zero +#define WD1TK gaussian +#endif + +/* Sampling method + * + * In most cases this shouldn't make any difference, only set to bilinear if + * it's necessary to sample between pixels (e.g., RI=2). + * + * 0: nearest neighbor + * 1: bilinear + */ +#ifdef LUMA_raw +#define SAMPLE 0 +#else +#define SAMPLE 0 +#endif + +/* Research scaling factor + * + * Higher numbers sample more sparsely as the distance from the POI grows. 
+ */ +#ifdef LUMA_raw +#define RSF 0.0 +#else +#define RSF 0.0 #endif // Scaling factor (should match WIDTH/HEIGHT) @@ -647,13 +1499,22 @@ vec4 hook() #define SF 1 #endif +// Use the guide image as the input image +#ifdef LUMA_raw +#define GUIDE_INPUT 0 +#else +#define GUIDE_INPUT 0 +#endif + /* Visualization * * 0: off * 1: absolute difference between input/output to the power of 0.25 * 2: difference between input/output centered on 0.5 - * 3: avg_weight - * 4: edge map (based on the relevant AS settings) + * 3: post-WD weight map + * 4: pre-WD weight map + * 5: unsharp mask + * 6: EP */ #ifdef LUMA_raw #define V 0 @@ -698,37 +1559,44 @@ vec4 hook() // Shader code -#define EPSILON 0.00000000001 +#define EPSILON 1.2e-38 #define M_PI 3.14159265358979323846 #define POW2(x) ((x)*(x)) #define POW3(x) ((x)*(x)*(x)) -#define bicubic(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0)))) +#define bicubic_(x) ((1.0/6.0) * (POW3((x)+2) - 4 * POW3((x)+1) + 6 * POW3(x) - 4 * POW3(max((x)-1, 0)))) +#define bicubic(x) bicubic_(clamp((x), 0.0, 2.0)) #define gaussian(x) exp(-1 * POW2(x)) -#define lanczos(x) POW2(sinc(x)) -#define quadratic(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5)) -#define sinc(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI)) -#define sphinx(x) ((x) < 1e-8 ? 1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI)) +#define quadratic_(x) ((x) < 0.5 ? 0.75 - POW2(x) : 0.5 * POW2((x) - 1.5)) +#define quadratic(x) quadratic_(clamp((x), 0.0, 1.5)) +#define sinc_(x) ((x) < 1e-8 ? 1.0 : sin((x)*M_PI) / ((x)*M_PI)) +#define sinc(x) sinc_(clamp((x), 0.0, 1.0)) +#define sinc3(x) sinc_(clamp((x), 0.0, 3.0)) +#define lanczos(x) (sinc3(x) * sinc(x)) +#define sphinx_(x) ((x) < 1e-8 ? 
1.0 : 3.0 * (sin((x)*M_PI) - (x)*M_PI * cos((x)*M_PI)) / POW3((x)*M_PI)) +#define sphinx(x) sphinx_(clamp((x), 0.0, 1.4302966531242027)) +#define triangle_(x) (1 - (x)) +#define triangle(x) triangle_(clamp((x), 0.0, 1.0)) +#define is_zero(x) int(x == 0) // XXX could maybe be better optimized on LGC -// XXX return original alpha component instead of 1.0 #if defined(LUMA_raw) #define val float #define val_swizz(v) (v.x) -#define unval(v) vec4(v.x, 0, 0, 1.0) +#define unval(v) vec4(v.x, 0, 0, poi_.a) #define val_packed val #define val_pack(v) (v) #define val_unpack(v) (v) #elif defined(CHROMA_raw) #define val vec2 #define val_swizz(v) (v.xy) -#define unval(v) vec4(v.x, v.y, 0, 1.0) +#define unval(v) vec4(v.x, v.y, 0, poi_.a) #define val_packed uint #define val_pack(v) packUnorm2x16(v) #define val_unpack(v) unpackUnorm2x16(v) #else #define val vec3 #define val_swizz(v) (v.xyz) -#define unval(v) vec4(v.x, v.y, v.z, 1.0) +#define unval(v) vec4(v.x, v.y, v.z, poi_.a) #define val_packed val #define val_pack(v) (v) #define val_unpack(v) (v) @@ -746,10 +1614,6 @@ const int hr = R/2; const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even research sizes #endif -// donut increment, increments without landing on (0,0,0) -// much faster than a continue statement -#define DINCR(z,c) (z.c++,(z.c += int(z == vec3(0)))) - // patch/research shapes // each shape is depicted in a comment, where Z=5 (Z corresponds to P or R) // dots (.) represent samples (pixels) and X represents the pixel-of-interest @@ -790,7 +1654,7 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res // // Z ..X.. 
// -#define S_HORIZONTAL(z,hz,incr) for (z.x = -hz; z.x <= hz; incr) for (z.y = 0; z.y <= 0; z.y++) +#define S_HORIZONTAL(z,hz,incr) for (z.y = 0; z.y <= 0; z.y++) for (z.x = -hz; z.x <= hz; incr) // 90 degree rotation of S_HORIZONTAL #define S_VERTICAL(z,hz,incr) for (z.x = 0; z.x <= 0; z.x++) for (z.y = -hz; z.y <= hz; incr) @@ -803,19 +1667,13 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res #define S_PLUS(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -hz * int(z.x == 0); z.y <= hz * int(z.x == 0); incr) #define S_PLUS_A(hz,Z) (Z*2 - 1) -// XXX implement S_PLUS w/ an X overlayed: // 3 . . . // 3 ... // Z ..X.. // 3 ... // 3 . . . - -// XXX implement an X shape: -// 2 . . -// 2 . . -// 1 X -// 2 . . -// 2 . . +#define S_PLUS_X(z,hz,incr) for (z.x = -hz; z.x <= hz; z.x++) for (z.y = -abs(z.x) + -hz * int(z.x == 0); z.y <= abs(z.x) + hz * int(z.x == 0); incr) +#define S_PLUS_X_A(hz,Z) (Z*4 - 3) // 1x1 square #define S_1X1(z) for (z = vec3(0); z.x <= 0; z.x++) @@ -829,43 +1687,43 @@ const float hr = int(R/2) - 0.5*(1-(R%2)); // sample between pixels for even res #define RF_ RF #endif -// Skip comparing the pixel-of-interest against itself, unless RF is enabled -#if RF_ -#define RINCR(z,c) (z.c++) -#else -#define RINCR DINCR -#endif +// donut increment, increments without landing on (0,0,0) +// much faster than a continue statement +#define DINCR(z,c,a) ((z.c += a),(z.c += int(z == vec3(0)))) -#define R_AREA(a) (a * T1 + RF_-1) +#define R_AREA(a) (a * T1 - 1) // research shapes // XXX would be nice to have the option of temporally-varying research sizes #if R == 0 || R == 1 #define FOR_RESEARCH(r) S_1X1(r) const int r_area = R_AREA(1); +#elif RS == 8 +#define FOR_RESEARCH(r) S_PLUS_X(r,hr,DINCR(r,y,max(1,abs(r.x)))) +const int r_area = R_AREA(S_PLUS_X_A(hr,R)); #elif RS == 7 -#define FOR_RESEARCH(r) S_PLUS(r,hr,RINCR(r,y)) +#define FOR_RESEARCH(r) S_PLUS(r,hr,DINCR(r,y,1)) const int r_area = R_AREA(S_PLUS_A(hr,R)); #elif 
RS == 6 -#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,RINCR(r,y)) +#define FOR_RESEARCH(r) S_SQUARE_EVEN(r,hr,DINCR(r,y,1)) const int r_area = R_AREA(R*R); #elif RS == 5 -#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,RINCR(r,x)) +#define FOR_RESEARCH(r) S_TRUNC_TRIANGLE(r,hr,DINCR(r,x,1)) const int r_area = R_AREA(S_TRIANGLE_A(hr,hr)); #elif RS == 4 -#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,RINCR(r,x)) +#define FOR_RESEARCH(r) S_TRIANGLE(r,hr,DINCR(r,x,1)) const int r_area = R_AREA(S_TRIANGLE_A(hr,R)); #elif RS == 3 -#define FOR_RESEARCH(r) S_DIAMOND(r,hr,RINCR(r,y)) +#define FOR_RESEARCH(r) S_DIAMOND(r,hr,DINCR(r,y,1)) const int r_area = R_AREA(S_DIAMOND_A(hr,R)); #elif RS == 2 -#define FOR_RESEARCH(r) S_VERTICAL(r,hr,RINCR(r,y)) +#define FOR_RESEARCH(r) S_VERTICAL(r,hr,DINCR(r,y,1)) const int r_area = R_AREA(R); #elif RS == 1 -#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,RINCR(r,x)) +#define FOR_RESEARCH(r) S_HORIZONTAL(r,hr,DINCR(r,x,1)) const int r_area = R_AREA(R); #elif RS == 0 -#define FOR_RESEARCH(r) S_SQUARE(r,hr,RINCR(r,y)) +#define FOR_RESEARCH(r) S_SQUARE(r,hr,DINCR(r,y,1)) const int r_area = R_AREA(R*R); #endif @@ -887,7 +1745,7 @@ const int r_area = R_AREA(R*R); #if PD #define PINCR DINCR #else -#define PINCR(z,c) (z.c++) +#define PINCR(z,c,a) (z.c += a) #endif #define P_AREA(a) (a - PD) @@ -896,36 +1754,44 @@ const int r_area = R_AREA(R*R); #if P == 0 || P == 1 #define FOR_PATCH(p) S_1X1(p) const int p_area = P_AREA(1); +#elif PS == 8 +#define FOR_PATCH(p) S_PLUS_X(p,hp,PINCR(p,y,max(1,abs(p.x)))) +const int p_area = P_AREA(S_PLUS_X_A(hp,P)); #elif PS == 7 -#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y)) +#define FOR_PATCH(p) S_PLUS(p,hp,PINCR(p,y,1)) const int p_area = P_AREA(S_PLUS_A(hp,P)); #elif PS == 6 -#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y)) +#define FOR_PATCH(p) S_SQUARE_EVEN(p,hp,PINCR(p,y,1)) const int p_area = P_AREA(P*P); #elif PS == 5 -#define FOR_PATCH(p) S_TRUNC_TRIANGLE(p,hp,PINCR(p,x)) +#define FOR_PATCH(p) 
S_TRUNC_TRIANGLE(p,hp,PINCR(p,x,1)) const int p_area = P_AREA(S_TRIANGLE_A(hp,hp)); #elif PS == 4 -#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x)) +#define FOR_PATCH(p) S_TRIANGLE(p,hp,PINCR(p,x,1)) const int p_area = P_AREA(S_TRIANGLE_A(hp,P)); #elif PS == 3 -#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y)) +#define FOR_PATCH(p) S_DIAMOND(p,hp,PINCR(p,y,1)) const int p_area = P_AREA(S_DIAMOND_A(hp,P)); #elif PS == 2 -#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y)) +#define FOR_PATCH(p) S_VERTICAL(p,hp,PINCR(p,y,1)) const int p_area = P_AREA(P); #elif PS == 1 -#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x)) +#define FOR_PATCH(p) S_HORIZONTAL(p,hp,PINCR(p,x,1)) const int p_area = P_AREA(P); #elif PS == 0 -#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y)) +#define FOR_PATCH(p) S_SQUARE(p,hp,PINCR(p,y,1)) const int p_area = P_AREA(P*P); #endif const float r_scale = 1.0/r_area; const float p_scale = 1.0/p_area; -#define sample(tex, pos, size, pt, off) tex(pos + pt * (vec2(off) + 0.5 - fract(pos*size))) +#if SAMPLE == 0 +#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * (vec2(off) + 0.5 - fract((pos) * (size)))) +#else +#define sample(tex, pos, size, pt, off) tex((pos) + (pt) * vec2(off)) +#endif + #define load_(off) sample(HOOKED_tex, HOOKED_pos, HOOKED_size, HOOKED_pt, off) #if RF_ && defined(LUMA_raw) @@ -962,8 +1828,13 @@ val load2(vec3 off) #define load2(off) val_swizz(load2_(off)) #endif -val poi = load(vec3(0)); // pixel-of-interest val poi2 = load2(vec3(0)); // guide pixel-of-interest +#if GUIDE_INPUT +#define poi poi2 +#else +vec4 poi_ = load_(vec3(0)); +val poi = val_swizz(poi_); // pixel-of-interest +#endif #if RI // rotation vec2 rot(vec2 p, float d) @@ -1008,7 +1879,7 @@ float spatial_r(vec3 v) val range(val pdiff_sq) { - const float h = S*0.013; + const float h = max(S, 0.0) * 0.013; const float pdiff_scale = 1.0/(h*h); pdiff_sq = sqrt(pdiff_sq * pdiff_scale); #if defined(LUMA_raw) @@ -1018,10 +1889,6 @@ val range(val pdiff_sq) #else return 
vec3(RK(pdiff_sq.x), RK(pdiff_sq.y), RK(pdiff_sq.z)); #endif - //return exp(-pdiff_sq * pdiff_scale); - - // weight function from the NLM paper, it's not very good - //return exp(-max(pdiff_sq - 2*S*S, 0.0) * pdiff_scale); } val patch_comparison(vec3 r, vec3 r2) @@ -1044,42 +1911,104 @@ val patch_comparison(vec3 r, vec3 r2) return min_rot * p_scale; } -#define NO_GATHER (PD == 0 && NG == 0) // never textureGather if any of these conditions are false -#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3) +#define NO_GATHER (PD == 0 && NG == 0 && SAMPLE == 0) // never textureGather if any of these conditions are false +#define REGULAR_ROTATIONS (RI == 0 || RI == 1 || RI == 3 || RI == 7) -#if (defined(LUMA_gather) || D1W) && ((PS == 3 || PS == 7) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER +#if (defined(LUMA_gather) || D1W) && ((PS == 0 || ((PS == 3 || PS == 7) && RI != 7) || PS == 8) && P == 3) && PST == 0 && REGULAR_ROTATIONS && NO_GATHER // 3x3 diamond/plus patch_comparison_gather // XXX extend to support arbitrary sizes (probably requires code generation) -// XXX extend to support 3x3 square // XXX support PSS -const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,1), ivec2(1,0) }; -const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,1) * SF, ivec2(1,0) * SF }; -vec4 poi_patch = gather_offs(0, offsets); +const ivec2 offsets_adj[4] = { ivec2(0,-1), ivec2(1,0), ivec2(0,1), ivec2(-1,0) }; +const ivec2 offsets_adj_sf[4] = { ivec2(0,-1) * SF, ivec2(1,0) * SF, ivec2(0,1) * SF, ivec2(-1,0) * SF }; +vec4 poi_patch_adj = gather_offs(0, offsets_adj); +#if PS == 0 || PS == 8 +const ivec2 offsets_diag[4] = { ivec2(-1,-1), ivec2(1,-1), ivec2(1,1), ivec2(-1,1) }; +const ivec2 offsets_diag_sf[4] = { ivec2(-1,-1) * SF, ivec2(1,-1) * SF, ivec2(1,1) * SF, ivec2(-1,1) * SF }; +vec4 poi_patch_diag = gather_offs(0, offsets_diag); +#endif float patch_comparison_gather(vec3 r, vec3 r2) { float min_rot = p_area - 1; - vec4 transformer = 
gather_offs(r, offsets_sf); + vec4 transformer_adj = gather_offs(r, offsets_adj_sf); +#if PS == 0 || PS == 8 + vec4 transformer_diag = gather_offs(r, offsets_diag_sf); +#endif FOR_ROTATION { FOR_REFLECTION { - float diff_sq = dot((poi_patch - transformer) * (poi_patch - transformer), vec4(1)); +#if RFI + /* xxy + * w y + * wzz + */ + switch(rfi) { + case 1: + transformer_adj = transformer_adj.zyxw; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.zyxw; +#endif + break; + case 2: + transformer_adj = transformer_adj.xwzy; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.xwzy; +#endif + break; + } +#endif + + vec4 diff = POW2(poi_patch_adj - transformer_adj); /* squared per offset, so adjacent and diagonal differences cannot cancel each other */ +#if PS == 0 || PS == 8 + diff += POW2(poi_patch_diag - transformer_diag); +#endif + float diff_sq = dot(diff, vec4(1)); /* sum of squared differences over the gathered offsets */ min_rot = min(diff_sq, min_rot); + +// un-reflect #if RFI switch(rfi) { - case 0: transformer = transformer.zyxw; break; - case 1: transformer = transformer.zwxy; break; // undoes last mirror, performs another mirror - case 2: transformer = transformer.zyxw; break; // undoes last mirror + case 1: + transformer_adj = transformer_adj.zyxw; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.zyxw; +#endif + break; + case 2: + transformer_adj = transformer_adj.xwzy; +#if PS == 0 || PS == 8 + transformer_diag = transformer_diag.xwzy; +#endif + break; } #endif - } -#if RI == 3 - transformer = transformer.wxyz; + } // FOR_REFLECTION +#if RI == 7 + transformer_adj = transformer_adj.wxyz; + // swap adjacents for diagonals + transformer_adj += transformer_diag; + transformer_diag = transformer_adj - transformer_diag; + transformer_adj -= transformer_diag; +#elif RI == 3 + transformer_adj = transformer_adj.wxyz; #elif RI == 1 - transformer = transformer.zwxy; + transformer_adj = transformer_adj.zwxy; #endif - } - float center_diff_sq = poi2.x - load2(r).x; - center_diff_sq *= center_diff_sq; - return (min_rot + center_diff_sq) * p_scale; +#if RI == 3 && (PS == 0 || PS == 8) 
+ transformer_diag = transformer_diag.wxyz; +#elif RI == 1 && (PS == 0 || PS == 8) + transformer_diag = transformer_diag.zwxy; +#endif + } // FOR_ROTATION + float center_diff = poi2.x - load2(r).x; + return (center_diff * center_diff + min_rot) * p_scale; +} +#elif (defined(LUMA_gather) || D1W) && PS == 4 && P == 3 && RI == 0 && RFI == 0 && NO_GATHER +const ivec2 offsets[4] = { ivec2(0,-1), ivec2(-1,0), ivec2(0,0), ivec2(1,0) }; +const ivec2 offsets_sf[4] = { ivec2(0,-1) * SF, ivec2(-1,0) * SF, ivec2(0,0) * SF, ivec2(1,0) * SF }; +vec4 poi_patch = gather_offs(0, offsets); +float patch_comparison_gather(vec3 r, vec3 r2) +{ + vec4 pdiff = poi_patch - gather_offs(r, offsets_sf); + return dot(pdiff * pdiff, vec4(1)) * p_scale; } #elif (defined(LUMA_gather) || D1W) && PS == 6 && RI == 0 && RFI == 0 && NO_GATHER // tiled even square patch_comparison_gather @@ -1127,18 +2056,23 @@ vec4 hook() float me_weight = 0; #endif -#if WD == 2 // weight discard +#if AS + val total_weight_s = val(0); + val sum_s = val(0); +#endif + +#if WD == 2 // weight discard (mean) int r_index = 0; val_packed all_weights[r_area]; val_packed all_pixels[r_area]; -#elif WD == 1 // weight discard - val no_weights = val(0); - val discard_total_weight = val(0); - val discard_sum = val(0); +#elif WD == 1 // weight discard (moving cumulative average) + int r_iter = 1; + val wd_total_weight = val(0); + val wd_sum = val(0); #endif FOR_FRAME(r) { - // XXX ME is always a frame behind, should have to option to re-research after applying ME (could do it an arbitrary number of times per frame if desired) + // XXX ME is always a frame behind, should have the option to re-research after applying ME (could do it an arbitrary number of times per frame if desired) #if T && ME == 1 // temporal & motion estimation max weight if (r.z > 0) { me += me_tmp * MEF; @@ -1152,19 +2086,26 @@ vec4 hook() me_weight = 0; } #endif - FOR_RESEARCH(r) { // main NLM logic + FOR_RESEARCH(r) { + // r coords with appropriate 
transformations applied + vec3 tr = vec3(r.xy + floor(r.xy * RSF), r.z); + float spatial_weight = spatial_r(tr); + tr.xy += me.xy; + + val px = load(tr); + #if SKIP_PATCH val weight = val(1); #else - val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(r+me, vec3(0))) : patch_comparison(r+me, vec3(0)); + val pdiff_sq = (r.z == 0) ? val(patch_comparison_gather(tr, vec3(0))) : patch_comparison(tr, vec3(0)); val weight = range(pdiff_sq); #endif #if T && ME == 1 // temporal & motion estimation max weight - me_tmp = vec3(r.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x)); + me_tmp = vec3(tr.xy,0) * step(maxweight, weight.x) + me_tmp * (1 - step(maxweight, weight.x)); maxweight = max(maxweight, weight.x); #elif T && ME == 2 // temporal & motion estimation weighted average - me_sum += vec3(r.xy,0) * weight.x; + me_sum += vec3(tr.xy,0) * weight.x; me_weight += weight.x; #endif @@ -1172,21 +2113,34 @@ vec4 hook() weight = val(weight.x); #endif - weight *= spatial_r(r); + weight *= spatial_weight; -#if WD == 2 // weight discard +#if AS + spatial_weight *= int(r.z == 0); // ignore temporal + sum_s += px * spatial_weight; + total_weight_s += spatial_weight; +#endif + +#if WD == 2 // weight discard (mean) all_weights[r_index] = val_pack(weight); - all_pixels[r_index] = val_pack(load(r+me)); + all_pixels[r_index] = val_pack(px); r_index++; -#elif WD == 1 // weight discard - val wd_scale = 1.0/max(no_weights, 1); - val keeps = step(total_weight*wd_scale * WDT*exp(-wd_scale*WDP), weight); - discard_sum += load(r+me) * weight * (1 - keeps); - discard_total_weight += weight * (1 - keeps); - no_weights += keeps; +#elif WD == 1 // weight discard (moving cumulative average) + val wd_scale = val(1.0/r_iter); + val below_threshold = WDS * abs(min(val(0.0), weight - (total_weight * wd_scale * WDT * WD1TK(sqrt(wd_scale*WDP))))); +#if defined(LUMA_raw) + val wdkf = WDK(below_threshold); +#elif defined(CHROMA_raw) + val wdkf = vec2(WDK(below_threshold.x), 
WDK(below_threshold.y)); +#else + val wdkf = vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); /* each channel is thresholded by its own component */ +#endif + wd_sum += px * weight * wdkf; + wd_total_weight += weight * wdkf; + r_iter++; #endif - sum += load(r+me) * weight; + sum += px * weight; total_weight += weight; } // FOR_RESEARCH } // FOR_FRAME @@ -1194,37 +2148,37 @@ vec4 hook() val avg_weight = total_weight * r_scale; val old_avg_weight = avg_weight; -#if WD == 2 // true average +#if WD == 2 // weight discard (mean) total_weight = val(0); sum = val(0); - val no_weights = val(0); for (int i = 0; i < r_area; i++) { - val w = val_unpack(all_weights[i]); + val weight = val_unpack(all_weights[i]); val px = val_unpack(all_pixels[i]); - val keeps = step(avg_weight*WDT, w); - w *= keeps; - sum += px * w; - total_weight += w; - no_weights += keeps; + val below_threshold = WDS * abs(min(val(0.0), weight - (avg_weight * WDT))); +#if defined(LUMA_raw) + weight *= WDK(below_threshold); +#elif defined(CHROMA_raw) + weight *= vec2(WDK(below_threshold.x), WDK(below_threshold.y)); +#else + weight *= vec3(WDK(below_threshold.x), WDK(below_threshold.y), WDK(below_threshold.z)); +#endif + + sum += px * weight; + total_weight += weight; } -#elif WD == 1 // moving cumulative average - total_weight -= discard_total_weight; - sum -= discard_sum; +#elif WD == 1 // weight discard (moving cumulative average) + total_weight = wd_total_weight; + sum = wd_sum; #endif #if WD // weight discard - avg_weight = total_weight / no_weights; + avg_weight = total_weight * r_scale; #endif total_weight += SW * spatial_r(vec3(0)); sum += poi * SW * spatial_r(vec3(0)); - -#if V == 3 // weight map - result = val(avg_weight); -#else // mean result = val(sum / total_weight); -#endif // store frames for temporal #if T > 1 @@ -1236,27 +2190,17 @@ vec4 hook() imageStore(PREV1, ivec2(HOOKED_pos*imageSize(PREV1)), unval(poi2)); #endif -#if ASW == 0 // pre-WD weights -#define AS_weight old_avg_weight -#elif ASW == 1 // post-WD weights 
-#define AS_weight avg_weight -#endif - -#if ASK == 0 - val sharpening_strength = pow(AS_weight, val(ASP)); -#elif ASK == 1 - val sharpening_strength = mix( - pow(smoothstep(0.0, 1.0, AS_weight), val(ASP)), - AS_weight, ASC); - // XXX normalize the result to account for a negative ASC? -#elif ASK == 2 - val sharpening_strength = val(ASP); -#endif - #if AS == 1 // sharpen+denoise - val sharpened = result + (poi - result) * ASF; +#define AS_base result #elif AS == 2 // sharpen only - val sharpened = poi + (poi - result) * ASF; +#define AS_base poi +#endif +#if AS + val usm = result - sum_s/total_weight_s; + usm = exp(log(abs(usm))*ASP) * sign(usm); // avoiding pow() since it's buggy on nvidia + usm *= gaussian(abs((AS_base + usm - 0.5) / 1.5) * ASA); + usm *= ASF; + result = AS_base + usm; #endif #if EP // extremes preserve @@ -1264,26 +2208,27 @@ vec4 hook() // EPSILON is needed since pow(0,0) is undefined float ep_weight = pow(max(min(1-luminance, luminance)*2, EPSILON), (luminance < 0.5 ? 
DP : BP)); result = mix(poi, result, ep_weight); -#endif - -#if AS == 1 // sharpen+denoise - result = mix(sharpened, result, sharpening_strength); -#elif AS == 2 // sharpen only - result = mix(sharpened, poi, sharpening_strength); -#endif - -#if V == 4 // edge map - result = sharpening_strength; -#endif - -#if (V == 3 || V == 4) && defined(CHROMA_raw) // drop chroma for these visualizations - return vec4(0.5); +#else + float ep_weight = 0; #endif #if V == 1 result = clamp(pow(abs(poi - result), val(0.25)), 0.0, 1.0); #elif V == 2 result = (poi - result) * 0.5 + 0.5; +#elif V == 3 // post-WD weight map + result = avg_weight; +#elif V == 4 // pre-WD weight map + result = old_avg_weight; +#elif V == 5 // unsharp mask (usm is only declared when AS is enabled) + result = 0.5 + usm; +#elif V == 6 // EP: inverse of the extremes-preserve weight + result = val(1 - ep_weight); +#endif + +// XXX visualize chroma for these +#if defined(CHROMA_raw) && (V == 3 || V == 4 || V == 6) + return vec4(0.5); #endif return unval(mix(poi, result, BF)); diff --git a/portable_config/vs/SR_ESRGAN_DML.vpy b/portable_config/vs/SR_ESRGAN_DML.vpy index 3cf78190..7dbb75c3 100644 --- a/portable_config/vs/SR_ESRGAN_DML.vpy +++ b/portable_config/vs/SR_ESRGAN_DML.vpy @@ -13,14 +13,14 @@ clip = video_in H_Pre = 720 Lt_Hd = False -Model = 5000 +Model = 5005 Gpu = 0 Gpu_T = 2 H_Max = 1440 Lk_Fmt = False ## 整数,预降低处理源高度 ## 是否对超过HD分辨率(720P)的源进行处理 -## <0|2|5000|5001|5002|5003|5004> 使用的模型 +## <0|2|5005|5006|5007> 使用的模型 ## 使用的显卡序号,0为排序一号 ## <1|2|3> 使用的显卡线程数 ## 整数,输出高度限制(填你的显示器高度) diff --git a/portable_config/vs/SR_ESRGAN_NV.vpy b/portable_config/vs/SR_ESRGAN_NV.vpy index da98aaf2..3ac10042 100644 --- a/portable_config/vs/SR_ESRGAN_NV.vpy +++ b/portable_config/vs/SR_ESRGAN_NV.vpy @@ -13,7 +13,7 @@ clip = video_in H_Pre = 720 Lt_Hd = False -Model = 5000 +Model = 5005 Gpu = 0 Gpu_T = 2 St_Eng = False @@ -22,7 +22,7 @@ H_Max = 1440 Lk_Fmt = False ## 整数,预降低处理源高度 ## 是否对超过HD分辨率(720P)的源进行处理 -## <0|2|5000|5001|5002|5003|5004> 使用的模型 +## <0|2|5005|5006|5007> 使用的模型 ## 使用的显卡序号,0为排序一号 ## <1|2|3> 使用的显卡线程数 ## 
是否使用静态引擎(需要对不同分辨率的源各进行预处理);动态引擎自适应不同分辨率(64²→DCI2K)