diff --git a/DagorEngine.rev.txt b/DagorEngine.rev.txt index abe0179be..79f0ff72c 100644 --- a/DagorEngine.rev.txt +++ b/DagorEngine.rev.txt @@ -1 +1 @@ -734cc7c4ec4235531e627319981a91c135d4319f +7c0d114dca5d924ef032d22d724c67eaea1faf30 diff --git a/prog/1stPartyLibs/daScript/daslib/coroutines.das b/prog/1stPartyLibs/daScript/daslib/coroutines.das index 83744bc17..570f1090a 100644 --- a/prog/1stPartyLibs/daScript/daslib/coroutines.das +++ b/prog/1stPartyLibs/daScript/daslib/coroutines.das @@ -24,6 +24,7 @@ class private YieldFrom : AstCallMacro //! The idea is that coroutine or generator can continuesly yield from another sub-coroutine or generator. def override visit ( prog:ProgramPtr; mod:Module?; var call:smart_ptr ) : ExpressionPtr macro_verify( call.arguments |> length==1,prog,call.at,"expecting yeild_from(iterator)" ) + macro_verify( call.arguments[0]._type!=null,prog,call.at,"expecting iterator" ) macro_verify( call.arguments[0]._type.isIterator,prog,call.at,"expecting iterator" ) let iname = make_unique_private_name("_yield_from_iterator",call.at) return <- qmacro_block <| diff --git a/prog/1stPartyLibs/daScript/examples/test/misc/hello_world.das b/prog/1stPartyLibs/daScript/examples/test/misc/hello_world.das index e115727a6..b3e5e300c 100644 --- a/prog/1stPartyLibs/daScript/examples/test/misc/hello_world.das +++ b/prog/1stPartyLibs/daScript/examples/test/misc/hello_world.das @@ -1,17 +1,11 @@ -require dump_fields - -[|>dump_fields] -struct Foo - a : int - -struct Bar : Foo - b : int +class Foo + static a : int + def static bar ( A : auto ) + a = int(A) + print("set to {a}\n") [export] def main - var a = Foo() - print("a = {a}\n") - var b = Bar() - print("b = {b}\n") + Foo`bar(13.1) + -options log diff --git a/prog/1stPartyLibs/daScript/include/daScript/ast/ast.h b/prog/1stPartyLibs/daScript/include/daScript/ast/ast.h index 75ba2bfc5..a82e5117c 100644 --- a/prog/1stPartyLibs/daScript/include/daScript/ast/ast.h +++ 
b/prog/1stPartyLibs/daScript/include/daScript/ast/ast.h @@ -1308,6 +1308,7 @@ namespace das bool aot_module = false; // this is how AOT tool knows module is module, and not an entry point bool completion = false; // this code is being compiled for 'completion' mode bool export_all = false; // when user compiles, export all (public?) functions + bool serialize_main_module = true; // if false, then we recompile main module each time // error reporting int32_t always_report_candidates_threshold = 6; // always report candidates if there are less than this number // memory @@ -1540,6 +1541,8 @@ namespace das // this one collectes dependencies and compiles with modules ProgramPtr compileDaScript ( const string & fileName, const FileAccessPtr & access, TextWriter & logs, ModuleGroup & libGroup, CodeOfPolicies policies = CodeOfPolicies() ); + ProgramPtr compileDaScriptSerialize ( const string & fileName, const FileAccessPtr & access, + TextWriter & logs, ModuleGroup & libGroup, CodeOfPolicies policies = CodeOfPolicies() ); // collect script prerequisits bool getPrerequisits ( const string & fileName, @@ -1591,6 +1594,8 @@ namespace das bool g_resolve_annotations = true; TextWriter * g_compilerLog = nullptr; int64_t macroTimeTicks = 0; + AstSerializer * serializer_read = nullptr; + AstSerializer * serializer_write = nullptr; DebugAgentInstance g_threadLocalDebugAgent; static DAS_THREAD_LOCAL daScriptEnvironment * bound; static DAS_THREAD_LOCAL daScriptEnvironment * owned; diff --git a/prog/1stPartyLibs/daScript/include/daScript/ast/ast_serializer.h b/prog/1stPartyLibs/daScript/include/daScript/ast/ast_serializer.h index 4b6182da3..8070abadb 100644 --- a/prog/1stPartyLibs/daScript/include/daScript/ast/ast_serializer.h +++ b/prog/1stPartyLibs/daScript/include/daScript/ast/ast_serializer.h @@ -23,8 +23,11 @@ namespace das { Module * thisModule = nullptr; Module * astModule = nullptr; bool writing = false; + bool failed = false; size_t readOffset = 0; vector buffer; + vector 
metadata; + bool seenNewModule = false; // file info clean up vector deleteUponFinish; // these pointers are for builtins (which we don't serialize) and need to be cleaned manually das_hash_set doNotDelete; @@ -53,7 +56,7 @@ namespace das { // fieldRefs tuple contains: fieldptr, module, structname, fieldname vector> fieldRefs; // tracking for shared modules - das_hash_set writingReadyModules; + das_hash_set writingReadyModules; void tag ( const char * name ); void read ( void * data, size_t size ); void write ( const void * data, size_t size ); @@ -113,6 +116,8 @@ namespace das { // Top-level AstSerializer & operator << ( Module & module ); + void serializeProgram ( ProgramPtr program, ModuleGroup & libGroup ); + template void serializeSmartPtr( smart_ptr & obj, das_hash_map> & objMap ); diff --git a/prog/1stPartyLibs/daScript/include/daScript/ast/ast_typedecl.h b/prog/1stPartyLibs/daScript/include/daScript/ast/ast_typedecl.h index 92484beca..c48b43e54 100644 --- a/prog/1stPartyLibs/daScript/include/daScript/ast/ast_typedecl.h +++ b/prog/1stPartyLibs/daScript/include/daScript/ast/ast_typedecl.h @@ -81,6 +81,7 @@ namespace das { __forceinline bool isGoodVariantType() const; __forceinline bool isVoid() const; __forceinline bool isRef() const; + __forceinline bool isAnyType() const; bool isRefType() const; bool isRefOrPointer() const { return isRef() || isPointer(); } bool canWrite() const; @@ -614,7 +615,7 @@ namespace das { auto tt = typeFactory::make(ctx); if (tt->isRefType()) { tt->ref = false; - } else if (!tt->isRef()) { + } else if (!tt->isRef() && !tt->isAnyType()) { // note: // C++ does not differentiate between void foo ( Foo ); and void foo ( const Foo ); // DAS differenciates for pointers @@ -787,4 +788,8 @@ namespace das { __forceinline bool TypeDecl::isConst() const { return constant; } + + __forceinline bool TypeDecl::isAnyType() const { + return baseType==Type::anyArgument; + } } diff --git a/prog/1stPartyLibs/daScript/include/daScript/misc/platform.h 
b/prog/1stPartyLibs/daScript/include/daScript/misc/platform.h index a8e26930a..de7390192 100644 --- a/prog/1stPartyLibs/daScript/include/daScript/misc/platform.h +++ b/prog/1stPartyLibs/daScript/include/daScript/misc/platform.h @@ -211,12 +211,12 @@ __forceinline uint32_t rotr_c(uint32_t a, uint32_t b) { void os_debug_break(); #ifndef DAS_FATAL_LOG -#define DAS_FATAL_LOG printf +#define DAS_FATAL_LOG(...) do { printf(__VA_ARGS__); fflush(stdout); } while(0) #endif #ifndef DAS_FATAL_ERROR #define DAS_FATAL_ERROR(...) { \ - printf(__VA_ARGS__); \ + DAS_FATAL_LOG(__VA_ARGS__); \ assert(0 && "fatal error"); \ exit(-1); \ } @@ -228,8 +228,7 @@ void os_debug_break(); #else #define DAS_ASSERT(cond) { \ if ( !(cond) ) { \ - printf("assertion failed: %s, %s:%d\n", #cond, __FILE__, __LINE__); \ - fflush(stdout); \ + DAS_FATAL_LOG("assertion failed: %s, %s:%d\n", #cond, __FILE__, __LINE__); \ os_debug_break(); \ } \ } @@ -242,9 +241,8 @@ void os_debug_break(); #else #define DAS_ASSERTF(cond,...) { \ if ( !(cond) ) { \ - printf("assertion failed: %s, %s:%d\n", #cond, __FILE__, __LINE__); \ - printf(__VA_ARGS__); \ - fflush(stdout); \ + DAS_FATAL_LOG("assertion failed: %s, %s:%d\n", #cond, __FILE__, __LINE__); \ + DAS_FATAL_LOG(__VA_ARGS__); \ os_debug_break(); \ } \ } @@ -256,16 +254,14 @@ void os_debug_break(); #ifdef NDEBUG #define DAS_VERIFY(cond) { \ if ( !(cond) ) { \ - printf("verify failed: %s, %s:%d\n", #cond, __FILE__, __LINE__); \ - fflush(stdout); \ + DAS_FATAL_LOG("verify failed: %s, %s:%d\n", #cond, __FILE__, __LINE__); \ exit(-1); \ } \ } #else #define DAS_VERIFY(cond) { \ if ( !(cond) ) { \ - printf("verify failed: %s, %s:%d\n", #cond, __FILE__, __LINE__); \ - fflush(stdout); \ + DAS_FATAL_LOG("verify failed: %s, %s:%d\n", #cond, __FILE__, __LINE__); \ os_debug_break(); \ } \ } @@ -276,18 +272,16 @@ void os_debug_break(); #ifdef NDEBUG #define DAS_VERIFYF(cond,...) 
{ \ if ( !(cond) ) { \ - printf("verify failed: %s, %s:%d\n", #cond, __FILE__, __LINE__); \ - printf(__VA_ARGS__); \ - fflush(stdout); \ + DAS_FATAL_LOG("verify failed: %s, %s:%d\n", #cond, __FILE__, __LINE__); \ + DAS_FATAL_LOG(__VA_ARGS__); \ exit(-1); \ } \ } #else #define DAS_VERIFYF(cond,...) { \ if ( !(cond) ) { \ - printf("verify failed: %s, %s:%d\n", #cond, __FILE__, __LINE__); \ - printf(__VA_ARGS__); \ - fflush(stdout); \ + DAS_FATAL_LOG("verify failed: %s, %s:%d\n", #cond, __FILE__, __LINE__); \ + DAS_FATAL_LOG(__VA_ARGS__); \ os_debug_break(); \ } \ } diff --git a/prog/1stPartyLibs/daScript/include/daScript/simulate/debug_info.h b/prog/1stPartyLibs/daScript/include/daScript/simulate/debug_info.h index 6e2a89232..5b7ad6c53 100644 --- a/prog/1stPartyLibs/daScript/include/daScript/simulate/debug_info.h +++ b/prog/1stPartyLibs/daScript/include/daScript/simulate/debug_info.h @@ -144,6 +144,7 @@ namespace das virtual bool invalidateFileInfo ( const string & fileName ); virtual string getIncludeFileName ( const string & fileName, const string & incFileName ) const; void freeSourceData(); + virtual int64_t getFileMtime ( const string & fileName ) const; FileInfoPtr letGoOfFileInfo ( const string & fileName ); virtual ModuleInfo getModuleInfo ( const string & req, const string & from ) const; virtual bool isModuleAllowed ( const string &, const string & ) const { return true; }; diff --git a/prog/1stPartyLibs/daScript/src/ast/ast.cpp b/prog/1stPartyLibs/daScript/src/ast/ast.cpp index 4d662ce21..67e0c1fd0 100644 --- a/prog/1stPartyLibs/daScript/src/ast/ast.cpp +++ b/prog/1stPartyLibs/daScript/src/ast/ast.cpp @@ -2680,14 +2680,12 @@ namespace das { Module * Program::addModule ( const string & name ) { if ( auto lm = library.findModule(name) ) { return lm; - } else { - if ( auto pm = Module::require(name) ) { - library.addModule(pm); - return pm; - } else { - return nullptr; - } } + if ( auto pm = Module::require(name) ) { + library.addModule(pm); + return pm; + } 
+ return nullptr; } bool Program::addAlias ( const TypeDeclPtr & at ) { diff --git a/prog/1stPartyLibs/daScript/src/ast/ast_const_folding.cpp b/prog/1stPartyLibs/daScript/src/ast/ast_const_folding.cpp index 109ec3191..6a8b9df89 100644 --- a/prog/1stPartyLibs/daScript/src/ast/ast_const_folding.cpp +++ b/prog/1stPartyLibs/daScript/src/ast/ast_const_folding.cpp @@ -627,6 +627,11 @@ namespace das { bool noSideEffects = true; for ( auto & src : expr->sources ) { noSideEffects &= src->noSideEffects; + if ( !noSideEffects ) break; + if ( src->type->isIterator() ) { + noSideEffects = false; + break; + } } if ( noSideEffects ) { reportFolding(); diff --git a/prog/1stPartyLibs/daScript/src/ast/ast_infer_type.cpp b/prog/1stPartyLibs/daScript/src/ast/ast_infer_type.cpp index 5f54abc9b..37efde433 100644 --- a/prog/1stPartyLibs/daScript/src/ast/ast_infer_type.cpp +++ b/prog/1stPartyLibs/daScript/src/ast/ast_infer_type.cpp @@ -4701,7 +4701,7 @@ namespace das { virtual ExpressionPtr visit ( ExprField * expr ) override { if ( !expr->value->type || expr->value->type->isAliasOrExpr() ) return Visitor::visit(expr); // failed to infer if ( expr->underClone ) { // we wait for the 'right' type to be infered - if ( !expr->underClone->right->type || expr->underClone->right->type->isAliasOrExpr() ) { + if ( !expr->underClone->right->type || expr->underClone->right->type->isAutoOrAlias() ) { error("under clone field type not infered yet", "", "", expr->at, CompilationError::cant_get_field); return Visitor::visit(expr); @@ -4998,19 +4998,6 @@ namespace das { return Visitor::visit(expr); } } - // with - if ( auto eW = hasMatchingWith(expr->name) ) { - reportAstChanged(); - return make_smart(expr->at, forceAt(eW->with->clone(),expr->at), expr->name); - } - // static class method accessing static variables - if ( func && func->isStaticClassMethod && func->classParent->hasStaticMembers ) { - auto staticVarName = func->classParent->name + "`" + expr->name; - if ( 
func->classParent->module->findVariable(staticVarName) ) { - reportAstChanged(); - return make_smart(expr->at, staticVarName); - } - } // block arguments for ( auto it = blocks.rbegin(); it!=blocks.rend(); ++it ) { ExprBlock * block = *it; @@ -5050,6 +5037,19 @@ namespace das { argumentIndex ++; } } + // with + if ( auto eW = hasMatchingWith(expr->name) ) { + reportAstChanged(); + return make_smart(expr->at, forceAt(eW->with->clone(),expr->at), expr->name); + } + // static class method accessing static variables + if ( func && func->isStaticClassMethod && func->classParent->hasStaticMembers ) { + auto staticVarName = func->classParent->name + "`" + expr->name; + if ( func->classParent->module->findVariable(staticVarName) ) { + reportAstChanged(); + return make_smart(expr->at, staticVarName); + } + } // global auto vars = findMatchingVar(expr->name, false); if ( vars.size()==1 ) { diff --git a/prog/1stPartyLibs/daScript/src/ast/ast_parse.cpp b/prog/1stPartyLibs/daScript/src/ast/ast_parse.cpp index ea01d2f64..6aec3777b 100644 --- a/prog/1stPartyLibs/daScript/src/ast/ast_parse.cpp +++ b/prog/1stPartyLibs/daScript/src/ast/ast_parse.cpp @@ -1,6 +1,7 @@ #include "daScript/misc/platform.h" #include "daScript/ast/ast.h" +#include "daScript/ast/ast_serializer.h" #include "daScript/ast/ast_expressions.h" #include "../parser/parser_state.h" @@ -284,6 +285,49 @@ namespace das { static DAS_THREAD_LOCAL int64_t totOpt = 0; static DAS_THREAD_LOCAL int64_t totM = 0; + bool trySerializeProgramModule ( + ProgramPtr & program, + const FileAccessPtr & access, + const string & fileName, + ModuleGroup & libGroup ) { + auto & serializer_read = daScriptEnvironment::bound->serializer_read; + auto & serializer_write = daScriptEnvironment::bound->serializer_write; + + if ( serializer_read == nullptr || serializer_read->seenNewModule ) { + return false; + } + + int64_t file_mtime = access->getFileMtime(fileName.c_str()); + int64_t saved_mtime = 0; *serializer_read << saved_mtime; + + string 
saved_filename; *serializer_read << saved_filename; + DAS_ASSERTF(saved_filename == fileName, "expected the same order of modules"); + + if ( file_mtime != saved_mtime ) { + serializer_read->seenNewModule = true; + return false; + } + + serializer_read->thisModuleGroup = &libGroup; + serializer_read->serializeProgram(program, libGroup); + program->thisModuleGroup = &libGroup; + + if ( serializer_read->failed ) { + serializer_read->seenNewModule = true; + program = make_smart(); + return false; + } + + // Writeback + if ( serializer_write != nullptr ) { + *serializer_write << file_mtime; + *serializer_write << const_cast(fileName); + serializer_write->serializeProgram(program, libGroup); + } + + return true; + } + ProgramPtr parseDaScript ( const string & fileName, const FileAccessPtr & access, TextWriter & logs, @@ -291,10 +335,15 @@ namespace das { bool exportAll, bool isDep, CodeOfPolicies policies ) { + ProgramPtr program = make_smart(); ReuseCacheGuard rcg; auto time0 = ref_time_ticks(); + + if ( trySerializeProgramModule(program, access, fileName, libGroup) ) { + return program; + } + int err; - auto program = make_smart(); daScriptEnvironment::bound->g_Program = program; daScriptEnvironment::bound->g_compilerLog = &logs; program->promoteToBuiltin = false; @@ -427,6 +476,13 @@ namespace das { auto dt = get_time_usec(time0) / 1000000.; logs << "compiler took " << dt << ", " << fileName << "\n"; } + auto & serializer_write = daScriptEnvironment::bound->serializer_write; + if ( serializer_write != nullptr ) { + int64_t file_mtime = access->getFileMtime(fileName.c_str()); + *serializer_write << file_mtime; + *serializer_write << const_cast(fileName); + serializer_write->serializeProgram(program, libGroup); + } return program; } } @@ -461,6 +517,125 @@ namespace das { } } + vector saveRequireMetadata ( vector & req ) { + AstSerializer ser; + for ( auto & r : req ) { + ser << r.fileName; + } + return das::move(ser.buffer); + } + + vector restoreMetadata ( vector & 
metadata ) { + vector result; + AstSerializer deser{ForReading{}, das::move(metadata)}; + while ( deser.buffer.size() != deser.readOffset ) { + string filename; deser << filename; + result.push_back(das::move(filename)); + } + return result; + } + + void updateSerializationMetadata ( vector & req ) { + auto & serializer_read = daScriptEnvironment::bound->serializer_read; + auto & serializer_write = daScriptEnvironment::bound->serializer_write; + if ( serializer_read != nullptr ) { + auto saved_filenames = restoreMetadata(serializer_read->metadata); + auto current_filenames = vector(); + for ( auto & mod : req ) { current_filenames.push_back(mod.fileName); } + if ( current_filenames != saved_filenames ) { serializer_read->seenNewModule = true; } + } + if ( serializer_write != nullptr ) + serializer_write->metadata = saveRequireMetadata(req); + }; + + bool aotModuleHasName ( ProgramPtr program, const ModuleInfo & mod ) { + if ( bool no_aot = program->options.getBoolOption("no_aot",false); no_aot ) + return true; + if ( !program->thisModule->name.empty() ) + return true; + program->error("Module " + mod.moduleName + " is not setup correctly for AOT", + "module " + mod.moduleName + " is required", "", LineInfo(), + CompilationError::module_does_not_have_a_name); + return false; + } + + void addNewModules ( ModuleGroup & libGroup, ProgramPtr program ) { + libGroup.addModule(program->thisModule.release()); + program->library.foreach([&](Module * pm) -> bool { + if ( !pm->name.empty() && pm->name!="$" ) { + if ( !libGroup.findModule(pm->name) ) { + libGroup.addModule(pm); + } + } + return true; + }, "*"); + } + + bool canShareModule ( ProgramPtr program ) { + // Check if all dependencies are shared too + bool regFromShar = false; + for ( auto & reqM : program->thisModule->requireModule ) { + if ( !reqM.first->builtIn ) { + program->error("Shared module " + program->thisModule->name + " has incorrect dependency type.", + "Can't require " + reqM.first->name + " because its 
not shared", "", LineInfo(), + CompilationError::module_required_from_shared); + regFromShar = true; + } + } + return !regFromShar; + } + + void addRttiRequireVariable ( ProgramPtr res, string fileName ) { + TextWriter ss; + for ( const auto & arq : res->allRequireDecl ) { + ss << get<1>(arq) << " "; + } + ss << fileName; + auto rtti_require = make_smart(); + rtti_require->name = "__rtti_require"; + rtti_require->type = make_smart(Type::tString); + rtti_require->init = make_smart(ss.str()); + rtti_require->init->type = make_smart(Type::tString); + rtti_require->used = true; + rtti_require->private_variable = true; + res->thisModule->addVariable(rtti_require); + } + + + ProgramPtr reportPrerequisitesErrors ( + string fileName, + vector & missing, + vector & circular, + vector & notAllowed, + vector & req, + das_set & dependencies, + const FileAccessPtr & access, + ModuleGroup & libGroup, + CodeOfPolicies policies ) { + TextWriter tw; + req.clear(); + missing.clear(); + circular.clear(); + dependencies.clear(); + getPrerequisits(fileName, access, req, missing, circular, notAllowed, dependencies, libGroup, &tw, 1, false); + auto program = make_smart(); + program->policies = policies; + program->thisModuleGroup = &libGroup; + TextWriter err; + for ( auto & mis : missing ) { + err << "missing prerequisit " << mis << "\n"; + } + for ( auto & mis : circular ) { + err << "circular dependency " << mis << "\n"; + } + for ( auto & mis : notAllowed ) { + err << "module not allowed " << mis << "\n"; + } + program->error(err.str(),"module dependency graph:\n" + tw.str(), "", LineInfo(), + CompilationError::module_not_found); + return program; + } + ProgramPtr compileDaScript ( const string & fileName, const FileAccessPtr & access, TextWriter & logs, @@ -481,6 +656,7 @@ namespace das { if ( getPrerequisits(fileName, access, req, missing, circular, notAllowed, dependencies, libGroup, nullptr, 1, !policies.ignore_shared_modules) ) { preqT = get_time_usec(time0); + 
updateSerializationMetadata(req); if ( policies.debugger ) { addExtraDependency("debug", policies.debug_module, missing, circular, notAllowed, req, dependencies, access, libGroup, policies); } @@ -488,52 +664,32 @@ namespace das { addExtraDependency("profiler", policies.profile_module, missing, circular, notAllowed, req, dependencies, access, libGroup, policies); } for ( auto & mod : req ) { - if ( !libGroup.findModule(mod.moduleName) ) { - auto program = parseDaScript(mod.fileName, access, logs, libGroup, true, true, policies); - policies.threadlock_context |= program->options.getBoolOption("threadlock_context",false); - if ( program->failed() ) { - return program; - } - if ( policies.fail_on_lack_of_aot_export ) { - if ( !program->options.getBoolOption("no_aot",false) ) { - if ( program->thisModule->name.empty() ) { - program->error("Module " + mod.moduleName + " is not setup correctly for AOT", - "module " + mod.moduleName + " is required", "", LineInfo(), - CompilationError::module_does_not_have_a_name); - return program; - } - } - } - if ( program->thisModule->name.empty() ) { - program->thisModule->name = mod.moduleName; - program->thisModule->wasParsedNameless = true; - } - if ( program->promoteToBuiltin ) { - bool regFromShar = false; - for ( auto & reqM : program->thisModule->requireModule ) { - if ( !reqM.first->builtIn ) { - program->error("Shared module " + program->thisModule->name + " has incorrect dependency type.", - "Can't require " + reqM.first->name + " because its not shared", "", LineInfo(), - CompilationError::module_required_from_shared); - regFromShar = true; - } - } - if ( regFromShar ) { - return program; - } + if ( libGroup.findModule(mod.moduleName) ) { + continue; + } + auto program = parseDaScript(mod.fileName, access, logs, libGroup, true, true, policies); + policies.threadlock_context |= program->options.getBoolOption("threadlock_context",false); + if ( program->failed() ) { + return program; + } + if ( 
policies.fail_on_lack_of_aot_export && !aotModuleHasName(program, mod) ) { + return program; + } + if ( program->thisModule->name.empty() ) { + program->thisModule->name = mod.moduleName; + program->thisModule->wasParsedNameless = true; + } + if ( program->promoteToBuiltin ) { + if ( canShareModule(program) ) { program->thisModule->promoteToBuiltin(access); + } else { + return program; } - libGroup.addModule(program->thisModule.release()); - program->library.foreach([&](Module * pm) -> bool { - if ( !pm->name.empty() && pm->name!="$" ) { - if ( !libGroup.findModule(pm->name) ) { - libGroup.addModule(pm); - } - } - return true; - }, "*"); } + addNewModules(libGroup, program); } + auto & serializer_read = daScriptEnvironment::bound->serializer_read; + if ( serializer_read && !policies.serialize_main_module ) serializer_read->seenNewModule = true; auto res = parseDaScript(fileName, access, logs, libGroup, exportAll, false, policies); policies.threadlock_context |= res->options.getBoolOption("threadlock_context",false); if ( !res->failed() ) { @@ -555,21 +711,8 @@ namespace das { } else { if (!res->failed()) res->markExecutableSymbolUse(); - if ( res->getDebugger()) { - TextWriter ss; - for ( const auto & arq : res->allRequireDecl ) { - ss << get<1>(arq) << " "; - } - ss << fileName; - auto rtti_require = make_smart(); - rtti_require->name = "__rtti_require"; - rtti_require->type = make_smart(Type::tString); - rtti_require->init = make_smart(ss.str()); - rtti_require->init->type = make_smart(Type::tString); - rtti_require->used = true; - rtti_require->private_variable = true; - res->thisModule->addVariable(rtti_require); - } + if (res->getDebugger()) + addRttiRequireVariable(res, fileName); if (!res->failed()) res->removeUnusedSymbols(); if (!res->failed()) @@ -602,28 +745,8 @@ namespace das { } return res; } else { - TextWriter tw; - req.clear(); - missing.clear(); - circular.clear(); - dependencies.clear(); - getPrerequisits(fileName, access, req, missing, circular, 
notAllowed, dependencies, libGroup, &tw, 1, false); - auto program = make_smart(); - program->policies = policies; - program->thisModuleGroup = &libGroup; - TextWriter err; - for ( auto & mis : missing ) { - err << "missing prerequisit " << mis << "\n"; - } - for ( auto & mis : circular ) { - err << "circular dependency " << mis << "\n"; - } - for ( auto & mis : notAllowed ) { - err << "module not allowed " << mis << "\n"; - } - program->error(err.str(),"module dependency graph:\n" + tw.str(), "", LineInfo(), - CompilationError::module_not_found); - return program; + return reportPrerequisitesErrors(fileName, missing, circular, notAllowed, + req, dependencies, access, libGroup, policies); } } } diff --git a/prog/1stPartyLibs/daScript/src/builtin/module_builtin_ast_serialize.cpp b/prog/1stPartyLibs/daScript/src/builtin/module_builtin_ast_serialize.cpp index 4c59709c9..691b8310d 100644 --- a/prog/1stPartyLibs/daScript/src/builtin/module_builtin_ast_serialize.cpp +++ b/prog/1stPartyLibs/daScript/src/builtin/module_builtin_ast_serialize.cpp @@ -326,11 +326,6 @@ namespace das { string mangeldName = func->getMangledName(); string moduleName = func->module->name; *this << moduleName << mangeldName; - if ( func->module->findFunction(mangeldName) == nullptr && func->module->builtIn && !func->module->promoted ) { - auto f = func->module->findUniqueFunction(func->name); - DAS_VERIFYF(f, "expected to find f"); - *this << f->name; - } } void AstSerializer::writeIdentifications ( Enumeration * & ptr ) { @@ -406,7 +401,10 @@ namespace das { splitTypeName(mangledName, modname, funcname); func = funcModule->findFunction(funcname).get(); } - DAS_VERIFYF(func!=nullptr, "function '%s' is not found", mangledName.c_str()); + if ( func == nullptr ) { + failed = true; + das_to_stderr("das: ser: function '%s' not found", mangledName.c_str()); + } } void AstSerializer::findExternal ( Enumeration * & ptr ) { @@ -838,6 +836,7 @@ namespace das { if ( !is_null ) { string name; *this << name; 
module = moduleLibrary->findModule(name); + DAS_VERIFYF(module, "expected to fetch module from library"); } else { module = nullptr; } @@ -1291,17 +1290,20 @@ namespace das { if ( value->type->isPointer() ) { DAS_VERIFYF(value->type->firstType->isStructure(), "expected to see structure field access via pointer"); mangledName = value->type->firstType->structType->getMangledName(); + ser << value->type->firstType->structType->module; } else { DAS_VERIFYF(value->type->isStructure(), "expected to see structure field access"); mangledName = value->type->structType->getMangledName(); + ser << value->type->structType->module; } ser << mangledName; } else { bool has_field = false; ser << has_field; if ( !has_field ) return; + Module * module; ser << module; string mangledName; ser << mangledName; field = ( Structure::FieldDeclaration * ) 1; - ser.fieldRefs.emplace_back(&field, ser.thisModule, move(mangledName), name); + ser.fieldRefs.emplace_back(&field, module, move(mangledName), name); } } @@ -1644,7 +1646,6 @@ namespace das { program->isCompiling = false; program->markMacroSymbolUse(); program->allocateStack(ignore_logs); - daScriptEnvironment::bound->g_Program = program; program->makeMacroModule(ignore_logs); // unbind the module from the program return program->thisModule.release(); @@ -1654,6 +1655,7 @@ namespace das { void finalizeModule ( AstSerializer & ser, ModuleLibrary & lib, Module * this_mod ) { ProgramPtr program; + if ( ser.failed ) return; // simulate macros if ( ser.writing ) { bool is_macro_module = this_mod->macroContext; // it's a macro module if it has macroContext @@ -1668,6 +1670,7 @@ namespace das { program->thisModuleGroup = ser.thisModuleGroup; program->thisModuleName.clear(); program->library.reset(); + program->policies.stack = 64 * 1024; program->thisModule.release(); program->thisModule.reset(this_mod); lib.foreach([&] ( Module * pm ) { @@ -1675,6 +1678,7 @@ namespace das { return true; },"*"); // always finalize annotations + 
daScriptEnvironment::bound->g_Program = program; program->finalizeAnnotations(); bool is_macro_module = false; @@ -1857,7 +1861,9 @@ namespace das { serializeGlobals(ser, globals); // globals require insertion in the same order serializeStructures(ser, structures); serializeFunctions(ser, functions); + if ( ser.failed ) return; serializeFunctions(ser, generics); + if ( ser.failed ) return; ser << functionsByName << genericsByName; ser << ownFileInfo; //<< promotedAccess; @@ -1909,6 +1915,11 @@ namespace das { } } + vector getDependecyOrdered(Module * m) { + visit(m); + return std::move(sorted); + } + vector getDependecyOrdered() { for ( auto mod : input ) { visit(mod); @@ -1990,6 +2001,75 @@ namespace das { return *this; } + // Used in eden + void AstSerializer::serializeProgram ( ProgramPtr program, ModuleGroup & libGroup ) { + auto & ser = *this; + + ser << program->thisNamespace << program->thisModuleName; + + ser << program->totalFunctions << program->totalVariables << program->newLambdaIndex; + ser << program->globalInitStackSize << program->globalStringHeapSize; + ser << program->flags; + + ser << program->options << program->policies; + + if ( writing ) { + TopSort ts(program->library.getModules()); + auto modules = ts.getDependecyOrdered(program->thisModule.get()); + + uint64_t size = modules.size(); *this << size; + + for ( auto & m : modules ) { + bool builtin = m->builtIn, promoted = m->promoted; + *this << builtin << promoted; + *this << m->name; + + if ( m->builtIn && !m->promoted ) { + continue; + } + + if ( writingReadyModules.count(m) == 0 ) { + writingReadyModules.insert(m); + *this << *m; + } + } + } else { + uint64_t size = 0; ser << size; + + program->library.reset(); + program->thisModule.release(); + moduleLibrary = &program->library; + + for ( uint64_t i = 0; i < size; i++ ) { + bool builtin, promoted; + ser << builtin << promoted; + string name; ser << name; + + if ( builtin && !promoted ) { + auto m = Module::require(name); + 
program->library.addModule(m); + continue; + } + + if ( auto m = libGroup.findModule(name) ) { + program->library.addModule(m); + continue; + } + + auto deser = new Module(); + program->library.addModule(deser); + ser << *deser; + } + + for ( auto & m : program->library.getModules() ) { + if ( m->name == program->thisModuleName ) { + program->thisModule.reset(m); + } + } + } + } + + // Used in daNetGame currently void Program::serialize ( AstSerializer & ser ) { ser << thisNamespace << thisModuleName; diff --git a/prog/1stPartyLibs/daScript/src/builtin/module_file_access.cpp b/prog/1stPartyLibs/daScript/src/builtin/module_file_access.cpp index 8f6f8abf7..dec91230d 100644 --- a/prog/1stPartyLibs/daScript/src/builtin/module_file_access.cpp +++ b/prog/1stPartyLibs/daScript/src/builtin/module_file_access.cpp @@ -1,6 +1,10 @@ #include "daScript/misc/platform.h" #include "daScript/ast/ast.h" +#if !defined(DAS_NO_FILEIO) +#include +#endif + das::Context * get_context ( int stackSize=0 ); namespace das { @@ -65,6 +69,16 @@ namespace das { } } + int64_t FileAccess::getFileMtime ( const string & fileName) const { +#if !defined(DAS_NO_FILEIO) + struct stat st; + stat(fileName.c_str(), &st); + return st.st_mtime; +#else + return -1; +#endif + } + bool ModuleFileAccess::canModuleBeUnsafe ( const string & mod, const string & fileName ) const { if(failed() || !moduleUnsafe) return FileAccess::canModuleBeUnsafe(mod,fileName); vec4f args[2]; diff --git a/prog/1stPartyLibs/daScript/src/misc/sysos.cpp b/prog/1stPartyLibs/daScript/src/misc/sysos.cpp index 0a74f2ac4..80319f9f3 100644 --- a/prog/1stPartyLibs/daScript/src/misc/sysos.cpp +++ b/prog/1stPartyLibs/daScript/src/misc/sysos.cpp @@ -404,14 +404,13 @@ #include #include namespace das { - static char executablePath[MAXPATHLEN]; - extern "C" void - initialize_before(image_id ourImage) - { - image_info ii; - get_image_info(ourImage, &ii); - snprintf(executablePath, sizeof(executablePath), "%s", ii.name); - } + static char 
executablePath[MAXPATHLEN]; + extern "C" void + initialize_before(image_id ourImage) { + image_info ii; + get_image_info(ourImage, &ii); + snprintf(executablePath, sizeof(executablePath), "%s", ii.name); + } void hwSetBreakpointHandler ( void (*) ( int, void * ) ) { } int hwBreakpointSet ( void *, int, int ) { return -1; diff --git a/prog/1stPartyLibs/daScript/src/parser/parser_impl.cpp b/prog/1stPartyLibs/daScript/src/parser/parser_impl.cpp index 0e2de2986..5d358d517 100644 --- a/prog/1stPartyLibs/daScript/src/parser/parser_impl.cpp +++ b/prog/1stPartyLibs/daScript/src/parser/parser_impl.cpp @@ -535,13 +535,14 @@ namespace das { const LineInfo & fromBlock, const LineInfo & annLAt ) { func->atDecl = fromBlock; func->body = block; + auto isGeneric = func->isGeneric(); if ( !yyextra->g_thisStructure ) { das_yyerror(scanner,"internal error or invalid macro. member function is declared outside of a class", func->at, CompilationError::invalid_member_function); } else if ( yyextra->g_Program->policies.no_members_functions_in_struct && !yyextra->g_thisStructure->isClass ) { das_yyerror(scanner,"structure can't have a member function", func->at, CompilationError::invalid_member_function); - } else if ( func->isGeneric() ) { + } else if ( isGeneric && !isStatic ) { das_yyerror(scanner,"generic function can't be a member of a class " + func->getMangledName(), func->at, CompilationError::invalid_member_function); } else if ( isOpName(func->name) ) { @@ -617,10 +618,18 @@ namespace das { } } assignDefaultArguments(func); - runFunctionAnnotations(scanner, func, annL, annLAt); - if ( !yyextra->g_Program->addFunction(func) ) { - das_yyerror(scanner,"function is already defined " + func->getMangledName(), - func->at, CompilationError::function_already_declared); + if ( isGeneric ) { + if ( !yyextra->g_Program->addGeneric(func) ) { + das_yyerror(scanner,"generic function is already defined " + func->getMangledName(), + func->at, CompilationError::function_already_declared); + } + + 
} else { + runFunctionAnnotations(scanner, func, annL, annLAt); + if ( !yyextra->g_Program->addFunction(func) ) { + das_yyerror(scanner,"function is already defined " + func->getMangledName(), + func->at, CompilationError::function_already_declared); + } } func->delRef(); } diff --git a/prog/1stPartyLibs/dag/dag_relocatable.h b/prog/1stPartyLibs/dag/dag_relocatable.h index d178645aa..0150e5bfc 100644 --- a/prog/1stPartyLibs/dag/dag_relocatable.h +++ b/prog/1stPartyLibs/dag/dag_relocatable.h @@ -3,7 +3,7 @@ * Copyright (C) 2023 Gaijin Games KFT. All rights reserved * * (for conditions of use see prog/license.txt) -*/ + */ #ifndef _DAGOR_DAG_RELOCATABLE_H_ #define _DAGOR_DAG_RELOCATABLE_H_ @@ -11,16 +11,33 @@ #include +#include namespace dag { - template - struct is_type_relocatable : public eastl::false_type {}; - template - struct is_type_init_constructing : public eastl::true_type {}; -} +template +struct is_type_relocatable : public eastl::false_type +{}; -#define DAG_DECLARE_RELOCATABLE(C) template<> struct dag::is_type_relocatable : public eastl::true_type {} +template +struct is_type_relocatable>> : public eastl::true_type +{}; + +template +struct is_type_relocatable, + typename eastl::enable_if_t::value && is_type_relocatable::value>> : public eastl::true_type +{}; + +template +struct is_type_init_constructing : public eastl::true_type +{}; + +} // namespace dag + +#define DAG_DECLARE_RELOCATABLE(C) \ + template <> \ + struct dag::is_type_relocatable : public eastl::true_type \ + {} #endif diff --git a/prog/1stPartyLibs/dag/dag_vector.h b/prog/1stPartyLibs/dag/dag_vector.h index 9c14e402c..65c079b48 100644 --- a/prog/1stPartyLibs/dag/dag_vector.h +++ b/prog/1stPartyLibs/dag/dag_vector.h @@ -99,9 +99,6 @@ small_vector_default_fill_n(ForwardIterator first, size_t n) eastl::uninitialized_default_fill_n(first, n); } -template -struct is_type_relocatable>> : public eastl::true_type {}; - template class Vector; template struct is_type_relocatable, void> : public 
eastl::true_type {}; @@ -841,7 +838,7 @@ class Vector IF_CONSTEXPR(is_type_relocatable::value) { if (mpEnd != destPosition) // branch for append-like inserts - memmove(destPosition + n, destPosition, (char*)mpEnd - (char*)destPosition); //-V780 + memmove((void*)&destPosition[n], (const void*)destPosition, (char*)mpEnd - (char*)destPosition); //-V780 eastl::uninitialized_copy_ptr(first, last, destPosition); } else if (n < nExtra) // If the inserted values are entirely within initialized memory (i.e. are before mpEnd)... diff --git a/prog/1stPartyLibs/protolib/serialization/tests/generate.cmd b/prog/1stPartyLibs/protolib/serialization/tests/generate.cmd deleted file mode 100644 index b2c21113d..000000000 --- a/prog/1stPartyLibs/protolib/serialization/tests/generate.cmd +++ /dev/null @@ -1,3 +0,0 @@ -"../../../../../prog\tools\custom-protoc\exec\custom-protoc-dbg.exe" --proto_path=../../../../../prog\tools\custom-protoc;.;../../../../../prog/3rdPartyLibs/protobuf/src --char_out=generated testMapSerialization.proto -"../../../../../prog\tools\custom-protoc\exec\custom-protoc-dbg.exe" --proto_path=../../../../../prog\tools\custom-protoc;.;../../../../../prog/3rdPartyLibs/protobuf/src --char_out=generated testVectorSerialization.proto -pause \ No newline at end of file diff --git a/prog/1stPartyLibs/protolib/serialization/tests/generate.sh b/prog/1stPartyLibs/protolib/serialization/tests/generate.sh deleted file mode 100644 index f4d1edeb8..000000000 --- a/prog/1stPartyLibs/protolib/serialization/tests/generate.sh +++ /dev/null @@ -1,2 +0,0 @@ -../../../../../prog/tools/custom-protoc/exec/custom-protoc-dbg --proto_path=../../../../../prog/tools/custom-protoc --proto_path=. --proto_path=../../../../../prog/3rdPartyLibs/protobuf/src --char_out=generated testMapSerialization.proto -../../../../../prog/tools/custom-protoc/exec/custom-protoc-dbg --proto_path=../../../../../prog/tools/custom-protoc --proto_path=. 
--proto_path=../../../../../prog/3rdPartyLibs/protobuf/src --char_out=generated testVectorSerialization.proto \ No newline at end of file diff --git a/prog/1stPartyLibs/protolib/serialization/tests/generated/testMapSerialization.pb.cpp b/prog/1stPartyLibs/protolib/serialization/tests/generated/testMapSerialization.pb.cpp deleted file mode 100644 index 1eb69682e..000000000 --- a/prog/1stPartyLibs/protolib/serialization/tests/generated/testMapSerialization.pb.cpp +++ /dev/null @@ -1,136 +0,0 @@ -#include "testMapSerialization.pb.h" - -#include -#include -#include -#include -#include -#include - - -TestItem::TestItem() -{ - clear(); -} - - -void TestItem::clear() -{ - name.clear(); -} - - -TestItem::~TestItem() -{ -} - - -void TestItem::touchRecursive(proto::version value) -{ - touch(value); -} - - -void TestItem::saveToDB( - proto::io::OutputCodeStream & stream, - const proto::BinarySaveOptions & saveOptions) const -{ - stream.blockBegin(saveOptions.getNumber()); - stream.writeString(1, proto::str_cstr(name), proto::str_size(name)); - stream.blockEnd(); -} - - -bool TestItem::loadFromDB( - proto::io::InputCodeStream & stream, - proto::io::StreamTag tag) -{ - PROTO_VALIDATE(tag.type == proto::io::StreamTag::BLOCK_BEGIN); - PROTO_VALIDATE(stream.readTag(tag)); - while (true) - { - if (tag.isBlockEnded()) - break; - - switch (tag.number) - { - case 1: - PROTO_VALIDATE(proto::io::readString(stream, tag, name)); - PROTO_VALIDATE(stream.readTag(tag)); - break; - default: - PROTO_VALIDATE(stream.skip(tag)); // unknown field - PROTO_VALIDATE(stream.readTag(tag)); - } - - } - - return true; -} - - -TestMessage::TestMessage() -{ - clear(); -} - - -void TestMessage::clear() -{ - - items.clear(); - items.touchStructure(proto::DEFAULT_VERSION); -} - - -TestMessage::~TestMessage() -{ -} - - -void TestMessage::touchRecursive(proto::version value) -{ - touch(value); - items.touchRecursive(value); -} - - -void TestMessage::saveToDB( - proto::io::OutputCodeStream & stream, - const 
proto::BinarySaveOptions & saveOptions) const -{ - stream.blockBegin(saveOptions.getNumber()); - proto::io::writeMapVersioned(stream, 1, items, &TestItem::saveToDB); - - stream.blockEnd(); -} - - -bool TestMessage::loadFromDB( - proto::io::InputCodeStream & stream, - proto::io::StreamTag tag) -{ - PROTO_VALIDATE(tag.type == proto::io::StreamTag::BLOCK_BEGIN); - PROTO_VALIDATE(stream.readTag(tag)); - while (true) - { - if (tag.isBlockEnded()) - break; - - switch (tag.number) - { - case 1: - PROTO_VALIDATE((proto::io::MapSerialization::readMap( - stream, tag, items, &TestItem::loadFromDB))); - PROTO_VALIDATE(stream.readTag(tag)); - break; - default: - PROTO_VALIDATE(stream.skip(tag)); // unknown field - PROTO_VALIDATE(stream.readTag(tag)); - } - - } - - return true; -} - diff --git a/prog/1stPartyLibs/protolib/serialization/tests/generated/testMapSerialization.pb.h b/prog/1stPartyLibs/protolib/serialization/tests/generated/testMapSerialization.pb.h deleted file mode 100644 index 27d6fc248..000000000 --- a/prog/1stPartyLibs/protolib/serialization/tests/generated/testMapSerialization.pb.h +++ /dev/null @@ -1,103 +0,0 @@ -// Generated by the protocol buffer compiler. DO NOT EDIT! 
-// source: testMapSerialization.proto - -#pragma once - - -#include - - -namespace proto -{ - namespace io - { - struct StreamTag; - class OutputCodeStream; - class InputCodeStream; - } -} - - - -class TestItem: - public proto::SerializableVersioned -{ -public: - TestItem(); - ~TestItem(); - - void clear(); - - void touchRecursive(proto::version value); - - void saveToDB( - proto::io::OutputCodeStream & stream, - const proto::BinarySaveOptions & saveOptions) const; - - bool loadFromDB(proto::io::InputCodeStream & stream, proto::io::StreamTag tag); - - const proto::string & getName() const; - void setName(const proto::string & value); - -private: - proto::string name; -}; - -typedef proto::ContainerVersioned::TType> - TStringToTestItemUnorderedMap; - - -class TestMessage: - public proto::SerializableVersioned -{ -public: - TestMessage(); - ~TestMessage(); - - void clear(); - - void touchRecursive(proto::version value); - - void saveToDB( - proto::io::OutputCodeStream & stream, - const proto::BinarySaveOptions & saveOptions) const; - - bool loadFromDB(proto::io::InputCodeStream & stream, proto::io::StreamTag tag); - - TStringToTestItemUnorderedMap & getItems(); - const TStringToTestItemUnorderedMap & getItems() const; - -private: - TStringToTestItemUnorderedMap items; -}; - - -inline const proto::string & TestItem::getName() const -{ - return name; -} - - -inline void TestItem::setName(const proto::string & value) -{ - name = value; -} - - -inline TStringToTestItemUnorderedMap & TestMessage::getItems() -{ - return items; -} - - -inline const TStringToTestItemUnorderedMap & TestMessage::getItems() const -{ - return items; -} - - -// type traits -namespace proto -{ -} - diff --git a/prog/1stPartyLibs/protolib/serialization/tests/generated/testVectorSerialization.pb.cpp b/prog/1stPartyLibs/protolib/serialization/tests/generated/testVectorSerialization.pb.cpp deleted file mode 100644 index 91a428969..000000000 --- 
a/prog/1stPartyLibs/protolib/serialization/tests/generated/testVectorSerialization.pb.cpp +++ /dev/null @@ -1,138 +0,0 @@ -#include "testVectorSerialization.pb.h" - -#include -#include -#include -#include -#include -#include - -namespace proto_vector_test -{ - - TestItem::TestItem() - { - clear(); - } - - - void TestItem::clear() - { - name.clear(); - } - - - TestItem::~TestItem() - { - } - - - void TestItem::touchRecursive(proto::version value) - { - touch(value); - } - - - void TestItem::saveToDB( - proto::io::OutputCodeStream & stream, - const proto::BinarySaveOptions & saveOptions) const - { - stream.blockBegin(saveOptions.getNumber()); - stream.writeString(1, proto::str_cstr(name), proto::str_size(name)); - stream.blockEnd(); - } - - - bool TestItem::loadFromDB( - proto::io::InputCodeStream & stream, - proto::io::StreamTag tag) - { - PROTO_VALIDATE(tag.type == proto::io::StreamTag::BLOCK_BEGIN); - PROTO_VALIDATE(stream.readTag(tag)); - while (true) - { - if (tag.isBlockEnded()) - break; - - switch (tag.number) - { - case 1: - PROTO_VALIDATE(proto::io::readString(stream, tag, name)); - PROTO_VALIDATE(stream.readTag(tag)); - break; - default: - PROTO_VALIDATE(stream.skip(tag)); // unknown field - PROTO_VALIDATE(stream.readTag(tag)); - } - - } - - return true; - } - - - TestMessage::TestMessage() - { - clear(); - } - - - void TestMessage::clear() - { - - items.clear(); - items.touchStructure(proto::DEFAULT_VERSION); - } - - - TestMessage::~TestMessage() - { - } - - - void TestMessage::touchRecursive(proto::version value) - { - touch(value); - items.touchRecursive(value); - } - - - void TestMessage::saveToDB( - proto::io::OutputCodeStream & stream, - const proto::BinarySaveOptions & saveOptions) const - { - stream.blockBegin(saveOptions.getNumber()); - proto::io::writeVectorVersioned(stream, 1, items, &TestItem::saveToDB); - stream.blockEnd(); - } - - - bool TestMessage::loadFromDB( - proto::io::InputCodeStream & stream, - proto::io::StreamTag tag) - { - 
PROTO_VALIDATE(tag.type == proto::io::StreamTag::BLOCK_BEGIN); - PROTO_VALIDATE(stream.readTag(tag)); - while (true) - { - if (tag.isBlockEnded()) - break; - - switch (tag.number) - { - case 1: - PROTO_VALIDATE((proto::io::readVectorVersioned( - stream, tag, items, &TestItem::loadFromDB))); - PROTO_VALIDATE(stream.readTag(tag)); - break; - default: - PROTO_VALIDATE(stream.skip(tag)); // unknown field - PROTO_VALIDATE(stream.readTag(tag)); - } - - } - - return true; - } - -} diff --git a/prog/1stPartyLibs/protolib/serialization/tests/generated/testVectorSerialization.pb.h b/prog/1stPartyLibs/protolib/serialization/tests/generated/testVectorSerialization.pb.h deleted file mode 100644 index 3906a162d..000000000 --- a/prog/1stPartyLibs/protolib/serialization/tests/generated/testVectorSerialization.pb.h +++ /dev/null @@ -1,107 +0,0 @@ -// Generated by the protocol buffer compiler. DO NOT EDIT! -// source: testVectorSerialization.proto - -#pragma once - - -#include - - -namespace proto -{ - namespace io - { - struct StreamTag; - class OutputCodeStream; - class InputCodeStream; - } -} - - -namespace proto_vector_test -{ - - class TestItem: - public proto::SerializableVersioned - { - public: - TestItem(); - ~TestItem(); - - void clear(); - - void touchRecursive(proto::version value); - - void saveToDB( - proto::io::OutputCodeStream & stream, - const proto::BinarySaveOptions & saveOptions) const; - - bool loadFromDB(proto::io::InputCodeStream & stream, proto::io::StreamTag tag); - - const proto::string & getName() const; - void setName(const proto::string & value); - - private: - proto::string name; - }; - - typedef proto::ContainerVersioned::TType> - TTestItemVector; - - - class TestMessage: - public proto::SerializableVersioned - { - public: - TestMessage(); - ~TestMessage(); - - void clear(); - - void touchRecursive(proto::version value); - - void saveToDB( - proto::io::OutputCodeStream & stream, - const proto::BinarySaveOptions & saveOptions) const; - - bool 
loadFromDB(proto::io::InputCodeStream & stream, proto::io::StreamTag tag); - - TTestItemVector & getItems(); - const TTestItemVector & getItems() const; - - private: - TTestItemVector items; - }; - - - inline const proto::string & TestItem::getName() const - { - return name; - } - - - inline void TestItem::setName(const proto::string & value) - { - name = value; - } - - - inline TTestItemVector & TestMessage::getItems() - { - return items; - } - - - inline const TTestItemVector & TestMessage::getItems() const - { - return items; - } - -} - - -// type traits -namespace proto -{ -} - diff --git a/prog/1stPartyLibs/protolib/serialization/tests/jsonUserTypeTests.cpp b/prog/1stPartyLibs/protolib/serialization/tests/jsonUserTypeTests.cpp deleted file mode 100644 index 3714bd64f..000000000 --- a/prog/1stPartyLibs/protolib/serialization/tests/jsonUserTypeTests.cpp +++ /dev/null @@ -1,289 +0,0 @@ -#include -#include - -#include -#include - -#include -#include - - -BOOST_AUTO_TEST_SUITE(proto_json_user_type) - - -const int MAGIC_TAG = 8942; - - -void check_input_stream(proto::io::InputCodeStream & stream, - const std::initializer_list & numbers) -{ - proto::io::StreamTag tag; - BOOST_CHECK(stream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.number, MAGIC_TAG); - - const_cast(stream.getBinaryStream()).seekBegin(0); - for (int n : numbers) - { - BOOST_CHECK(stream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.number, n); - BOOST_CHECK(stream.skip(tag)); - } - - BOOST_CHECK(stream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.number, MAGIC_TAG); -} - - -BOOST_AUTO_TEST_CASE(test_null) -{ - proto::io::OutputCodeStream ostream; - Json::Value v = Json::nullValue; - BOOST_CHECK_EQUAL(v.type(), Json::nullValue); - proto::serialize(ostream, 1, v); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().getBuffer().size(), 1); - ostream.writeTag(proto::io::StreamTag::EMPTY, MAGIC_TAG); - - v = Json::objectValue; - BOOST_CHECK_EQUAL(v.type(), Json::objectValue); - - proto::io::InputCodeStream 
istream(ostream.getBinaryStream().getBuffer()); - proto::io::StreamTag tag; - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.number, 1); - BOOST_CHECK(proto::serialize(istream, tag, v)); - BOOST_CHECK_EQUAL(v.type(), Json::nullValue); - - check_input_stream(istream, {1}); -} - - -BOOST_AUTO_TEST_CASE(test_bare_value) -{ - proto::io::OutputCodeStream ostream; - const int TESTINT = -123123; - const std::string TESTSTRING = "test string"; - Json::Value v = TESTINT; - BOOST_CHECK_EQUAL(v.type(), Json::intValue); - proto::serialize(ostream, 1, v); - v = TESTSTRING; - BOOST_CHECK_EQUAL(v.type(), Json::stringValue); - proto::serialize(ostream, 2, v); - ostream.writeTag(proto::io::StreamTag::EMPTY, MAGIC_TAG); - - v = Json::objectValue; - BOOST_CHECK_EQUAL(v.type(), Json::objectValue); - - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - proto::io::StreamTag tag; - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.number, 1); - BOOST_CHECK(proto::serialize(istream, tag, v)); - BOOST_CHECK_EQUAL(v.type(), Json::intValue); - BOOST_CHECK_EQUAL(v.asInt(), TESTINT); - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.number, 2); - BOOST_CHECK(proto::serialize(istream, tag, v)); - BOOST_CHECK_EQUAL(v.type(), Json::stringValue); - BOOST_CHECK_EQUAL(v.asString(), TESTSTRING); - - check_input_stream(istream, {1, 2}); -} - - -BOOST_AUTO_TEST_CASE(test_empty_block) -{ - proto::io::OutputCodeStream ostream; - Json::Value v = Json::objectValue; - BOOST_CHECK_EQUAL(v.type(), Json::objectValue); - proto::serialize(ostream, 1, v); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().getBuffer().size(), 1); - ostream.writeTag(proto::io::StreamTag::EMPTY, MAGIC_TAG); - - v = Json::nullValue; - BOOST_CHECK_EQUAL(v.type(), Json::nullValue); - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - proto::io::StreamTag tag; - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.number, 1); - 
BOOST_CHECK(proto::serialize(istream, tag, v)); - - BOOST_CHECK_EQUAL(v.type(), Json::objectValue); - BOOST_CHECK_EQUAL(v.size(), 0); - - check_input_stream(istream, {1}); -} - - -BOOST_AUTO_TEST_CASE(test_empty_array) -{ - proto::io::OutputCodeStream ostream; - Json::Value v = Json::arrayValue; - BOOST_CHECK_EQUAL(v.type(), Json::arrayValue); - proto::serialize(ostream, 1, v); - ostream.writeTag(proto::io::StreamTag::EMPTY, MAGIC_TAG); - - v = Json::nullValue; - BOOST_CHECK_EQUAL(v.type(), Json::nullValue); - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - proto::io::StreamTag tag; - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.number, 1); - BOOST_CHECK(proto::serialize(istream, tag, v)); - - BOOST_CHECK_EQUAL(v.type(), Json::arrayValue); - BOOST_CHECK_EQUAL(v.size(), 0); - - check_input_stream(istream, {1}); -} - - -BOOST_AUTO_TEST_CASE(test_param_types) -{ - const double TESTDOUBLE = 123.4567890123; - Json::Value v; - v["int"] = -123123; - v["int64"] = (Json::Int64)0x123456789ABCDEF0; - v["uint"] = (unsigned)456456; - v["double"] = TESTDOUBLE; - v["bool"] = true; - v["str"] = "test"; - v["null"] = Json::nullValue; - v["array"] = Json::arrayValue; - v["obj"] = Json::objectValue; - - proto::io::OutputCodeStream ostream; - proto::serialize(ostream, 1, v); - ostream.writeTag(proto::io::StreamTag::EMPTY, MAGIC_TAG); - - Json::Value v2; - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - proto::io::StreamTag tag; - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.number, 1); - BOOST_CHECK(proto::serialize(istream, tag, v2)); - - BOOST_CHECK_EQUAL(v2["int"].type(), Json::intValue); - BOOST_CHECK_EQUAL(v2["int"].asInt(), -123123); - - BOOST_CHECK_EQUAL(v2["int64"].type(), Json::intValue); - BOOST_CHECK_EQUAL(v2["int64"].asInt64(), 0x123456789ABCDEF0); - - BOOST_CHECK_EQUAL(v2["uint"].type(), Json::uintValue); - BOOST_CHECK_EQUAL(v2["uint"].asUInt(), 456456); - - 
BOOST_CHECK_EQUAL(v2["double"].type(), Json::realValue); - BOOST_CHECK_EQUAL(v2["double"].asDouble(), TESTDOUBLE); // should be binary identical - - BOOST_CHECK_EQUAL(v2["bool"].type(), Json::booleanValue); - BOOST_CHECK_EQUAL(v2["bool"].asBool(), true); - - BOOST_CHECK_EQUAL(v2["null"].type(), Json::nullValue); - BOOST_CHECK_EQUAL(v2["array"].type(), Json::arrayValue); - BOOST_CHECK_EQUAL(v2["obj"].type(), Json::objectValue); - - check_input_stream(istream, {1}); -} - - -BOOST_AUTO_TEST_CASE(test_complex) -{ - // taken from https://www.sitepoint.com/colors-json-example/ - const char * const srcJson = R"JSON( -{ - "colors": [ - { - "color": "black", - "category": "hue", - "type": "primary", - "code": - { - "rgba": [255,255,255,1], - "hex": "#000" - } - }, - { - "color": "white", - "category": "value", - "code": - { - "rgba": [0,0,0,1], - "hex": "#FFF" - } - }, - { - "color": "red", - "category": "hue", - "type": "primary", - "code": - { - "rgba": [255,0,0,1], - "hex": "#FF0" - } - }, - { - "color": "blue", - "category": "hue", - "type": "primary", - "code": - { - "rgba": [0,0,255,1], - "hex": "#00F" - } - }, - { - "color": "yellow", - "category": "hue", - "type": "primary", - "code": - { - "rgba": [255,255,0,1], - "hex": "#FF0" - } - }, - { - "color": "green", - "category": "hue", - "type": "secondary", - "code": - { - "rgba": [0,255,0,1], - "hex": "#0F0" - } - } - ], - "nestedArraysOfDouble": [ - [0.1, 0.2, -0.3, [4.5, 6.7, 10000.0]], - { - "yetAnotherArray": [55.8], - "andAnotherDouble": 123.45 - } - ] -} - )JSON"; - - Json::Value v1; - Json::Reader().parse(srcJson, v1); - // make sure it reads properly - BOOST_CHECK_EQUAL(v1.type(), Json::objectValue); - BOOST_CHECK_EQUAL(v1["colors"].type(), Json::arrayValue); - - proto::io::OutputCodeStream ostream; - proto::serialize(ostream, 1, v1); - ostream.writeTag(proto::io::StreamTag::EMPTY, MAGIC_TAG); - - Json::Value v2; - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - 
proto::io::StreamTag tag; - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.number, 1); - BOOST_CHECK(proto::serialize(istream, tag, v2)); - - std::string str1 = Json::FastWriter().write(v1); - std::string str2 = Json::FastWriter().write(v2); - BOOST_CHECK_EQUAL(str1, str2); - - check_input_stream(istream, {1}); -} - - -BOOST_AUTO_TEST_SUITE_END() diff --git a/prog/1stPartyLibs/protolib/serialization/tests/protoIoMapSerializationTests.cpp b/prog/1stPartyLibs/protolib/serialization/tests/protoIoMapSerializationTests.cpp deleted file mode 100644 index 550a963dc..000000000 --- a/prog/1stPartyLibs/protolib/serialization/tests/protoIoMapSerializationTests.cpp +++ /dev/null @@ -1,114 +0,0 @@ -#include -#include - -#include - -#include "generated/testMapSerialization.pb.h" -#include "generated/testMapSerialization.pb.cpp" - -#include - -BOOST_AUTO_TEST_SUITE(proto_io_map) - -const int ITEM_COUNT = 20; - -static TestMessage prepareMessage() -{ - TestMessage message; - for (int i = 0; i < ITEM_COUNT; ++i) - { - std::string strNum = boost::lexical_cast(i); - TestItem item; - item.setName("name" + strNum); - message.getItems().insert(proto::make_pair("item" + strNum, item)); - } - - return message; -} - - -BOOST_AUTO_TEST_CASE(test_empty) -{ - TestMessage message; - message.touchRecursive(2); - proto::io::OutputCodeStream ostream(1); - message.saveToDB(ostream, proto::BinarySaveOptions(266)); - - message = prepareMessage(); - - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - proto::io::StreamTag tag; - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.number, 266); - BOOST_CHECK(message.loadFromDB(istream, tag)); - BOOST_CHECK_EQUAL(message.getItems().size(), 0); -} - - -BOOST_AUTO_TEST_CASE(test_save) -{ - TestMessage message = prepareMessage(); - TStringToTestItemUnorderedMap & map = message.getItems(); - map.touch(3); - map["item1"].touch(1); - map["item2"].touch(2); - map["item5"].touch(3); - map["item7"].touch(100); - - 
proto::io::OutputCodeStream ostream(1); - message.saveToDB(ostream, proto::BinarySaveOptions(266)); - - message.clear(); - map["testf1"].setName("f1"); - map["item2"].setName("old2"); - - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - proto::io::StreamTag tag; - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.number, 266); - BOOST_CHECK(message.loadFromDB(istream, tag)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), ostream.getBinaryStream().tell()); - BOOST_CHECK_EQUAL(map.size(), 4); - BOOST_CHECK_EQUAL(map["testf1"].getName(), "f1"); - - BOOST_CHECK_EQUAL(map["item2"].getName(), "name2"); - BOOST_CHECK_EQUAL(map["item5"].getName(), "name5"); - BOOST_CHECK_EQUAL(map["item7"].getName(), "name7"); -} - - -BOOST_AUTO_TEST_CASE(test_delete) -{ - TestMessage message = prepareMessage(); - TStringToTestItemUnorderedMap & map = message.getItems(); - map.touch(1); - map.touchStructure(2); - map["item2"].setName("testSaveAndDelete"); - map["item2"].touch(2); - map.erase("item15"); - map.erase("item5"); - - proto::io::OutputCodeStream ostream(1); - message.saveToDB(ostream, proto::BinarySaveOptions(266)); - - message = prepareMessage(); - - map["test_delete"].setName("ddd"); - map["item4"].setName("must_be_unchanged"); - - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - proto::io::StreamTag tag; - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.number, 266); - BOOST_CHECK(message.loadFromDB(istream, tag)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), ostream.getBinaryStream().tell()); - BOOST_CHECK_EQUAL(map.size(), 18); - BOOST_CHECK(map.find("item5") == map.end()); - BOOST_CHECK(map.find("item15") == map.end()); - BOOST_CHECK(map.find("test_delete") == map.end()); - BOOST_CHECK_EQUAL(map["item2"].getName(), "testSaveAndDelete"); - BOOST_CHECK_EQUAL(map["item4"].getName(), "must_be_unchanged"); -} - - -BOOST_AUTO_TEST_SUITE_END() diff --git 
a/prog/1stPartyLibs/protolib/serialization/tests/protoIoUnitTests.cpp b/prog/1stPartyLibs/protolib/serialization/tests/protoIoUnitTests.cpp deleted file mode 100644 index a2db766dd..000000000 --- a/prog/1stPartyLibs/protolib/serialization/tests/protoIoUnitTests.cpp +++ /dev/null @@ -1,539 +0,0 @@ -#include -#include - -#include - -#include - -#include -#include - -#include - - -BOOST_AUTO_TEST_SUITE(proto_io) - -BOOST_AUTO_TEST_CASE(test_buffer) -{ - char * nullPtr = NULL; - - proto::io::Buffer buffer; - BOOST_CHECK_EQUAL(buffer.begin(), nullPtr); - BOOST_CHECK_EQUAL(buffer.end(), nullPtr); - BOOST_CHECK_EQUAL(buffer.size(), 0); - - buffer.resize(1000); - - BOOST_CHECK_NE(buffer.begin(), nullPtr); - BOOST_CHECK_NE(buffer.end(), nullPtr); - BOOST_CHECK_EQUAL(buffer.end(), buffer.begin() + 1000); - BOOST_CHECK_EQUAL(buffer.size(), 1000); - - const char * begin = buffer.begin(); - - proto::io::Buffer moveBuffer(buffer.move()); - - BOOST_CHECK_EQUAL(buffer.begin(), nullPtr); - BOOST_CHECK_EQUAL(buffer.end(), nullPtr); - BOOST_CHECK_EQUAL(buffer.size(), 0); - - BOOST_CHECK_EQUAL(moveBuffer.begin(), begin); - BOOST_CHECK_EQUAL(moveBuffer.end(), begin + 1000); - BOOST_CHECK_EQUAL(moveBuffer.size(), 1000); -} - -BOOST_AUTO_TEST_CASE(test_var_int) -{ - proto::io::OutputCodeStream ostream; - - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 0); - long long int intValue; - ostream.writeVarInt(6); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 1); - ostream.writeVarInt(444); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 3); - ostream.writeVarInt(-5); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 13); - ostream.writeVarInt(32767); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 16); - ostream.writeVarInt(LLONG_MAX); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 25); - ostream.writeVarInt(LLONG_MIN); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 35); - ostream.writeVarInt(INT_MAX); - 
BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 40); - - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 0); - BOOST_CHECK(istream.readVarInt(intValue)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 1); - BOOST_CHECK_EQUAL(intValue, 6); - - BOOST_CHECK(istream.readVarInt(intValue)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 3); - BOOST_CHECK_EQUAL(intValue, 444); - - BOOST_CHECK(istream.readVarInt(intValue)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 13); - BOOST_CHECK_EQUAL(intValue, -5); - - BOOST_CHECK(istream.readVarInt(intValue)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 16); - BOOST_CHECK_EQUAL(intValue, 32767); - - BOOST_CHECK(istream.readVarInt(intValue)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 25); - BOOST_CHECK_EQUAL(intValue, LLONG_MAX); - - BOOST_CHECK(istream.readVarInt(intValue)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 35); - BOOST_CHECK_EQUAL(intValue, LLONG_MIN); - - BOOST_CHECK(istream.readVarInt(intValue)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 40); - BOOST_CHECK_EQUAL(intValue, INT_MAX); - - //TODO many tests -} - - -BOOST_AUTO_TEST_CASE(test_tag) -{ - proto::io::OutputCodeStream ostream; - - ostream.writeTag(proto::io::StreamTag::BLOCK_BEGIN, 1); - ostream.writeTag(proto::io::StreamTag::VAR_INT_MINUS, 2000000); - ostream.writeTag(proto::io::StreamTag::VAR_INT, proto::io::StreamTag::SHORT_NUMBER_LIMIT - 1); - ostream.writeTag(proto::io::StreamTag::SPECIAL, proto::io::StreamTag::SHORT_NUMBER_LIMIT); - ostream.writeTag(proto::io::StreamTag::EMPTY, proto::io::StreamTag::SHORT_NUMBER_LIMIT + 1); - - - proto::io::StreamTag tag; - int number = -1; - - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::BLOCK_BEGIN); - BOOST_CHECK_EQUAL(tag.number, 1); - - 
BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::VAR_INT_MINUS); - BOOST_CHECK_EQUAL(tag.number, 2000000); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::VAR_INT); - BOOST_CHECK_EQUAL(tag.number, proto::io::StreamTag::SHORT_NUMBER_LIMIT - 1); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::SPECIAL); - BOOST_CHECK_EQUAL(tag.number, proto::io::StreamTag::SHORT_NUMBER_LIMIT); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::EMPTY); - BOOST_CHECK_EQUAL(tag.number, proto::io::StreamTag::SHORT_NUMBER_LIMIT + 1); -} - - -void testSkip(proto::io::OutputCodeStream & ostream) -{ - ostream.blockEnd(); - - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - - proto::io::StreamTag tag; - tag.type = proto::io::StreamTag::BLOCK_BEGIN; - tag.number = 1; - BOOST_CHECK(istream.skip(tag)); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), istream.getBinaryStream().tell()); -} - - -BOOST_AUTO_TEST_CASE(test_var_int_with_tag) -{ - proto::io::OutputCodeStream ostream; - - const int hiLimitOfVar3Byte =(1 << 21); - - long long int intValue; - - ostream.writeVarInt(1, 0); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 1); - ostream.writeVarInt(1, 1); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 3); - ostream.writeVarInt(1, -1); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 5); - ostream.writeVarInt(1, INT_MAX); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 10); - ostream.writeVarInt(1, INT_MIN); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 15); - ostream.writeVarInt(1, hiLimitOfVar3Byte); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 20); - ostream.writeVarInt(1, -hiLimitOfVar3Byte); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 25); - ostream.writeVarInt(1, hiLimitOfVar3Byte - 1); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 29); - 
- proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - - proto::io::StreamTag tag; - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::EMPTY); - BOOST_CHECK(istream.readVarInt(tag, intValue)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 1); - BOOST_CHECK_EQUAL(intValue, 0); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::VAR_INT); - BOOST_CHECK(istream.readVarInt(tag, intValue)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 3); - BOOST_CHECK_EQUAL(intValue, 1); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::VAR_INT_MINUS); - BOOST_CHECK(istream.readVarInt(tag, intValue)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 5); - BOOST_CHECK_EQUAL(intValue, -1); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::RAW32); - BOOST_CHECK(istream.readVarInt(tag, intValue)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 10); - BOOST_CHECK_EQUAL(intValue, INT_MAX); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::RAW32); - BOOST_CHECK(istream.readVarInt(tag, intValue)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 15); - BOOST_CHECK_EQUAL(intValue, INT_MIN); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::RAW32); - BOOST_CHECK(istream.readVarInt(tag, intValue)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 20); - BOOST_CHECK_EQUAL(intValue, hiLimitOfVar3Byte); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::RAW32); - BOOST_CHECK(istream.readVarInt(tag, intValue)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 25); - BOOST_CHECK_EQUAL(intValue, -hiLimitOfVar3Byte); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::VAR_INT); - BOOST_CHECK(istream.readVarInt(tag, 
intValue)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 29); - BOOST_CHECK_EQUAL(intValue, hiLimitOfVar3Byte - 1); -} - - -BOOST_AUTO_TEST_CASE(test_bool) -{ - proto::io::OutputCodeStream ostream; - - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 0); - ostream.writeBool(7, true); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 1); - ostream.writeBool(5, false); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 2); - - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - - proto::io::StreamTag tag; - bool value = false; - - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 0); - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 1); - BOOST_CHECK_EQUAL(tag.number, 7); - BOOST_CHECK(istream.readBool(tag, value)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 1); - BOOST_CHECK_EQUAL(value, true); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 2); - BOOST_CHECK_EQUAL(tag.number, 5); - BOOST_CHECK(istream.readBool(tag, value)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 2); - BOOST_CHECK_EQUAL(value, false); - - testSkip(ostream); -} - - -BOOST_AUTO_TEST_CASE(test_float) -{ - proto::io::OutputCodeStream ostream; - - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 0); - ostream.writeFloat(45, 1.7f); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 6); - ostream.writeDouble(22, 5.7); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 16); - - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - - proto::io::StreamTag tag; - - float floatValue = 0; - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 0); - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 2); - BOOST_CHECK_EQUAL(tag.number, 45); - BOOST_CHECK(istream.readFloat(tag, floatValue)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 6); - BOOST_CHECK_EQUAL(floatValue, 1.7f); - - 
double doubleValue = 0; - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 7); - BOOST_CHECK_EQUAL(tag.number, 22); - BOOST_CHECK(istream.readDouble(tag, doubleValue)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 16); - BOOST_CHECK_EQUAL(doubleValue, 5.7); - - testSkip(ostream); -} - - -BOOST_AUTO_TEST_CASE(test_string) -{ - proto::io::OutputCodeStream ostream; - - std::string str1("simple string"); - std::string str2("hello world"); - - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 0); - ostream.writeString(15, "", 0); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 1); - ostream.writeString(26347, str1.c_str(), str1.size()); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 19 ); - ostream.writeString(43958729, str2.c_str(), str2.size()); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 36 ); - ostream.writeString(2, str1.c_str(), str1.size()); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 38 ); - - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - - proto::io::StreamTag tag; - std::string value;; - - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 0); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::EMPTY); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 1); - BOOST_CHECK_EQUAL(tag.number, 15); - BOOST_CHECK(readString(istream, tag, value)); - BOOST_CHECK_EQUAL("", value); - - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 1); - - - BOOST_CHECK(istream.readTag(tag)); - - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 5); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::VAR_LENGTH); - BOOST_CHECK_EQUAL(tag.number, 26347); - BOOST_CHECK(readString(istream, tag, value)); - BOOST_CHECK_EQUAL(str1, value); - - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 19); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 24); - BOOST_CHECK_EQUAL(tag.type, 
proto::io::StreamTag::VAR_LENGTH); - BOOST_CHECK_EQUAL(tag.number, 43958729); - BOOST_CHECK(readString(istream, tag, value)); - BOOST_CHECK_EQUAL(str2, value); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 36); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::VAR_INT); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 37); - BOOST_CHECK_EQUAL(tag.number, 2); - BOOST_CHECK(readString(istream, tag, value)); - BOOST_CHECK_EQUAL(str1, value); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 38); - - testSkip(ostream); -} - - -enum TestEnum -{ - TEZERO = 0, - TEV1 = 55, - TEV2 = 5916372, - TEMINUS = -2384, -}; - - -bool validate_enum_value(TestEnum value) -{ - switch (value) - { - case TEZERO: - case TEV1: - case TEV2: - case TEMINUS: - return true; - } - - return false; -} - - -BOOST_AUTO_TEST_CASE(test_enum) -{ - proto::io::OutputCodeStream ostream; - - ostream.writeEnum(1, TEV1); - ostream.writeEnum(2, TEZERO); - ostream.writeEnum(3, TEV2); - ostream.writeEnum(4, TEMINUS); - - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - - proto::io::StreamTag tag; - int number = -1; - TestEnum value; - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::VAR_INT); - BOOST_CHECK_EQUAL(tag.number, 1); - BOOST_CHECK(istream.readEnum(tag, value)); - BOOST_CHECK_EQUAL(value, TEV1); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::EMPTY); - BOOST_CHECK_EQUAL(tag.number, 2); - BOOST_CHECK(istream.readEnum(tag, value)); - BOOST_CHECK_EQUAL(value, TEZERO); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::RAW32); - BOOST_CHECK_EQUAL(tag.number, 3); - BOOST_CHECK(istream.readEnum(tag, value)); - BOOST_CHECK_EQUAL(value, TEV2); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::VAR_INT_MINUS); - BOOST_CHECK_EQUAL(tag.number, 4); - 
BOOST_CHECK(istream.readEnum(tag, value)); - BOOST_CHECK_EQUAL(value, TEMINUS); - - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), ostream.getBinaryStream().tell()); - - testSkip(ostream); -} - - -BOOST_AUTO_TEST_CASE(test_block) -{ - proto::io::OutputCodeStream ostream; - - ostream.blockBegin(150); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 2); - ostream.writeVarInt(5, 13); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 4); - ostream.blockEnd(); - BOOST_CHECK_EQUAL(ostream.getBinaryStream().tell(), 5); - - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - - proto::io::StreamTag tag; - int number = -1; - TestEnum value; - proto::uint32 length = 0; - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 2); - BOOST_CHECK_EQUAL(tag.number, 150); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::BLOCK_BEGIN); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 3); - BOOST_CHECK_EQUAL(tag.number, 5); - BOOST_CHECK(istream.readVarInt(tag, length)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 4); - BOOST_CHECK_EQUAL(length, 13); - - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(istream.getBinaryStream().tell(), 5); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::SPECIAL); - BOOST_CHECK_EQUAL(tag.number, proto::io::StreamTag::SPECIAL_BLOCK_END); - - testSkip(ostream); -} - - -BOOST_AUTO_TEST_CASE(test_strings_in_big_block) -{ - std::vector testStrings = {"dsafdf", "dfu8723yrhu", "hello world", "beer"}; - proto::io::OutputCodeStream ostream; - - - int counter = 1; - ostream.blockBegin(150); - srand(666); - while (ostream.getBinaryStream().tell() < 4 * 1024 * 1024) - { - const std::string & selectString = testStrings.at(rand() % testStrings.size()); - ostream.writeString(counter++, selectString.c_str(), selectString.size()); - } - - testStrings.emplace_back("732he92hyr731hxd"); - 
testStrings.emplace_back("j2-3498fy7whgfiuasfd"); - testStrings.emplace_back("j2-dsdf"); - testStrings.emplace_back("j2-dsefq4rdf"); - testStrings.emplace_back("j2-sadfsadfqw4trq432t"); - - for (int i = 0; i < 1000; ++i) - { - const std::string & selectString = testStrings.at(rand() % testStrings.size()); - ostream.writeString(counter++, selectString.c_str(), selectString.size()); - } - - ostream.blockEnd(); - - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - - counter = 1; - proto::io::StreamTag tag; - int number = -1; - - std::unordered_set testStringsSet(testStrings.begin(), testStrings.end()); - std::string result; - BOOST_REQUIRE(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.number, 150); - BOOST_CHECK_EQUAL(tag.type, proto::io::StreamTag::BLOCK_BEGIN); - - BOOST_CHECK(istream.readTag(tag)); - while (!tag.isBlockEnded()) - { - BOOST_CHECK_EQUAL(counter, tag.number); - counter++; - if (!proto::io::readString(istream, tag, result)) - { - BOOST_TEST_MESSAGE(istream.errorMessage()); - BOOST_REQUIRE(false); - } - - if (!testStringsSet.count(result)) - { - BOOST_TEST_MESSAGE(result); - BOOST_REQUIRE(false); - } - - BOOST_REQUIRE(istream.readTag(tag)); - } - - testSkip(ostream); -} - -BOOST_AUTO_TEST_SUITE_END() - - diff --git a/prog/1stPartyLibs/protolib/serialization/tests/protoIoVectorSerializationTests.cpp b/prog/1stPartyLibs/protolib/serialization/tests/protoIoVectorSerializationTests.cpp deleted file mode 100644 index ec5bad43c..000000000 --- a/prog/1stPartyLibs/protolib/serialization/tests/protoIoVectorSerializationTests.cpp +++ /dev/null @@ -1,205 +0,0 @@ -#include -#include - -#include - -#include -#include - -#include - -#include "generated/testVectorSerialization.pb.cpp" - -namespace proto_vector_test -{ - bool operator!=( - const proto_vector_test::TestItem & o1, - const proto_vector_test::TestItem & o2) - { - return o1.getName() != o2.getName(); - } - - std::ostream & operator<<(std::ostream & stream, const 
proto_vector_test::TestItem & obj) - { - return stream << obj.getName(); - } -} - -using namespace proto_vector_test; - - -BOOST_AUTO_TEST_SUITE(proto_io_vector) - -typedef proto::Vector::TType TTestVector; -typedef proto::Vector::TType TStringVector; -BOOST_AUTO_TEST_CASE(int_empty) -{ - TTestVector srcVector; - - proto::io::OutputCodeStream ostream(1); - - proto::io::writeVector(ostream, 666, srcVector); - - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - proto::io::StreamTag tag; - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.number, 666); - - TTestVector dstVector; - dstVector.push_back(444); - BOOST_CHECK(proto::io::readVector(istream, tag, dstVector)); - BOOST_CHECK(dstVector.empty()); -} - - -BOOST_AUTO_TEST_CASE(int_values) -{ - TTestVector srcVector; - srcVector.push_back(5); - srcVector.push_back(77); - srcVector.push_back(984765); - srcVector.push_back(-2); - - proto::io::OutputCodeStream ostream(1); - - proto::io::writeVector(ostream, 666, srcVector); - - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - proto::io::StreamTag tag; - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.number, 666); - - TTestVector dstVector; - dstVector.push_back(444); - BOOST_CHECK(proto::io::readVector(istream, tag, dstVector)); - BOOST_CHECK_EQUAL_COLLECTIONS( - dstVector.begin(), dstVector.end(), - srcVector.begin(), srcVector.end()); -} - -BOOST_AUTO_TEST_CASE(blk_int_empty) -{ - TTestVector srcVector; - - DataBlock blk; - proto::BlkSerializator ser(blk, -2); - proto::io::writeVector(ser, "test", srcVector); - - TTestVector dstVector; - dstVector.push_back(444); - proto::io::readVector(ser, "test", dstVector); - BOOST_CHECK(dstVector.empty()); -} - -BOOST_AUTO_TEST_CASE(blk_int_values) -{ - TTestVector srcVector; - srcVector.push_back(5); - srcVector.push_back(77); - srcVector.push_back(984765); - srcVector.push_back(-2); - - DataBlock blk; - proto::BlkSerializator ser(blk, -2); - 
proto::io::writeVector(ser, "test", srcVector); - - TTestVector dstVector; - dstVector.push_back(444); - proto::io::readVector(ser, "test", dstVector); - BOOST_CHECK_EQUAL_COLLECTIONS( - dstVector.begin(), dstVector.end(), - srcVector.begin(), srcVector.end()); -} - - -BOOST_AUTO_TEST_CASE(blk_string_values) -{ - TStringVector srcVector; - srcVector.push_back("5"); - srcVector.push_back("77"); - srcVector.push_back(""); - srcVector.push_back("984765"); - srcVector.push_back("-2"); - - DataBlock blk; - proto::BlkSerializator ser(blk, -2); - proto::io::writeVector(ser, "test", srcVector); - - TStringVector dstVector; - dstVector.push_back("444"); - proto::io::readVector(ser, "test", dstVector); - BOOST_CHECK_EQUAL_COLLECTIONS( - dstVector.begin(), dstVector.end(), - srcVector.begin(), srcVector.end()); -} - -const int ITEM_COUNT = 20; - -static TestMessage prepareMessage() -{ - TestMessage message; - for (int i = 0; i < ITEM_COUNT; ++i) - { - std::string strNum = boost::lexical_cast(i); - TestItem item; - item.setName("name" + strNum); - message.getItems().push_back(item); - } - - return message; -} - - -BOOST_AUTO_TEST_CASE(test_vector_versioned_all) -{ - TestMessage message = prepareMessage(); - message.touchRecursive(2); - - proto::io::OutputCodeStream ostream(1); - message.saveToDB(ostream, proto::BinarySaveOptions(266)); - - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - proto::io::StreamTag tag; - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.number, 266); - - TestMessage dstMessage; - BOOST_CHECK(dstMessage.loadFromDB(istream, tag)); - - BOOST_CHECK_EQUAL_COLLECTIONS( - message.getItems().begin(), message.getItems().end(), - dstMessage.getItems().begin(), dstMessage.getItems().end()); -} - - -BOOST_AUTO_TEST_CASE(test_vector_versioned_delete) -{ - TestMessage message = prepareMessage(); - message.touchRecursive(2); - TTestItemVector & items = message.getItems(); - message.getItems().erase(items.begin() + 5); - for 
(TTestItemVector::iterator it = items.begin() + 5; it != items.end(); ++it) - { - (*it).touch(3); - } - - message.getItems().touchStructure(3); - - proto::io::OutputCodeStream ostream(2); - message.saveToDB(ostream, proto::BinarySaveOptions(266)); - BOOST_CHECK_LT(ostream.getBinaryStream().tell(), 150); - - proto::io::InputCodeStream istream(ostream.getBinaryStream().getBuffer()); - proto::io::StreamTag tag; - BOOST_CHECK(istream.readTag(tag)); - BOOST_CHECK_EQUAL(tag.number, 266); - - TestMessage dstMessage = prepareMessage(); - BOOST_CHECK(dstMessage.loadFromDB(istream, tag)); - - BOOST_CHECK_EQUAL_COLLECTIONS( - message.getItems().begin(), message.getItems().end(), - dstMessage.getItems().begin(), dstMessage.getItems().end()); -} - -BOOST_AUTO_TEST_SUITE_END() diff --git a/prog/1stPartyLibs/protolib/serialization/tests/serializationPresets.proto b/prog/1stPartyLibs/protolib/serialization/tests/serializationPresets.proto deleted file mode 100644 index dea3db3e4..000000000 --- a/prog/1stPartyLibs/protolib/serialization/tests/serializationPresets.proto +++ /dev/null @@ -1,11 +0,0 @@ -import public "options.proto"; - -package serialization_presets; - -message db -{ - option (ser_types) = "binary"; - option (ser_save_fn) = "saveToDB"; - option (ser_load_fn) = "loadFromDB"; - option (ser_gen_flags) = "char"; -} diff --git a/prog/1stPartyLibs/protolib/serialization/tests/testMapSerialization.proto b/prog/1stPartyLibs/protolib/serialization/tests/testMapSerialization.proto deleted file mode 100644 index 75ba52cb6..000000000 --- a/prog/1stPartyLibs/protolib/serialization/tests/testMapSerialization.proto +++ /dev/null @@ -1,15 +0,0 @@ -import public "serializationPresets.proto"; - -message TestItem -{ - option (serializable) = "versioned; binary; db"; - - optional string name = 1; -} - -message TestMessage -{ - option (serializable) = "versioned; binary; db"; - - repeated TestItem items = 1 [(hash_map_key) = "string"]; -} \ No newline at end of file diff --git 
a/prog/1stPartyLibs/protolib/serialization/tests/testVectorSerialization.proto b/prog/1stPartyLibs/protolib/serialization/tests/testVectorSerialization.proto deleted file mode 100644 index 0b1aafa40..000000000 --- a/prog/1stPartyLibs/protolib/serialization/tests/testVectorSerialization.proto +++ /dev/null @@ -1,17 +0,0 @@ -import public "serializationPresets.proto"; - -package proto_vector_test; - -message TestItem -{ - option (serializable) = "versioned; binary; db"; - - optional string name = 1; -} - -message TestMessage -{ - option (serializable) = "versioned; binary; db"; - - repeated TestItem items = 1; -} \ No newline at end of file diff --git a/prog/1stPartyLibs/quirrel/quirrel/doc/source/index.rst b/prog/1stPartyLibs/quirrel/quirrel/doc/source/index.rst index b1fb52e70..cebee70f6 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/doc/source/index.rst +++ b/prog/1stPartyLibs/quirrel/quirrel/doc/source/index.rst @@ -17,6 +17,7 @@ Quirrel documentation modules/bindings.rst repl/index.rst rfcs/STATUS.md + rfcs/README.md Indices and tables ================== diff --git a/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/builtin_functions.rst b/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/builtin_functions.rst index d96716b6b..3768fc33f 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/builtin_functions.rst +++ b/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/builtin_functions.rst @@ -18,27 +18,10 @@ Global Symbols creates and returns array of a specified size. If the optional parameter fill is specified its value will be used to fill the new array's slots. If the fill parameter is omitted, null is used instead. -.. sq:function:: seterrorhandler(func) - - -sets the runtime error handler - .. sq:function:: callee() - returns the currently running closure -.. 
sq:function:: setdebughook(hook_func) - -sets the debug hook - -hook_func should have follow signature - hook_func(hook_type:integer, source_file:string, line_num:integer, func_name:string) - -hook_type can be 'l' - line, 'r' - return, 'c' - call or 'x' for VM shutdown - -call of debughook for each line performed only when debuginfo is enabled - .. sq:function:: getroottable() returns the root table of the VM. @@ -83,61 +66,26 @@ or providing compile-time bindings:: let compiledscript=compilestring("foo()", "bindings_test", api) compiledscript() -.. sq:function:: collectgarbage() - - Runs the garbage collector and returns the number of reference cycles found (and deleted). This function only works on garbage collector builds. - -.. sq:function:: resurrectunreachable() - -Runs the garbage collector and returns an array containing all unreachable object found. If no unreachable object is found, null is returned instead. This function is meant to help debugging reference cycles. This function only works on garbage collector builds. - .. sq:function:: type(obj) return the 'raw' type of an object without invoking the metamethod '_typeof'. -.. sq:function:: getstackinfos(level) - -returns the stack informations of a given call stack level. returns a table formatted as follow: :: - - { - func="DoStuff", //function name - - src="test.nut", //source file - - line=10, //line number - - locals = { //a table containing the local variables - - a=10, - - testy="I'm a string" - } - } - -level = 0 is getstackinfos() itself! level = 1 is the current function, level = 2 is the caller of the current function, and so on. If the stack level doesn't exist the function returns null. - .. sq:function:: newthread(threadfunc) creates a new cooperative thread object(coroutine) and returns it -.. sq:function:: min(x, y, [z], [w], ...) - -returns minimal value of all arguments - -.. sq:function:: max(x, y, [z], [w], ...) +.. 
sq:function:: freeze(x) -returns maximal value of all arguments +returns immutable reference to given object. +Throws an error if argument is of POD type (to help prevent errors). -.. sq:function:: clamp(x, min_val, max_val) +.. sq:function:: call_type_method(object, , [...]) -returns value limited by provided min-max range +Calls built-in type method of an object, with arguments in variable arguments +for example: -creates a new cooperative thread object(coroutine) and returns it + `call_type_method({foo=1}, "findvalue", @(v) v==1) //1` -.. sq:function:: freeze(x) - -returns immutable reference to given object. -Throws an error if argument is of POD type (to help prevent errors). .. sq:function:: getobjflags(x) @@ -146,15 +94,6 @@ Given object handle, return its flags that may be: * 0 - no special flags * SQOBJ_FLAG_IMMUTABLE - bit set if the object handle is immutable -.. sq:function:: getbuildinfo(x) - -returns table containing information on VM build parameters. - - * **version** - string values describing the version of VM and compiler. - * **charsize** - size in bytes of the internal VM representation for characters(1 for ASCII builds 2 for UNICODE builds). - * **intsize** - size in bytes of the internal VM representation for integers(4 for 32bits builds 8 for 64bits builds). - * **floatsize** - size in bytes of the internal VM representation for floats(4 for single precision builds 8 for double precision builds). - .. _default_delegates: ----------------- @@ -391,12 +330,12 @@ Table .. sq:function:: table.len() -returns the number of slots contained in a table +Returns the number of slots contained in a table .. sq:function:: table.rawget(key) -tries to get a value from the slot 'key' without employing delegation +Tries to get a value from the slot 'key' without employing delegation .. sq:function:: table.rawset(key,val) @@ -411,12 +350,12 @@ Deletes the slot key without employing delegation and returns its value. If the .. 
sq:function:: table.rawin(key) -returns true if the slot 'key' exists. the function has the same effect as the operator 'in' but does not employ delegation. +Returns true if the slot 'key' exists. the function has the same effect as the operator 'in' but does not employ delegation. .. sq:function:: table.weakref() -returns a weak reference to the object. +Returns a weak reference to the object. .. sq:function:: table.tostring() @@ -426,7 +365,7 @@ Tries to invoke the _tostring metamethod. If that fails, it returns "(table : po .. sq:function:: table.clear() -removes all the slots from the table. Returns table itself. +Removes all the slots from the table. Returns table itself. .. sq:function:: table.filter(func(val, [key], [table_ref])) @@ -434,15 +373,19 @@ Creates a new table with all values that pass the test implemented by the provid .. sq:function:: table.keys() -returns an array containing all the keys of the table slots. +Returns an array containing all the keys of the table slots. .. sq:function:: table.values() -returns an array containing all the values of the table slots. +Returns an array containing all the values of the table slots. .. sq:function:: table.topairs() -returns an array containing arrays of pairs [key, value]. Useful when you need to sort data from table. +Returns an array containing arrays of pairs [key, value]. Useful when you need to sort data from table. + +.. sq:function:: table.clone() + +Returns a clone of table. .. sq:function:: table.map(func(slot_value, [slot_key], [table_ref])) @@ -501,6 +444,11 @@ Example: :: => foo == {fizz=1, bazz=2}; bar={fizz=1, buzz=2} +.. sq:function:: table.is_frozen() + +Return true if reference to the table is frozen with 'freeze' global function. + + ^^^^^^ Array ^^^^^^ @@ -650,6 +598,14 @@ Returns matched value (for which callback returned non-false value) or default v Copies content of source array into given array by replacing its contents. Returns target array itself. +.. 
sq:function:: array.is_frozen() + +Return true if reference to the array is frozen with 'freeze' global function. + +.. sq:function:: array.clone() + +Return clone of the array. + ^^^^^^^^ Function ^^^^^^^^ @@ -819,6 +775,10 @@ If instance has _call() metamethod, get info about it (see function.getfuncinfos Returns metamethod closure (e.g. foo.getmetamethod("_add")) or null if method is not implemented in class. +.. sq:function:: instance.is_frozen() + +Return true if reference to the instance is frozen with 'freeze' global function. + ^^^^^^^^^^^^^^ Generator diff --git a/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/classes.rst b/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/classes.rst index 5d5fef133..de1b4f8a6 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/classes.rst +++ b/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/classes.rst @@ -269,6 +269,14 @@ A method of a base class can be explicitly invoked by a method of a derived clas //prints "I'm the derived" inst.DoIt() +An alternative way to inheret class it to use Python-style syntax. It works the same way as described above. + + class SuperFoo(Foo) { + function DoSomething() { + println("I'm doing something") + } + } + ---------------------- Metamethods ---------------------- diff --git a/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/compiler_directives.rst b/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/compiler_directives.rst index 472aa83ae..4b3ce5588 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/compiler_directives.rst +++ b/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/compiler_directives.rst @@ -122,6 +122,40 @@ Enable plus string concatenation Allow using plus operator '+' to concatenate strings. 
+---------------------------------------------- +Clone operator +---------------------------------------------- + + :: + + #allow-clone-operator + +Allow using 'clone' operator (let a = clone {}) + + :: + + forbid-clone-operator + +Forbid using 'clone' operator use .$clone instead (let a = {}.$clone()) +'clone' is not a keyword in this case you call variables with it for example. + +---------------------------------------------- +Delete operator +---------------------------------------------- + + :: + + #allow-delete-operator + +Allow using 'delete' operator (let a = delete {foo = 2}["foo"] //2) + + :: + + forbid-delete-operator + +Forbid using 'delete' operator use .$rawdelete instead (let a = {foo=2}.$rawdelete("foo") //2) +'delete' is not a keyword in this case and you call variables with it for example. + ------------------ #strict ------------------ @@ -143,3 +177,4 @@ Enable all extra checks/restrictions Disable all extra checks/restrictions + diff --git a/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/expressions.rst b/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/expressions.rst index 496851844..3b8a5f1ad 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/expressions.rst +++ b/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/expressions.rst @@ -144,6 +144,28 @@ One would have to type ?. everywhere, writing it as Instead it is done by compiler - once a null-operator is met, it is also assumed for the subsequent ., [] and () operators in an expression. +Note: 'key' should not be separated from '?.' or '.' by space[s] or new line. + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.$ and ?.$ - Type methods access operator +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
index:: + pair: .$ and ?.$ Operators; Operators + +:: + + exp := value '.$' key + + +:: + + exp := value '?.$' key + + +If 'key' exists in value's type built-in methods (default delegates) returns method's closure, else returns null in case of '?.$' or throws an error if '.$' + +Note: 'key' should not be separated from '.$' and '?.$' by space[s] or new line. ^^^^^^^^^^^^^ Arithmetic @@ -444,6 +466,8 @@ After the new object is ready the "_cloned" meta method is called (see :ref:`Met When a class instance is cloned the constructor is not invoked(initializations must rely on ```_cloned``` instead +Note: Usage of this operator could be prohibited with ``#forbid-clone-operator``. + ----------------- Array contructor ----------------- diff --git a/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/statements.rst b/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/statements.rst index b473a136c..61e76ed5f 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/statements.rst +++ b/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/statements.rst @@ -181,6 +181,30 @@ Executes a statement as long as a condition is different than false.:: println("loops forever") } +.. index:: + pair: range-loop; statement + +:: + + stat:= 'for' '(' variable ':' [startValue ',' ] limitValue [ ',' step] ')' statement + stat:= 'for' '(' count ')' statement + stat:= 'for' '(' [startValue ',' ] limitValue ')' statement + +This is currently just syntax sugar over regular 'for' loop which are being desugared into the following general form::: + + for (variable = startValue; variable < limitValue; variable += step) + println($"indexed range loop: {variable}") + //or + for ($i = 0; $i < count; ++$i) + println("count range loop") + //or + for ($i = startValue; $i < limitValue; ++$i) + println("bounded range loop") + +Here if 'startValue' is not explicitly provided it is '0' by default. If 'step' is not provided it is '1' by default. 
+'step' can only be integer literal. + + ^^^^^^^^ foreach ^^^^^^^^ @@ -313,8 +337,8 @@ So if you see somewhere in function scope let foo = you can be sure that foo wi .. note:: While named bindings looks like constants they do not provide immutability. Named bindings can reference mutable objects (like array or instance or table) - - + + -------------------- Function declaration -------------------- diff --git a/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/tables.rst b/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/tables.rst index c7a007290..49886b837 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/tables.rst +++ b/prog/1stPartyLibs/quirrel/quirrel/doc/source/reference/language/tables.rst @@ -69,3 +69,4 @@ the value of the deleted slot.:: delete a.test1 print(delete a.deleteme); //this will print the string "now" +Note: Usage of this method could be prohibited with ``#forbid-delete-operator`` diff --git a/prog/1stPartyLibs/quirrel/quirrel/doc/source/rfcs/STATUS.md b/prog/1stPartyLibs/quirrel/quirrel/doc/source/rfcs/STATUS.md index 8529561f3..a68caaaa8 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/doc/source/rfcs/STATUS.md +++ b/prog/1stPartyLibs/quirrel/quirrel/doc/source/rfcs/STATUS.md @@ -8,35 +8,30 @@ Use Python style for extends let Bar = class {} let Baz = class(Bar) //(instead of Baz = class extends Bar -**Status**: Needs implementation and detailed rfc +**Status**: Implemented -## Introduce way to call table methods +## Introduce way to call builtin table methods to avoid ambiguity with table fields {values = @() print("values")}.values() +Introduce operator '.$' (as well as ?.$ ) + ``` + print({keys = 1}.$keys()?[0]) //will print 'keys', not 'null' -Suggestions: - - - with 'prefix'-style, operator .$ ( print({keys = 1}.$keys()) //'keys', not 1) - - '::' (if and whe nwe remove :: operator) table::values() - - '->' - - '|>' (betterr keep for pipe) - - some other way, for example with like 
getMethod({}.values)({}) - -**Status**: Needs implementation and detailed rfc +**Status**: Implemented. ## Deprecate delete operator Use rawdelete instead -**Status**: Needs implementation and detailed rfc +**Status**: Implemented, as optional behavior specified with pragma #forbid-delete-operator #allow-delete-operator. ## Deprecate clone operator and replace it with .clone method -**Status**: Needs implementation and detailed rfc +**Status**: Implemented, as optional behavior currently. Can be specified by #forbid-clone-operator or #allow-clone-operator -## Add is_freezed method to array and table +## Add is_frozen method to array and table -**Status**: Needs implementation and detailed rfc +**Status**: Implemented ## Add for and foreach in ranges @@ -49,7 +44,7 @@ Syntax like: Will allow to make code faster and safer than with for(local i=0; i disable diagnostic by numeric id") @@ -387,6 +387,10 @@ int getargs(HSQUIRRELVM v,int argc, char* argv[],SQInteger *retval) module_mgr->compilationOptions.doStaticAnalysis = static_analysis; + if (static_analysis) { + sq_setcompilationoption(v, CompilationOptions::CO_CLOSURE_HOISTING_OPT, false); + } + if (diagFile) { errorStream = diagFile; @@ -399,10 +403,10 @@ int getargs(HSQUIRRELVM v,int argc, char* argv[],SQInteger *retval) filename=argv[arg]; if (static_analysis) { - sq_resetanalyserconfig(); + sq_resetanalyzerconfig(); char buffer[1024]; if (search_sqconfig(filename, buffer, sizeof buffer)) { - if (!sq_loadanalyserconfig(buffer)) { + if (!sq_loadanalyzerconfig(buffer)) { fprintf(errorStream, "Cannot load .sqconfig file %s\n", buffer); return _ERROR; } diff --git a/prog/1stPartyLibs/quirrel/quirrel/sqmodules/sqmodules.cpp b/prog/1stPartyLibs/quirrel/quirrel/sqmodules/sqmodules.cpp index e25083b87..e4a0ecb5d 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/sqmodules/sqmodules.cpp +++ b/prog/1stPartyLibs/quirrel/quirrel/sqmodules/sqmodules.cpp @@ -201,7 +201,7 @@ bool SqModules::compileScriptImpl(const std::vector &buf, 
const char *sour if (compilationOptions.doStaticAnalysis) { - sq_analyseast(sqvm, ast, bindings, &buf[0], buf.size()); + sq_analyzeast(sqvm, ast, bindings, &buf[0], buf.size()); } if (onBytecode_cb) { diff --git a/prog/1stPartyLibs/quirrel/quirrel/squirrel/CMakeLists.txt b/prog/1stPartyLibs/quirrel/quirrel/squirrel/CMakeLists.txt index 1158f90dc..03d648535 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/squirrel/CMakeLists.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/squirrel/CMakeLists.txt @@ -19,7 +19,7 @@ set(SQUIRREL_SRC sqapi.cpp sqtable.cpp sqvm.cpp optimizations/closureHoisting.cpp - static_analyser/analyser.cpp) + static_analyzer/analyzer.cpp) if (ENABLE_VAR_TRACE) list(APPEND SQUIRREL_SRC vartrace.cpp) diff --git a/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqapi.cpp b/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqapi.cpp index 015f64fe4..974bd010f 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqapi.cpp +++ b/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqapi.cpp @@ -15,7 +15,7 @@ #include "arena.h" #include "sqast.h" #include "sqastrender.h" -#include "static_analyser/analyser.h" +#include "static_analyzer/analyzer.h" SQUIRREL_API SQBool sq_tracevar(HSQUIRRELVM v, const HSQOBJECT *container, const HSQOBJECT * key, SQChar * buf, int buf_size) { @@ -1760,14 +1760,14 @@ SQRESULT sq_translateasttobytecode(HSQUIRRELVM v, SQCompilation::SqASTData *astD return SQ_ERROR; } -void sq_analyseast(HSQUIRRELVM v, SQCompilation::SqASTData *astData, const HSQOBJECT *bindings, const SQChar *s, SQInteger size) +void sq_analyzeast(HSQUIRRELVM v, SQCompilation::SqASTData *astData, const HSQOBJECT *bindings, const SQChar *s, SQInteger size) { - AnalyseCode(v, astData, bindings, s, size); + AnalyzeCode(v, astData, bindings, s, size); } void sq_checktrailingspaces(HSQUIRRELVM v, const SQChar *sourceName, const SQChar *s, SQInteger size) { - StaticAnalyser::checkTrailingWhitespaces(v, sourceName, s, size); + StaticAnalyzer::checkTrailingWhitespaces(v, sourceName, s, size); 
} void sq_releaseASTData(HSQUIRRELVM v, SQCompilation::SqASTData *astData) @@ -1864,15 +1864,15 @@ SQRESULT sq_limitthreadaccess(HSQUIRRELVM vm, int64_t tid) return SQ_OK; } -void sq_resetanalyserconfig() { +void sq_resetanalyzerconfig() { SQCompilationContext::resetConfig(); } -bool sq_loadanalyserconfig(const char *configFileName) { +bool sq_loadanalyzerconfig(const char *configFileName) { return SQCompilationContext::loadConfigFile(configFileName); } -bool sq_loadanalyserconfigblk(const KeyValueFile &config) { +bool sq_loadanalyzerconfigblk(const KeyValueFile &config) { return SQCompilationContext::loadConfigFile(config); } @@ -1901,9 +1901,9 @@ void sq_enablesyntaxwarnings() { } void sq_checkglobalnames(HSQUIRRELVM v) { - StaticAnalyser::reportGlobalNameDiagnostics(v); + StaticAnalyzer::reportGlobalNameDiagnostics(v); } void sq_mergeglobalnames(const HSQOBJECT *bindings) { - StaticAnalyser::mergeKnownBindings(bindings); + StaticAnalyzer::mergeKnownBindings(bindings); } diff --git a/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqastcodegen.cpp b/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqastcodegen.cpp index f4b032a53..d79e11e09 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqastcodegen.cpp +++ b/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqastcodegen.cpp @@ -683,7 +683,7 @@ void CodegenVisitor::visitClassDecl(ClassDecl *klass) { } } -static const SQChar *varDescriptor(VarDecl *var) { +static const SQChar *varDeclKindDescription(VarDecl *var) { Expr *init = var->initializer(); if (init == NULL) { return var->isAssignable() ? 
_SC("Local variable") : _SC("Named binding"); @@ -716,7 +716,7 @@ void CodegenVisitor::visitVarDecl(VarDecl *var) { SQObject varName = _fs->CreateString(name); - CheckDuplicateLocalIdentifier(var, varName, varDescriptor(var), false); + CheckDuplicateLocalIdentifier(var, varName, varDeclKindDescription(var), false); if (var->initializer()) { visitForceGet(var->initializer()); @@ -1097,7 +1097,7 @@ void CodegenVisitor::visitUnExpr(UnExpr *unary) { } } -void CodegenVisitor::emitSimpleBin(SQOpcode op, Expr *lhs, Expr *rhs, SQInteger op3) { +void CodegenVisitor::emitSimpleBinaryOp(SQOpcode op, Expr *lhs, Expr *rhs, SQInteger op3) { visitForceGet(lhs); visitForceGet(rhs); SQInteger op1 = _fs->PopTarget(); @@ -1105,7 +1105,7 @@ void CodegenVisitor::emitSimpleBin(SQOpcode op, Expr *lhs, Expr *rhs, SQInteger _fs->AddInstruction(op, _fs->PushTarget(), op1, op2, op3); } -void CodegenVisitor::emitJpmArith(SQOpcode op, Expr *lhs, Expr *rhs) { +void CodegenVisitor::emitShortCircuitLogicalOp(SQOpcode op, Expr *lhs, Expr *rhs) { visitForceGet(lhs); SQInteger first_exp = _fs->PopTarget(); @@ -1428,9 +1428,9 @@ void CodegenVisitor::visitBinExpr(BinExpr *expr) { maybeAddInExprLine(expr); switch (expr->op()) { case TO_NEWSLOT: emitNewSlot(expr->lhs(), expr->rhs()); break; - case TO_NULLC: emitJpmArith(_OP_NULLCOALESCE, expr->lhs(), expr->rhs()); break; - case TO_OROR: emitJpmArith(_OP_OR, expr->lhs(), expr->rhs()); break; - case TO_ANDAND: emitJpmArith(_OP_AND, expr->lhs(), expr->rhs()); break; + case TO_NULLC: emitShortCircuitLogicalOp(_OP_NULLCOALESCE, expr->lhs(), expr->rhs()); break; + case TO_OROR: emitShortCircuitLogicalOp(_OP_OR, expr->lhs(), expr->rhs()); break; + case TO_ANDAND: emitShortCircuitLogicalOp(_OP_AND, expr->lhs(), expr->rhs()); break; case TO_INEXPR_ASSIGN: emitAssign(expr->lhs(), expr->rhs(), true); break; case TO_ASSIGN: emitAssign(expr->lhs(), expr->rhs(), false); break; case TO_PLUSEQ: emitCompoundArith(_OP_ADD, '+', expr->lhs(), expr->rhs()); break; @@ 
-1438,26 +1438,26 @@ void CodegenVisitor::visitBinExpr(BinExpr *expr) { case TO_MULEQ: emitCompoundArith(_OP_MUL, '*', expr->lhs(), expr->rhs()); break; case TO_DIVEQ: emitCompoundArith(_OP_DIV, '/', expr->lhs(), expr->rhs()); break; case TO_MODEQ: emitCompoundArith(_OP_MOD, '%', expr->lhs(), expr->rhs()); break; - case TO_ADD: emitSimpleBin(_OP_ADD, expr->lhs(), expr->rhs()); break; - case TO_SUB: emitSimpleBin(_OP_SUB, expr->lhs(), expr->rhs()); break; - case TO_MUL: emitSimpleBin(_OP_MUL, expr->lhs(), expr->rhs()); break; - case TO_DIV: emitSimpleBin(_OP_DIV, expr->lhs(), expr->rhs()); break; - case TO_MOD: emitSimpleBin(_OP_MOD, expr->lhs(), expr->rhs()); break; - case TO_OR: emitSimpleBin(_OP_BITW, expr->lhs(), expr->rhs(), BW_OR); break; - case TO_AND: emitSimpleBin(_OP_BITW, expr->lhs(), expr->rhs(), BW_AND); break; - case TO_XOR: emitSimpleBin(_OP_BITW, expr->lhs(), expr->rhs(), BW_XOR); break; - case TO_USHR:emitSimpleBin(_OP_BITW, expr->lhs(), expr->rhs(), BW_USHIFTR); break; - case TO_SHR: emitSimpleBin(_OP_BITW, expr->lhs(), expr->rhs(), BW_SHIFTR); break; - case TO_SHL: emitSimpleBin(_OP_BITW, expr->lhs(), expr->rhs(), BW_SHIFTL); break; - case TO_EQ: emitSimpleBin(_OP_EQ, expr->lhs(), expr->rhs()); break; - case TO_NE: emitSimpleBin(_OP_NE, expr->lhs(), expr->rhs()); break; - case TO_GE: emitSimpleBin(_OP_CMP, expr->lhs(), expr->rhs(), CMP_GE); break; - case TO_GT: emitSimpleBin(_OP_CMP, expr->lhs(), expr->rhs(), CMP_G); break; - case TO_LE: emitSimpleBin(_OP_CMP, expr->lhs(), expr->rhs(), CMP_LE); break; - case TO_LT: emitSimpleBin(_OP_CMP, expr->lhs(), expr->rhs(), CMP_L); break; - case TO_3CMP: emitSimpleBin(_OP_CMP, expr->lhs(), expr->rhs(), CMP_3W); break; - case TO_IN: emitSimpleBin(_OP_EXISTS, expr->lhs(), expr->rhs()); break; - case TO_INSTANCEOF: emitSimpleBin(_OP_INSTANCEOF, expr->lhs(), expr->rhs()); break; + case TO_ADD: emitSimpleBinaryOp(_OP_ADD, expr->lhs(), expr->rhs()); break; + case TO_SUB: emitSimpleBinaryOp(_OP_SUB, expr->lhs(), 
expr->rhs()); break; + case TO_MUL: emitSimpleBinaryOp(_OP_MUL, expr->lhs(), expr->rhs()); break; + case TO_DIV: emitSimpleBinaryOp(_OP_DIV, expr->lhs(), expr->rhs()); break; + case TO_MOD: emitSimpleBinaryOp(_OP_MOD, expr->lhs(), expr->rhs()); break; + case TO_OR: emitSimpleBinaryOp(_OP_BITW, expr->lhs(), expr->rhs(), BW_OR); break; + case TO_AND: emitSimpleBinaryOp(_OP_BITW, expr->lhs(), expr->rhs(), BW_AND); break; + case TO_XOR: emitSimpleBinaryOp(_OP_BITW, expr->lhs(), expr->rhs(), BW_XOR); break; + case TO_USHR:emitSimpleBinaryOp(_OP_BITW, expr->lhs(), expr->rhs(), BW_USHIFTR); break; + case TO_SHR: emitSimpleBinaryOp(_OP_BITW, expr->lhs(), expr->rhs(), BW_SHIFTR); break; + case TO_SHL: emitSimpleBinaryOp(_OP_BITW, expr->lhs(), expr->rhs(), BW_SHIFTL); break; + case TO_EQ: emitSimpleBinaryOp(_OP_EQ, expr->lhs(), expr->rhs()); break; + case TO_NE: emitSimpleBinaryOp(_OP_NE, expr->lhs(), expr->rhs()); break; + case TO_GE: emitSimpleBinaryOp(_OP_CMP, expr->lhs(), expr->rhs(), CMP_GE); break; + case TO_GT: emitSimpleBinaryOp(_OP_CMP, expr->lhs(), expr->rhs(), CMP_G); break; + case TO_LE: emitSimpleBinaryOp(_OP_CMP, expr->lhs(), expr->rhs(), CMP_LE); break; + case TO_LT: emitSimpleBinaryOp(_OP_CMP, expr->lhs(), expr->rhs(), CMP_L); break; + case TO_3CMP: emitSimpleBinaryOp(_OP_CMP, expr->lhs(), expr->rhs(), CMP_3W); break; + case TO_IN: emitSimpleBinaryOp(_OP_EXISTS, expr->lhs(), expr->rhs()); break; + case TO_INSTANCEOF: emitSimpleBinaryOp(_OP_INSTANCEOF, expr->lhs(), expr->rhs()); break; default: break; } diff --git a/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqastcodegen.h b/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqastcodegen.h index 00549d3f8..fb7ac498d 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqastcodegen.h +++ b/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqastcodegen.h @@ -7,6 +7,7 @@ #include struct SQFuncState; + namespace SQCompilation { class CodegenVisitor : public Visitor { @@ -46,9 +47,7 @@ class CodegenVisitor : public Visitor { 
bool CheckMemberUniqueness(ArenaVector &vec, Expr *obj); void Emit2ArgsOP(SQOpcode op, SQInteger p3 = 0); - void EmitLoadConstInt(SQInteger value, SQInteger target); - void EmitLoadConstFloat(SQFloat value, SQInteger target); void ResolveBreaks(SQFuncState *funcstate, SQInteger ntoresolve); @@ -63,33 +62,24 @@ class CodegenVisitor : public Visitor { SQTable* GetScopedConstsTable(); void emitUnaryOp(SQOpcode op, UnExpr *arg); - void emitDelete(UnExpr *argument); - - void emitSimpleBin(SQOpcode op, Expr *lhs, Expr *rhs, SQInteger op3 = 0); - - void emitJpmArith(SQOpcode op, Expr *lhs, Expr *rhs); - + void emitSimpleBinaryOp(SQOpcode op, Expr *lhs, Expr *rhs, SQInteger op3 = 0); + void emitShortCircuitLogicalOp(SQOpcode op, Expr *lhs, Expr *rhs); void emitCompoundArith(SQOpcode op, SQInteger opcode, Expr *lvalue, Expr *rvalue); bool isLValue(Expr *expr); void emitNewSlot(Expr *lvalue, Expr *rvalue); - void emitAssign(Expr *lvalue, Expr * rvalue, bool inExpr); - void emitFieldAssign(bool isLiteral); bool CanBeDefaultDelegate(const SQChar *key); - bool canBeLiteral(AccessExpr *expr); void MoveIfCurrentTargetIsLocal(); bool IsConstant(const SQObject &name, SQObject &e); - bool IsLocalConstant(const SQObject &name, SQObject &e); - bool IsGlobalConstant(const SQObject &name, SQObject &e); SQObject selectLiteral(LiteralExpr *lit); @@ -108,79 +98,42 @@ class CodegenVisitor : public Visitor { public: void visitBlock(Block *block) override; - void visitIfStatement(IfStatement *ifStmt) override; - void visitWhileStatement(WhileStatement *whileLoop) override; - void visitDoWhileStatement(DoWhileStatement *doWhileLoop) override; - void visitForStatement(ForStatement *forLoop) override; - void visitForeachStatement(ForeachStatement *foreachLoop) override; - void visitSwitchStatement(SwitchStatement *swtch) override; - void visitTryStatement(TryStatement *tryStmt) override; - void visitBreakStatement(BreakStatement *breakStmt) override; - void 
visitContinueStatement(ContinueStatement *continueStmt) override; - void visitTerminateStatement(TerminateStatement *terminator) override; - void visitReturnStatement(ReturnStatement *retStmt) override; - void visitYieldStatement(YieldStatement *yieldStmt) override; - void visitThrowStatement(ThrowStatement *throwStmt) override; - void visitExprStatement(ExprStatement *stmt) override; - void visitTableDecl(TableDecl *tableDecl) override; - void visitClassDecl(ClassDecl *klass) override; - void visitParamDecl(ParamDecl *param) override; - void visitVarDecl(VarDecl *var) override; - void visitDeclGroup(DeclGroup *group) override; - void visitDestructuringDecl(DestructuringDecl *destruct) override; - void visitFunctionDecl(FunctionDecl *func) override; - void visitConstDecl(ConstDecl *decl) override; - void visitEnumDecl(EnumDecl *enums) override; - void visitCallExpr(CallExpr *call) override; - void visitBaseExpr(BaseExpr *base) override; - void visitRootExpr(RootExpr *expr) override; - void visitLiteralExpr(LiteralExpr *lit) override; - void visitArrayExpr(ArrayExpr *expr) override; - void visitUnExpr(UnExpr *unary) override; - void visitGetFieldExpr(GetFieldExpr *expr) override; - void visitGetTableExpr(GetTableExpr *expr) override; - void visitBinExpr(BinExpr *expr) override; - void visitTerExpr(TerExpr *expr) override; - void visitIncExpr(IncExpr *expr) override; - void visitId(Id *id) override; - void visitCommaExpr(CommaExpr *expr) override; - void visitDirectiveStatement(DirectiveStmt *dir) override; }; diff --git a/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqcompiler.cpp b/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqcompiler.cpp index b22a68684..3f0fe1505 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqcompiler.cpp +++ b/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqcompiler.cpp @@ -23,7 +23,7 @@ #include "optimizations/closureHoisting.h" #include "sqbinaryast.h" #include "sqcompilationcontext.h" -#include "static_analyser/analyser.h" +#include 
"static_analyzer/analyzer.h" namespace SQCompilation { @@ -111,13 +111,13 @@ bool TranslateASTToBytecode(SQVM *vm, SqASTData *astData, const HSQOBJECT *bindi return TranslateASTToBytecodeImpl(vm, astData->root, bindings, astData->sourceName, sourceText, sourceTextSize, out, astData->comments, raiseerror, lineinfo); } -void AnalyseCode(SQVM *vm, SqASTData *astData, const HSQOBJECT *bindings, const char *sourceText, size_t sourceTextSize) +void AnalyzeCode(SQVM *vm, SqASTData *astData, const HSQOBJECT *bindings, const char *sourceText, size_t sourceTextSize) { - Arena saArena(_ss(vm)->_alloc_ctx, "Analyser"); + Arena saArena(_ss(vm)->_alloc_ctx, "Analyzer"); SQCompilationContext ctx(vm, &saArena, astData->sourceName, sourceText, sourceTextSize, astData->comments, true); RootBlock *ast = astData->root; - StaticAnalyser sa(ctx); + StaticAnalyzer sa(ctx); sa.runAnalysis(ast, bindings); } diff --git a/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqcompiler.h b/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqcompiler.h index c43eaba0c..9a9075fd7 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqcompiler.h +++ b/prog/1stPartyLibs/quirrel/quirrel/squirrel/sqcompiler.h @@ -180,7 +180,7 @@ SqASTData *ParseToAST(SQVM *vm, const char *sourceText, size_t sourceTextSize, c bool ParseAndSaveBinaryAST(SQVM *vm, const char *sourceText, size_t sourceTextSize, const SQChar *sourcename, OutputStream *ostream, bool raiseerror); bool TranslateASTToBytecode(SQVM *vm, SqASTData *astData, const HSQOBJECT *bindings, const char *sourceText, size_t sourceTextSize, SQObjectPtr &out, bool raiseerror, bool lineinfo); bool TranslateBinaryASTToBytecode(SQVM *vm, const uint8_t *buffer, size_t size, const HSQOBJECT *bindings, SQObjectPtr &out, bool raiseerror, bool lineinfo); -void AnalyseCode(SQVM *vm, SqASTData *astData, const HSQOBJECT *bindings, const char *sourceText, size_t sourceTextSize); +void AnalyzeCode(SQVM *vm, SqASTData *astData, const HSQOBJECT *bindings, const char *sourceText, 
size_t sourceTextSize); }; // SQCompilation #endif //_SQCOMPILER_H_ diff --git a/prog/1stPartyLibs/quirrel/quirrel/squirrel/static_analyser/analyser.cpp b/prog/1stPartyLibs/quirrel/quirrel/squirrel/static_analyzer/analyzer.cpp similarity index 99% rename from prog/1stPartyLibs/quirrel/quirrel/squirrel/static_analyser/analyser.cpp rename to prog/1stPartyLibs/quirrel/quirrel/squirrel/static_analyzer/analyzer.cpp index 5bc12325b..51d414177 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/squirrel/static_analyser/analyser.cpp +++ b/prog/1stPartyLibs/quirrel/quirrel/squirrel/static_analyzer/analyzer.cpp @@ -1,4 +1,4 @@ -#include "analyser.h" +#include "analyzer.h" #include #include #include @@ -131,7 +131,7 @@ struct StringEqualer { } }; -StaticAnalyser::StaticAnalyser(SQCompilationContext &ctx) +StaticAnalyzer::StaticAnalyzer(SQCompilationContext &ctx) : _ctx(ctx) { } @@ -2745,7 +2745,7 @@ VarScope *VarScope::copy(Arena *a, bool forClosure) const { ValueRef *vcopy = new(mem) ValueRef(v->info, v->evalIndex); if (!v->isConstant() && forClosure) { - // if we analyse closure we cannot rely on existed assignable values + // if we analyze closure we cannot rely on existed assignable values vcopy->state = VRS_UNKNOWN; vcopy->expression = nullptr; vcopy->flagsNegative = vcopy->flagsPositive = 0; @@ -3094,7 +3094,7 @@ class CheckerVisitor : public Visitor { void visitEnumDecl(EnumDecl *enm); void visitDestructuringDecl(DestructuringDecl *decl); - void analyse(RootBlock *root); + void analyze(RootBlock *root); }; void VarScope::checkUnusedSymbols(CheckerVisitor *checker) { @@ -7703,7 +7703,7 @@ void CheckerVisitor::visitDestructuringDecl(DestructuringDecl *d) { Visitor::visitDestructuringDecl(d); } -void CheckerVisitor::analyse(RootBlock *root) { +void CheckerVisitor::analyze(RootBlock *root) { root->visit(this); } @@ -7806,7 +7806,7 @@ class NameShadowingChecker : public Visitor { void visitForStatement(ForStatement *stmt); void visitForeachStatement(ForeachStatement *stmt); - 
void analyse(RootBlock *root) { + void analyze(RootBlock *root) { root->visit(this); } }; @@ -8097,7 +8097,7 @@ static bool checkInBindings(const SQChar *id) { return knownBindings.find(id) != knownBindings.end(); } -void StaticAnalyser::reportGlobalNameDiagnostics(HSQUIRRELVM vm) { +void StaticAnalyzer::reportGlobalNameDiagnostics(HSQUIRRELVM vm) { auto errorFunc = _ss(vm)->_compilererrorhandler; if (!errorFunc) @@ -8152,7 +8152,7 @@ void StaticAnalyser::reportGlobalNameDiagnostics(HSQUIRRELVM vm) { static bool isSpaceOrTab(SQChar c) { return c == '\t' || c == ' '; } -void StaticAnalyser::checkTrailingWhitespaces(HSQUIRRELVM vm, const SQChar *sourceName, const SQChar *code, size_t codeSize) { +void StaticAnalyzer::checkTrailingWhitespaces(HSQUIRRELVM vm, const SQChar *sourceName, const SQChar *code, size_t codeSize) { Arena arena(_ss(vm)->_alloc_ctx, "tmp"); SQCompilationContext ctx(vm, &arena, sourceName, code, codeSize, nullptr, true); @@ -8173,7 +8173,7 @@ void StaticAnalyser::checkTrailingWhitespaces(HSQUIRRELVM vm, const SQChar *sour } } -void StaticAnalyser::mergeKnownBindings(const HSQOBJECT *bindings) { +void StaticAnalyzer::mergeKnownBindings(const HSQOBJECT *bindings) { if (bindings && sq_istable(*bindings)) { SQTable *table = _table(*bindings); @@ -8192,11 +8192,11 @@ void StaticAnalyser::mergeKnownBindings(const HSQOBJECT *bindings) { } } -void StaticAnalyser::runAnalysis(RootBlock *root, const HSQOBJECT *bindings) +void StaticAnalyzer::runAnalysis(RootBlock *root, const HSQOBJECT *bindings) { mergeKnownBindings(bindings); - CheckerVisitor(_ctx).analyse(root); - NameShadowingChecker(_ctx, bindings).analyse(root); + CheckerVisitor(_ctx).analyze(root); + NameShadowingChecker(_ctx, bindings).analyze(root); } } diff --git a/prog/1stPartyLibs/quirrel/quirrel/squirrel/static_analyser/analyser.h b/prog/1stPartyLibs/quirrel/quirrel/squirrel/static_analyzer/analyzer.h similarity index 87% rename from 
prog/1stPartyLibs/quirrel/quirrel/squirrel/static_analyser/analyser.h rename to prog/1stPartyLibs/quirrel/quirrel/squirrel/static_analyzer/analyzer.h index f3acb0ff3..c09da3bff 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/squirrel/static_analyser/analyser.h +++ b/prog/1stPartyLibs/quirrel/quirrel/squirrel/static_analyzer/analyzer.h @@ -6,13 +6,13 @@ namespace SQCompilation { -class StaticAnalyser { +class StaticAnalyzer { SQCompilationContext &_ctx; public: - StaticAnalyser(SQCompilationContext &ctx); + StaticAnalyzer(SQCompilationContext &ctx); void runAnalysis(RootBlock *r, const HSQOBJECT *bindings); diff --git a/prog/1stPartyLibs/quirrel/quirrel/static_analyzer/quirrel_lexer.cpp b/prog/1stPartyLibs/quirrel/quirrel/static_analyzer/quirrel_lexer.cpp index f12707866..edbf124e4 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/static_analyzer/quirrel_lexer.cpp +++ b/prog/1stPartyLibs/quirrel/quirrel/static_analyzer/quirrel_lexer.cpp @@ -918,6 +918,10 @@ bool Lexer::process() } else { + // HACK: just make code "x.$y" compilable + if (tokens.size() > 0 && tokens.back().type == TK_READER_MACRO) + tokens.erase(tokens.end() - 1, tokens.end()); + tokens.emplace_back(Token{ (TokenType)TK_IDENTIFIER, 0, (unsigned short)beginColumn, beginLine, u }); addCurrentComments(); } diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/200_nullc.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/200_nullc.diag.txt similarity index 85% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/200_nullc.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/200_nullc.diag.txt index f5689f407..5d759f139 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/200_nullc.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/200_nullc.diag.txt @@ -1,12 +1,12 @@ WARNING: w200 (potentially-nulled-ops) Comparison operation with potentially nullable expression. 
-testData/static_analyser/200_nullc.nut:7:44 +testData/static_analyzer/200_nullc.nut:7:44 let _x = (((item?.isPrimaryBuy ?? false) > (res?.isPrimaryBuy ?? null) ? item : res)) ^-------------------------- WARNING: w305 (relative-bool-cmp) Relative comparison non-boolean with boolean. It is potential runtime error -testData/static_analyser/200_nullc.nut:7:12 +testData/static_analyzer/200_nullc.nut:7:12 let _x = (((item?.isPrimaryBuy ?? false) > (res?.isPrimaryBuy ?? null) ? item : res)) ^---------------------------------------------------------- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/200_nullc.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/200_nullc.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/200_nullc.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/200_nullc.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/248_instanceof.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/248_instanceof.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/248_instanceof.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/248_instanceof.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/248_instanceof.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/248_instanceof.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/248_instanceof.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/248_instanceof.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/efferct_fropm_call.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/efferct_fropm_call.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/efferct_fropm_call.diag.txt rename to 
prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/efferct_fropm_call.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/efferct_fropm_call.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/efferct_fropm_call.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/efferct_fropm_call.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/efferct_fropm_call.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/forloop_merge.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/forloop_merge.diag.txt similarity index 76% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/forloop_merge.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/forloop_merge.diag.txt index 50c65ece5..5715af820 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/forloop_merge.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/forloop_merge.diag.txt @@ -1,5 +1,5 @@ WARNING: w228 (declared-never-used) let 'findlast' was declared but never used. 
-testData/static_analyser/forloop_merge.nut:2:0 +testData/static_analyzer/forloop_merge.nut:2:0 function findlast(str, substr, startidx=0){ ^-- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/forloop_merge.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/forloop_merge.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/forloop_merge.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/forloop_merge.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/function_rt_detect.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/function_rt_detect.diag.txt similarity index 70% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/function_rt_detect.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/function_rt_detect.diag.txt index 54b8eff77..6a2670eb5 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/function_rt_detect.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/function_rt_detect.diag.txt @@ -1,5 +1,5 @@ WARNING: w226 (return-different-types) Function can return different types. 
-testData/static_analyser/function_rt_detect.nut:6:0 +testData/static_analyzer/function_rt_detect.nut:6:0 function _foo(p) { // EXPECTED ^ diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/function_rt_detect.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/function_rt_detect.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/function_rt_detect.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/function_rt_detect.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/intersected_assignment.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/intersected_assignment.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/intersected_assignment.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/intersected_assignment.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/intersected_assignment.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/intersected_assignment.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/intersected_assignment.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/intersected_assignment.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/klass_check.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/klass_check.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/klass_check.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/klass_check.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/klass_check.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/klass_check.nut similarity index 100% rename from 
prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/klass_check.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/klass_check.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/logic_ops_paren.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/logic_ops_paren.diag.txt similarity index 79% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/logic_ops_paren.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/logic_ops_paren.diag.txt index ac2d4e578..49d6c481c 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/logic_ops_paren.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/logic_ops_paren.diag.txt @@ -1,5 +1,5 @@ WARNING: w202 (and-or-paren) Priority of the '&&' operator is higher than that of the '||' operator. Perhaps parentheses are missing? -testData/static_analyser/logic_ops_paren.nut:5:8 +testData/static_analyzer/logic_ops_paren.nut:5:8 let a = x && y || z ^---------- @@ -7,7 +7,7 @@ let b = x || y && z WARNING: w202 (and-or-paren) Priority of the '&&' operator is higher than that of the '||' operator. Perhaps parentheses are missing? -testData/static_analyser/logic_ops_paren.nut:6:8 +testData/static_analyzer/logic_ops_paren.nut:6:8 let a = x && y || z let b = x || y && z @@ -16,7 +16,7 @@ let c = (x && y) || z WARNING: w203 (bitwise-bool-paren) Result of bitwise operation used in boolean expression. Perhaps parentheses are missing? -testData/static_analyser/logic_ops_paren.nut:10:8 +testData/static_analyzer/logic_ops_paren.nut:10:8 let e = x || y | z ^--------- @@ -24,7 +24,7 @@ let f = x || y & z WARNING: w203 (bitwise-bool-paren) Result of bitwise operation used in boolean expression. Perhaps parentheses are missing? 
-testData/static_analyser/logic_ops_paren.nut:11:8 +testData/static_analyzer/logic_ops_paren.nut:11:8 let e = x || y | z let f = x || y & z @@ -33,7 +33,7 @@ let g = x && y | z WARNING: w203 (bitwise-bool-paren) Result of bitwise operation used in boolean expression. Perhaps parentheses are missing? -testData/static_analyser/logic_ops_paren.nut:12:8 +testData/static_analyzer/logic_ops_paren.nut:12:8 let f = x || y & z let g = x && y | z @@ -42,7 +42,7 @@ let h = x && y & z WARNING: w203 (bitwise-bool-paren) Result of bitwise operation used in boolean expression. Perhaps parentheses are missing? -testData/static_analyser/logic_ops_paren.nut:13:8 +testData/static_analyzer/logic_ops_paren.nut:13:8 let g = x && y | z let h = x && y & z diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/logic_ops_paren.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/logic_ops_paren.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/logic_ops_paren.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/logic_ops_paren.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/loop_state.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/loop_state.diag.txt similarity index 75% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/loop_state.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/loop_state.diag.txt index b5746265b..2a99056b6 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/loop_state.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/loop_state.diag.txt @@ -2,7 +2,7 @@ AN ERROR HAS OCCURRED [cannot iterate null] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/loop_state.nut line [8] +*FUNCTION [__main__()] testData/static_analyzer/loop_state.nut line [8] LOCALS [@ITERATOR@] NULL diff --git 
a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/loop_state.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/loop_state.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/loop_state.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/loop_state.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/nullcheck_ternary.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/nullcheck_ternary.diag.txt similarity index 71% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/nullcheck_ternary.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/nullcheck_ternary.diag.txt index 23b07585e..ac57d2a1c 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/nullcheck_ternary.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/nullcheck_ternary.diag.txt @@ -1,5 +1,5 @@ WARNING: w248 (access-potentially-nulled) 'expression' can be null, but is used as a container without checking. 
-testData/static_analyser/nullcheck_ternary.nut:6:4 +testData/static_analyzer/nullcheck_ternary.nut:6:4 foo(p.z) ^ @@ -9,7 +9,7 @@ foo(p.z) AN ERROR HAS OCCURRED [the index 'y' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/nullcheck_ternary.nut line [4] +*FUNCTION [__main__()] testData/static_analyzer/nullcheck_ternary.nut line [4] LOCALS [x] TABLE={} diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/nullcheck_ternary.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/nullcheck_ternary.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/nullcheck_ternary.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/nullcheck_ternary.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/param_check.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/param_check.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/param_check.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/param_check.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/param_check.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/param_check.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/param_check.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/param_check.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w190.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w190.diag.txt similarity index 72% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w190.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w190.diag.txt index 5e57c5483..fbfb02d6c 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w190.diag.txt +++ 
b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w190.diag.txt @@ -1,5 +1,5 @@ WARNING: w190 (paren-is-function-call) '(' on a new line parsed as function call. -testData/static_analyser/w190.nut:9:4 +testData/static_analyzer/w190.nut:9:4 foo(10) (x.bar) ? 10 : 20 @@ -11,7 +11,7 @@ testData/static_analyser/w190.nut:9:4 AN ERROR HAS OCCURRED [attempt to call 'string'] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w190.nut line [8] +*FUNCTION [__main__()] testData/static_analyzer/w190.nut line [8] LOCALS [x] TABLE={bar="T"} diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w190.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w190.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w190.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w190.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w192.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w192.diag.txt similarity index 78% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w192.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w192.diag.txt index ffba4de2f..2ad203d81 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w192.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w192.diag.txt @@ -1,5 +1,5 @@ WARNING: w192 (statement-on-same-line) Next statement on the same line after 'then' statement. -testData/static_analyser/w192.nut:5:9 +testData/static_analyzer/w192.nut:5:9 if (test == 3) a = 6; a = 10 @@ -7,7 +7,7 @@ if (test == 3) WARNING: w192 (statement-on-same-line) Next statement on the same line after 'else' statement. -testData/static_analyser/w192.nut:11:9 +testData/static_analyzer/w192.nut:11:9 else a = 6; a = 10 @@ -15,7 +15,7 @@ else WARNING: w192 (statement-on-same-line) Next statement on the same line after 'while loop body' statement. 
-testData/static_analyser/w192.nut:15:11 +testData/static_analyzer/w192.nut:15:11 while (a > b) a = 0; b = 20; diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w192.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w192.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w192.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w192.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200.diag.txt similarity index 85% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200.diag.txt index 67901e700..c9713fa03 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200.diag.txt @@ -1,5 +1,5 @@ WARNING: w200 (potentially-nulled-ops) Comparison operation with potentially nullable expression. 
-testData/static_analyser/w200.nut:4:10 +testData/static_analyzer/w200.nut:4:10 let function fn(mod, wpUnitRank) { //-declared-never-used return (mod?.reqRank > wpUnitRank) diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_3wcmp.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_3wcmp.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_3wcmp.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_3wcmp.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_3wcmp.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_3wcmp.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_3wcmp.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_3wcmp.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_arith.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_arith.diag.txt similarity index 78% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_arith.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_arith.diag.txt index 4c21476e4..1cfda3587 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_arith.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_arith.diag.txt @@ -1,5 +1,5 @@ WARNING: w200 (potentially-nulled-ops) Arithmetic operation with potentially nullable expression. 
-testData/static_analyser/w200_arith.nut:3:10 +testData/static_analyzer/w200_arith.nut:3:10 local x = {y = 2} local a = x?.y - 8 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_arith.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_arith.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_arith.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_arith.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_arith_deep.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_arith_deep.diag.txt similarity index 75% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_arith_deep.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_arith_deep.diag.txt index 9704e5eef..2d9d74767 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_arith_deep.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_arith_deep.diag.txt @@ -1,5 +1,5 @@ WARNING: w200 (potentially-nulled-ops) Arithmetic operation with potentially nullable expression. 
-testData/static_analyser/w200_arith_deep.nut:4:10 +testData/static_analyzer/w200_arith_deep.nut:4:10 local z = x?.y local a = z - 8 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_arith_deep.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_arith_deep.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_arith_deep.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_arith_deep.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_arith_plus_eq.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_arith_plus_eq.diag.txt similarity index 72% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_arith_plus_eq.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_arith_plus_eq.diag.txt index 9d691f7f0..4867e53e7 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_arith_plus_eq.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_arith_plus_eq.diag.txt @@ -1,5 +1,5 @@ WARNING: w200 (potentially-nulled-ops) Arithmetic operation with potentially nullable expression. 
-testData/static_analyser/w200_arith_plus_eq.nut:4:0 +testData/static_analyzer/w200_arith_plus_eq.nut:4:0 local a = x?.z a -= 10 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_arith_plus_eq.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_arith_plus_eq.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_arith_plus_eq.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_arith_plus_eq.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_special_func_name.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_special_func_name.diag.txt similarity index 70% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_special_func_name.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_special_func_name.diag.txt index 1547b1394..608fb1d6b 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_special_func_name.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_special_func_name.diag.txt @@ -1,5 +1,5 @@ WARNING: w200 (potentially-nulled-ops) Arithmetic operation with potentially nullable expression. 
-testData/static_analyser/w200_special_func_name.nut:3:7 +testData/static_analyzer/w200_special_func_name.nut:3:7 return ::a.b.c.indexof("x") + 6; ^------------------- @@ -9,7 +9,7 @@ return ::a.b.c.indexof("x") + 6; AN ERROR HAS OCCURRED [the index 'a' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w200_special_func_name.nut line [3] +*FUNCTION [__main__()] testData/static_analyzer/w200_special_func_name.nut line [3] LOCALS [vargv] ARRAY=[] diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_special_func_name.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_special_func_name.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_special_func_name.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_special_func_name.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_stringconcat.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_stringconcat.diag.txt similarity index 68% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_stringconcat.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_stringconcat.diag.txt index 31eb4f1aa..c2b92c87a 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_stringconcat.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_stringconcat.diag.txt @@ -1,26 +1,26 @@ WARNING: w264 (plus-string) Usage of '+' for string concatenation. -testData/static_analyser/w200_stringconcat.nut:9:9 +testData/static_analyzer/w200_stringconcat.nut:9:9 let ss = "a" + o + s ^---------- WARNING: w286 (decl-in-expression) Declaration used in arith expression as operand. -testData/static_analyser/w200_stringconcat.nut:9:15 +testData/static_analyzer/w200_stringconcat.nut:9:15 let ss = "a" + o + s ^ WARNING: w264 (plus-string) Usage of '+' for string concatenation. 
-testData/static_analyser/w200_stringconcat.nut:9:9 +testData/static_analyzer/w200_stringconcat.nut:9:9 let ss = "a" + o + s ^------ WARNING: w228 (declared-never-used) let 'ss' was declared but never used. -testData/static_analyser/w200_stringconcat.nut:9:0 +testData/static_analyzer/w200_stringconcat.nut:9:0 let ss = "a" + o + s ^ diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_stringconcat.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_stringconcat.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w200_stringconcat.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w200_stringconcat.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w203.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w203.diag.txt similarity index 77% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w203.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w203.diag.txt index 7ef71580b..f63d852fa 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w203.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w203.diag.txt @@ -1,5 +1,5 @@ WARNING: w203 (bitwise-bool-paren) Result of bitwise operation used in boolean expression. Perhaps parentheses are missing? 
-testData/static_analyser/w203.nut:7:18 +testData/static_analyzer/w203.nut:7:18 if (condition1 || condition2 || condition3 | condition4) ^------------------------------------ @@ -10,7 +10,7 @@ if (condition1 || condition2 || condition3 | condition4) AN ERROR HAS OCCURRED [the index 'x' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w203.nut line [2] +*FUNCTION [__main__()] testData/static_analyzer/w203.nut line [2] LOCALS [vargv] ARRAY=[] diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w203.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w203.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w203.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w203.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w204.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w204.diag.txt similarity index 88% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w204.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w204.diag.txt index 23a48a85c..1e89a8fdc 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w204.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w204.diag.txt @@ -1,5 +1,5 @@ WARNING: w204 (bitwise-apply-to-bool) The '&' or '|' operator is applied to boolean type. You've probably forgotten to include parentheses or intended to use the '&&' or '||' operator. 
-testData/static_analyser/w204.nut:4:6 +testData/static_analyzer/w204.nut:4:6 let function foo(x){ //-declared-never-used if (x & 15 == 8) diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w204.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w204.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w204.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w204.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w205-2.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w205-2.diag.txt similarity index 76% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w205-2.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w205-2.diag.txt index f6be9093e..6e997f064 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w205-2.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w205-2.diag.txt @@ -1,5 +1,5 @@ WARNING: w205 (unreachable-code) Unreachable code after 'return'. 
-testData/static_analyser/w205-2.nut:5:2 +testData/static_analyzer/w205-2.nut:5:2 return let t = { //-declared-never-used diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w205-2.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w205-2.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w205-2.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w205-2.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w205.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w205.diag.txt similarity index 74% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w205.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w205.diag.txt index f2ae78bd0..af74b32ce 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w205.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w205.diag.txt @@ -1,5 +1,5 @@ WARNING: w205 (unreachable-code) Unreachable code after 'return'. 
-testData/static_analyser/w205.nut:5:4 +testData/static_analyzer/w205.nut:5:4 return callback() diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w205.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w205.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w205.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w205.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w206.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w206.diag.txt similarity index 73% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w206.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w206.diag.txt index bde8a4b3a..35dbed922 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w206.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w206.diag.txt @@ -1,5 +1,5 @@ WARNING: w206 (assigned-twice) Variable is assigned twice successively. 
-testData/static_analyser/w206.nut:5:0 +testData/static_analyzer/w206.nut:5:0 tab.y = 7 tab.x = 8 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w206.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w206.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w206.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w206.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w206_arith.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w206_arith.diag.txt similarity index 72% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w206_arith.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w206_arith.diag.txt index 44621872f..8f3fd6cb3 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w206_arith.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w206_arith.diag.txt @@ -1,5 +1,5 @@ WARNING: w206 (assigned-twice) Variable is assigned twice successively. 
-testData/static_analyser/w206_arith.nut:11:0 +testData/static_analyzer/w206_arith.nut:11:0 x /= foo(); x = foo(); diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w206_arith.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w206_arith.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w206_arith.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w206_arith.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w208.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w208.diag.txt similarity index 79% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w208.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w208.diag.txt index 856fb2bcd..ecec525ec 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w208.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w208.diag.txt @@ -1,5 +1,5 @@ WARNING: w208 (potentially-nulled-assign) Assignment to potentially nullable expression. 
-testData/static_analyser/w208.nut:3:0 +testData/static_analyzer/w208.nut:3:0 local x = { z = {y = 3}} x.z?.y <- 6 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w208.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w208.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w208.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w208.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w208_rec.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w208_rec.diag.txt similarity index 76% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w208_rec.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w208_rec.diag.txt index 40f55ed92..340609e8e 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w208_rec.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w208_rec.diag.txt @@ -1,5 +1,5 @@ WARNING: w228 (declared-never-used) parameter 'x' was declared but never used. 
-testData/static_analyser/w208_rec.nut:4:14 +testData/static_analyzer/w208_rec.nut:4:14 function bart(x) { return "" } ^ @@ -9,7 +9,7 @@ function bart(x) { return "" } AN ERROR HAS OCCURRED [the index 'xx' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w208_rec.nut line [12] +*FUNCTION [__main__()] testData/static_analyzer/w208_rec.nut line [12] LOCALS [editorIsActive] 42 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w208_rec.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w208_rec.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w208_rec.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w208_rec.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w209.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w209.diag.txt similarity index 73% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w209.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w209.diag.txt index 0b55b5d08..2b742c88e 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w209.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w209.diag.txt @@ -1,5 +1,5 @@ WARNING: w209 (assigned-to-itself) The variable is assigned to itself. 
-testData/static_analyser/w209.nut:5:0 +testData/static_analyzer/w209.nut:5:0 x.y = (((x.y))) ^-------------- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w209.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w209.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w209.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w209.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210.diag.txt similarity index 77% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210.diag.txt index e80a0a602..889524fcc 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210.diag.txt @@ -1,5 +1,5 @@ WARNING: w210 (potentially-nulled-index) Potentially nullable expression used as array index. 
-testData/static_analyser/w210.nut:5:9 +testData/static_analyzer/w210.nut:5:9 ::f <- x[y?["a"]] ^------- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_complex.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_complex.diag.txt similarity index 74% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_complex.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_complex.diag.txt index dd4435d15..223f248d6 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_complex.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_complex.diag.txt @@ -1,5 +1,5 @@ WARNING: w210 (potentially-nulled-index) Potentially nullable expression used as array index. -testData/static_analyser/w210_complex.nut:10:6 +testData/static_analyzer/w210_complex.nut:10:6 foo(x[y]) ^ @@ -7,7 +7,7 @@ foo(x?[y]) WARNING: w210 (potentially-nulled-index) Potentially nullable expression used as array index. 
-testData/static_analyser/w210_complex.nut:14:12 +testData/static_analyzer/w210_complex.nut:14:12 foo(x.y?[10].y?[y]) foo(x.y.z.u[y]) @@ -19,7 +19,7 @@ foo(x.y?.z.u[y]) AN ERROR HAS OCCURRED [the index 'null' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w210_complex.nut line [10] +*FUNCTION [__main__()] testData/static_analyzer/w210_complex.nut line [10] LOCALS [x] TABLE={} diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_complex.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_complex.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_complex.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_complex.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_deep.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_deep.diag.txt similarity index 74% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_deep.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_deep.diag.txt index 2a97a710f..6945c1862 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_deep.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_deep.diag.txt @@ -1,5 +1,5 @@ WARNING: w210 (potentially-nulled-index) Potentially nullable expression used as array index. 
-testData/static_analyser/w210_deep.nut:6:9 +testData/static_analyzer/w210_deep.nut:6:9 ::f <- x[index] ^---- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_deep.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_deep.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_deep.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_deep.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_def.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_def.diag.txt similarity index 82% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_def.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_def.diag.txt index a10fa9e4e..52b27293a 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_def.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_def.diag.txt @@ -1,5 +1,5 @@ WARNING: w210 (potentially-nulled-index) Potentially nullable expression used as array index. -testData/static_analyser/w210_def.nut:4:31 +testData/static_analyzer/w210_def.nut:4:31 local buildBtnParams = ::kwarg(function(icon=null, option=null, count_list=null, counterFunc=null){ //-declared-never-used local list = ::contactsLists[count_list ?? 
option].list @@ -11,7 +11,7 @@ local buildBtnParams = ::kwarg(function(icon=null, option=null, count_list=null, AN ERROR HAS OCCURRED [the index 'kwarg' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w210_def.nut line [3] +*FUNCTION [__main__()] testData/static_analyzer/w210_def.nut line [3] LOCALS [vargv] ARRAY=[] diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_def.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_def.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_def.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_def.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_nullc.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_nullc.diag.txt similarity index 80% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_nullc.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_nullc.diag.txt index cfccfabac..b3f939894 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_nullc.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_nullc.diag.txt @@ -1,5 +1,5 @@ WARNING: w228 (declared-never-used) let 'requestCratesContent' was declared but never used. 
-testData/static_analyser/w210_nullc.nut:6:0 +testData/static_analyzer/w210_nullc.nut:6:0 let function requestCratesContent(armyId, crates) { ^ diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_nullc.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_nullc.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w210_nullc.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w210_nullc.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w211.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w211.diag.txt similarity index 81% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w211.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w211.diag.txt index ef65bc2f6..42cba0a71 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w211.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w211.diag.txt @@ -1,5 +1,5 @@ WARNING: w211 (duplicate-case) Duplicate case value. 
-testData/static_analyser/w211.nut:12:7 +testData/static_analyzer/w211.nut:12:7 case MODE.MODE_2: ::print("2"); break; case MODE.MODE_1: ::print("3"); break; diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w211.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w211.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w211.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w211.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w212.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w212.diag.txt similarity index 82% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w212.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w212.diag.txt index a75c91cfe..a665b196d 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w212.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w212.diag.txt @@ -1,5 +1,5 @@ WARNING: w212 (duplicate-if-expression) Detected pattern 'if (A) {...} else if (A) {...}'. Branch unreachable. -testData/static_analyser/w212.nut:10:9 +testData/static_analyzer/w212.nut:10:9 ::print("2") else if (x == 1) // OK @@ -8,7 +8,7 @@ else if (x == 1) // OK WARNING: w212 (duplicate-if-expression) Detected pattern 'if (A) {...} else if (A) {...}'. Branch unreachable. -testData/static_analyser/w212.nut:27:8 +testData/static_analyzer/w212.nut:27:8 ; if (x == 1) // OK @@ -17,7 +17,7 @@ testData/static_analyser/w212.nut:27:8 WARNING: w212 (duplicate-if-expression) Detected pattern 'if (A) {...} else if (A) {...}'. Branch unreachable. 
-testData/static_analyser/w212.nut:36:9 +testData/static_analyzer/w212.nut:36:9 ::print("2") else if (x == 1) // OK diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w212.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w212.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w212.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w212.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w213.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w213.diag.txt similarity index 76% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w213.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w213.diag.txt index 2c9996b4c..6ac0f3388 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w213.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w213.diag.txt @@ -1,5 +1,5 @@ WARNING: w213 (then-and-else-equals) then' statement is equivalent to 'else' statement. 
-testData/static_analyser/w213.nut:8:7 +testData/static_analyzer/w213.nut:8:7 } else { diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w213.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w213.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w213.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w213.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w214.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w214.diag.txt similarity index 75% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w214.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w214.diag.txt index 00ffce3b5..625c406d2 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w214.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w214.diag.txt @@ -1,5 +1,5 @@ WARNING: w214 (operator-returns-same-val) Both branches of operator '<> ? <> : <>' are equivalent. -testData/static_analyser/w214.nut:9:7 +testData/static_analyzer/w214.nut:9:7 ::x <- test ? 
REPLAY.SKIRMISH : REPLAY.SKIRMISH ^--------------------------------------- @@ -9,7 +9,7 @@ testData/static_analyser/w214.nut:9:7 AN ERROR HAS OCCURRED [the index 'f' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w214.nut line [2] +*FUNCTION [__main__()] testData/static_analyzer/w214.nut line [2] LOCALS [vargv] ARRAY=[] diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w214.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w214.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w214.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w214.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w215.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w215.diag.txt similarity index 78% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w215.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w215.diag.txt index bedbe8b5e..0f5b31946 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w215.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w215.diag.txt @@ -1,5 +1,5 @@ WARNING: w215 (ternary-priority) The '?:' operator has lower priority than the '+' operator. Perhaps the '?:' operator works in a different way than it was expected. -testData/static_analyser/w215.nut:4:10 +testData/static_analyzer/w215.nut:4:10 local flag = true local b = 10 + flag ? 1 : 2 @@ -11,7 +11,7 @@ local b = 10 + flag ? 
1 : 2 AN ERROR HAS OCCURRED [arith op + on between 'integer' and 'bool'] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w215.nut line [4] +*FUNCTION [__main__()] testData/static_analyzer/w215.nut line [4] LOCALS [flag] true diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w215.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w215.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w215.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w215.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w215_nullc.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w215_nullc.diag.txt similarity index 83% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w215_nullc.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w215_nullc.diag.txt index f1364e018..b0bb0a8e2 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w215_nullc.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w215_nullc.diag.txt @@ -1,5 +1,5 @@ WARNING: w215 (ternary-priority) The '?:' operator has lower priority than the '??' operator. Perhaps the '?:' operator works in a different way than it was expected. -testData/static_analyser/w215_nullc.nut:8:9 +testData/static_analyzer/w215_nullc.nut:8:9 let _u = o?.u ?? s ? 
s / r : 0.0 ^-------- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w215_nullc.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w215_nullc.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w215_nullc.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w215_nullc.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w216.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w216.diag.txt similarity index 78% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w216.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w216.diag.txt index ea7e808c2..cffec83b3 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w216.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w216.diag.txt @@ -1,5 +1,5 @@ WARNING: w216 (same-operands) Left and right operands of '<' operator are the same. 
-testData/static_analyser/w216.nut:5:4 +testData/static_analyzer/w216.nut:5:4 if (a.x < a.x) ^-------- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w216.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w216.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w216.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w216.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_break.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_break.diag.txt similarity index 71% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_break.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_break.diag.txt index b0b671011..c3d7fb5b4 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_break.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_break.diag.txt @@ -1,5 +1,5 @@ WARNING: w217 (unconditional-terminated-loop) Unconditional 'break' inside a loop. -testData/static_analyser/w217_break.nut:7:4 +testData/static_analyzer/w217_break.nut:7:4 y--; break; @@ -8,7 +8,7 @@ testData/static_analyser/w217_break.nut:7:4 WARNING: w205 (unreachable-code) Unreachable code after 'break'. 
-testData/static_analyser/w217_break.nut:8:4 +testData/static_analyzer/w217_break.nut:8:4 break; z--; diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_break.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_break.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_break.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_break.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_complex.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_complex.diag.txt similarity index 74% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_complex.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_complex.diag.txt index 6fcb975fe..f65207f62 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_complex.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_complex.diag.txt @@ -1,5 +1,5 @@ WARNING: w217 (unconditional-terminated-loop) Unconditional 'break' inside a loop. -testData/static_analyser/w217_complex.nut:26:12 +testData/static_analyzer/w217_complex.nut:26:12 foo() break @@ -8,7 +8,7 @@ testData/static_analyser/w217_complex.nut:26:12 WARNING: w217 (unconditional-terminated-loop) Unconditional 'throw' inside a loop. -testData/static_analyser/w217_complex.nut:44:8 +testData/static_analyzer/w217_complex.nut:44:8 foo() throw "y" @@ -17,7 +17,7 @@ testData/static_analyser/w217_complex.nut:44:8 WARNING: w217 (unconditional-terminated-loop) Unconditional 'return' inside a loop. 
-testData/static_analyser/w217_complex.nut:66:4 +testData/static_analyzer/w217_complex.nut:66:4 return 0 ^------- @@ -28,7 +28,7 @@ testData/static_analyser/w217_complex.nut:66:4 AN ERROR HAS OCCURRED [the index 'v' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w217_complex.nut line [9] +*FUNCTION [__main__()] testData/static_analyzer/w217_complex.nut line [9] LOCALS [offset] 1 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_complex.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_complex.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_complex.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_complex.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_cond_cont.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_cond_cont.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_cond_cont.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_cond_cont.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_cond_cont.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_cond_cont.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_cond_cont.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_cond_cont.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_continue.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_continue.diag.txt similarity index 73% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_continue.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_continue.diag.txt index 08703d590..67a2dd965 100644 --- 
a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_continue.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_continue.diag.txt @@ -1,5 +1,5 @@ WARNING: w217 (unconditional-terminated-loop) Unconditional 'continue' inside a loop. -testData/static_analyser/w217_continue.nut:7:4 +testData/static_analyzer/w217_continue.nut:7:4 ::h(::a, x) continue; @@ -8,7 +8,7 @@ testData/static_analyser/w217_continue.nut:7:4 WARNING: w205 (unreachable-code) Unreachable code after 'continue'. -testData/static_analyser/w217_continue.nut:8:4 +testData/static_analyzer/w217_continue.nut:8:4 continue; x--; diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_continue.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_continue.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_continue.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_continue.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_ret.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_ret.diag.txt similarity index 72% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_ret.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_ret.diag.txt index e4bd71256..cc0cfcd22 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_ret.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_ret.diag.txt @@ -1,5 +1,5 @@ WARNING: w217 (unconditional-terminated-loop) Unconditional 'return' inside a loop. 
-testData/static_analyser/w217_ret.nut:8:4 +testData/static_analyzer/w217_ret.nut:8:4 return ^----- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_ret.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_ret.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_ret.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_ret.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_throw.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_throw.diag.txt similarity index 73% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_throw.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_throw.diag.txt index 884a8d4ab..ef6655089 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_throw.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_throw.diag.txt @@ -1,5 +1,5 @@ WARNING: w217 (unconditional-terminated-loop) Unconditional 'throw' inside a loop. 
-testData/static_analyser/w217_throw.nut:8:4 +testData/static_analyzer/w217_throw.nut:8:4 throw "err" ^---------- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_throw.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_throw.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w217_throw.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w217_throw.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w220.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w220.diag.txt similarity index 83% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w220.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w220.diag.txt index 56e642312..779335fc3 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w220.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w220.diag.txt @@ -1,5 +1,5 @@ WARNING: w220 (potentially-nulled-container) 'foreach' on potentially nullable expression. 
-testData/static_analyser/w220.nut:4:15 +testData/static_analyzer/w220.nut:4:15 let function foo(a){ //-declared-never-used foreach(x in a?.y()) { diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w220.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w220.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w220.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w220.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w220_deep.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w220_deep.diag.txt similarity index 81% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w220_deep.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w220_deep.diag.txt index deb7e5dd4..7ab524f81 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w220_deep.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w220_deep.diag.txt @@ -1,5 +1,5 @@ WARNING: w220 (potentially-nulled-container) 'foreach' on potentially nullable expression. 
-testData/static_analyser/w220_deep.nut:5:15 +testData/static_analyzer/w220_deep.nut:5:15 local container = a?.y() foreach(x in container) { diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w220_deep.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w220_deep.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w220_deep.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w220_deep.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w221.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w221.diag.txt similarity index 74% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w221.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w221.diag.txt index d755e4764..8be2a2a27 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w221.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w221.diag.txt @@ -1,5 +1,5 @@ WARNING: w221 (result-not-utilized) Result of operation is not used. 
-testData/static_analyser/w221.nut:6:4 +testData/static_analyzer/w221.nut:6:4 z-- ::x == y diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w221.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w221.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w221.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w221.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w221_delete.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w221_delete.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w221_delete.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w221_delete.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w221_delete.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w221_delete.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w221_delete.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w221_delete.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w222.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w222.diag.txt similarity index 79% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w222.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w222.diag.txt index 26b539e85..6cc92e4bb 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w222.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w222.diag.txt @@ -1,5 +1,5 @@ WARNING: w222 (bool-as-index) Boolean used as array index. 
-testData/static_analyser/w222.nut:4:12 +testData/static_analyzer/w222.nut:4:12 let function foo(a,x,y) { //-declared-never-used ::print(a[x < y]) diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w222.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w222.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w222.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w222.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w222_deep.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w222_deep.diag.txt similarity index 72% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w222_deep.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w222_deep.diag.txt index 98e0c6175..ca366f646 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w222_deep.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w222_deep.diag.txt @@ -1,5 +1,5 @@ WARNING: w222 (bool-as-index) Boolean used as array index. 
-testData/static_analyser/w222_deep.nut:10:14 +testData/static_analyzer/w222_deep.nut:10:14 b() ::print(a[index]) diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w222_deep.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w222_deep.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w222_deep.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w222_deep.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w223.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w223.diag.txt similarity index 76% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w223.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w223.diag.txt index 803ec1b1a..dee52e2a5 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w223.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w223.diag.txt @@ -1,5 +1,5 @@ WARNING: w223 (compared-with-bool) Comparison with boolean. 
-testData/static_analyser/w223.nut:6:4 +testData/static_analyzer/w223.nut:6:4 local z = 2 if (x == y > z) diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w223.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w223.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w223.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w223.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w223_method_is.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w223_method_is.diag.txt similarity index 70% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w223_method_is.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w223_method_is.diag.txt index dc2ac7832..fdd70edd1 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w223_method_is.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w223_method_is.diag.txt @@ -2,7 +2,7 @@ AN ERROR HAS OCCURRED [the index 'isVisible' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w223_method_is.nut line [5] +*FUNCTION [__main__()] testData/static_analyzer/w223_method_is.nut line [5] LOCALS [r] "" diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w223_method_is.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w223_method_is.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w223_method_is.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w223_method_is.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w224_then.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w224_then.diag.txt similarity index 67% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w224_then.diag.txt rename to 
prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w224_then.diag.txt index 1d673f1aa..bd9ef9e7b 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w224_then.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w224_then.diag.txt @@ -1,5 +1,5 @@ WARNING: w224 (empty-body) 'then' operator has an empty body. -testData/static_analyser/w224_then.nut:4:11 +testData/static_analyzer/w224_then.nut:4:11 if (x == 5); ^ diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w224_then.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w224_then.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w224_then.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w224_then.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w224_while.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w224_while.diag.txt similarity index 75% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w224_while.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w224_while.diag.txt index 3433369b1..e6e961af7 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w224_while.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w224_while.diag.txt @@ -1,5 +1,5 @@ WARNING: w192 (statement-on-same-line) Next statement on the same line after 'while loop body' statement. -testData/static_analyser/w224_while.nut:8:4 +testData/static_analyzer/w224_while.nut:8:4 break; } while (x < 6) ; @@ -8,7 +8,7 @@ testData/static_analyser/w224_while.nut:8:4 WARNING: w224 (empty-body) 'while' operator has an empty body. 
-testData/static_analyser/w224_while.nut:8:18 +testData/static_analyzer/w224_while.nut:8:18 break; } while (x < 6) ; diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w224_while.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w224_while.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w224_while.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w224_while.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w225.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w225.diag.txt similarity index 79% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w225.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w225.diag.txt index a36854884..6d528600d 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w225.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w225.diag.txt @@ -1,5 +1,5 @@ WARNING: w225 (all-paths-return-value) Not all control paths return a value. 
-testData/static_analyser/w225.nut:3:4 +testData/static_analyzer/w225.nut:3:4 let function x(y) { //-declared-never-used ^ diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w225.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w225.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w225.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w225.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w225_empty_stmt.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w225_empty_stmt.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w225_empty_stmt.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w225_empty_stmt.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w225_empty_stmt.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w225_empty_stmt.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w225_empty_stmt.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w225_empty_stmt.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w225_switch.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w225_switch.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w225_switch.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w225_switch.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w225_switch.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w225_switch.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w225_switch.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w225_switch.nut diff --git 
a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w226.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w226.diag.txt similarity index 75% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w226.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w226.diag.txt index ffc9feaf7..82d0d6d5e 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w226.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w226.diag.txt @@ -1,5 +1,5 @@ WARNING: w226 (return-different-types) Function can return different types. -testData/static_analyser/w226.nut:7:4 +testData/static_analyzer/w226.nut:7:4 let function x(y) { ^ diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w226.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w226.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w226.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w226.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227.diag.txt similarity index 78% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227.diag.txt index 3fbd59c6f..ccde8aad2 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227.diag.txt @@ -1,5 +1,5 @@ WARNING: w227 (ident-hides-ident) variable 'a' hides parameter with the same name. 
-testData/static_analyser/w227.nut:6:4 +testData/static_analyzer/w227.nut:6:4 local x = c local a = x diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_external.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_external.diag.txt similarity index 77% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_external.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_external.diag.txt index f8cd73e0d..36b594b81 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_external.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_external.diag.txt @@ -1,5 +1,5 @@ WARNING: w227 (ident-hides-ident) parameter 'println' hides external binding with the same name. 
-testData/static_analyser/w227_external.nut:1:14 +testData/static_analyzer/w227_external.nut:1:14 function _foo(println) { ^------ diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_external.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_external.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_external.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_external.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_fn_with_same_param.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_fn_with_same_param.diag.txt similarity index 73% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_fn_with_same_param.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_fn_with_same_param.diag.txt index b4fd372f5..1c83b7d21 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_fn_with_same_param.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_fn_with_same_param.diag.txt @@ -1,5 +1,5 @@ WARNING: w227 (ident-hides-ident) parameter 'txt' hides function with the same name. 
-testData/static_analyser/w227_fn_with_same_param.nut:3:17 +testData/static_analyzer/w227_fn_with_same_param.nut:3:17 let function txt(txt) {} //-declared-never-used ^-- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_fn_with_same_param.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_fn_with_same_param.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_fn_with_same_param.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_fn_with_same_param.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_lambdas.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_lambdas.diag.txt similarity index 80% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_lambdas.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_lambdas.diag.txt index 65d634dec..270308eff 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_lambdas.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_lambdas.diag.txt @@ -1,5 +1,5 @@ WARNING: w228 (declared-never-used) let 'y' was declared but never used. -testData/static_analyser/w227_lambdas.nut:3:0 +testData/static_analyzer/w227_lambdas.nut:3:0 local x = 10 let y = x > 320 ? 
@() 20 : function() { return 30 } diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_lambdas.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_lambdas.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_lambdas.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_lambdas.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_let_init_fun.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_let_init_fun.diag.txt similarity index 70% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_let_init_fun.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_let_init_fun.diag.txt index eaa90264c..46bbd6407 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_let_init_fun.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_let_init_fun.diag.txt @@ -2,7 +2,7 @@ AN ERROR HAS OCCURRED [attempt to call 'null'] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w227_let_init_fun.nut line [5] +*FUNCTION [__main__()] testData/static_analyzer/w227_let_init_fun.nut line [5] LOCALS [setGroup] NULL diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_let_init_fun.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_let_init_fun.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_let_init_fun.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_let_init_fun.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_table.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_table.diag.txt similarity index 68% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_table.diag.txt rename to 
prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_table.diag.txt index 5c30bd190..18a77ae90 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_table.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_table.diag.txt @@ -1,5 +1,5 @@ WARNING: w228 (declared-never-used) let 'fx' was declared but never used. -testData/static_analyser/w227_table.nut:3:0 +testData/static_analyzer/w227_table.nut:3:0 let fx = 10 ^----- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_table.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_table.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_table.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_table.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_varargs.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_varargs.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_varargs.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_varargs.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_varargs.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_varargs.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w227_varargs.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w227_varargs.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228.diag.txt similarity index 81% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228.diag.txt index 661f550f5..8cc87d625 100644 --- 
a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228.diag.txt @@ -1,5 +1,5 @@ WARNING: w228 (declared-never-used) variable 'str' was declared but never used. -testData/static_analyser/w228.nut:4:2 +testData/static_analyzer/w228.nut:4:2 let function x() { //-declared-never-used local str = "string" diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_2.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_2.diag.txt similarity index 80% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_2.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_2.diag.txt index bd723d6a8..27f9d2621 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_2.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_2.diag.txt @@ -1,5 +1,5 @@ WARNING: w228 (declared-never-used) variable 'f' was declared but never used. 
-testData/static_analyser/w228_2.nut:4:2 +testData/static_analyzer/w228_2.nut:4:2 let function fn() { //-declared-never-used local f = 123 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_2.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_2.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_2.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_2.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_3.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_3.diag.txt similarity index 80% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_3.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_3.diag.txt index 69e71be4c..cd338a9e4 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_3.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_3.diag.txt @@ -1,5 +1,5 @@ WARNING: w228 (declared-never-used) variable 'f' was declared but never used. 
-testData/static_analyser/w228_3.nut:4:2 +testData/static_analyzer/w228_3.nut:4:2 let function fn() { //-declared-never-used local f = 123 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_3.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_3.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_3.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_3.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_4.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_4.diag.txt similarity index 76% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_4.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_4.diag.txt index 7625145aa..2834b7d29 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_4.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_4.diag.txt @@ -1,5 +1,5 @@ WARNING: w228 (declared-never-used) let 'f' was declared but never used. -testData/static_analyser/w228_4.nut:3:0 +testData/static_analyzer/w228_4.nut:3:0 let f = function foo() {} ^-- @@ -7,7 +7,7 @@ let c = class {} WARNING: w228 (declared-never-used) let 'c' was declared but never used. 
-testData/static_analyser/w228_4.nut:4:0 +testData/static_analyzer/w228_4.nut:4:0 let f = function foo() {} let c = class {} diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_4.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_4.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_4.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_4.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_table.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_table.diag.txt similarity index 71% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_table.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_table.diag.txt index 54bfbddc1..85c0c092d 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_table.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_table.diag.txt @@ -1,5 +1,5 @@ WARNING: w228 (declared-never-used) let 'foo' was declared but never used. 
-testData/static_analyser/w228_table.nut:3:0 +testData/static_analyzer/w228_table.nut:3:0 function foo(y) { ^ diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_table.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_table.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_table.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_table.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_trivial.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_trivial.diag.txt similarity index 82% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_trivial.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_trivial.diag.txt index 204cc29e4..5a695a931 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_trivial.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_trivial.diag.txt @@ -1,5 +1,5 @@ WARNING: w228 (declared-never-used) let 'refreshOnWindowActivate' was declared but never used. 
-testData/static_analyser/w228_trivial.nut:5:0 +testData/static_analyzer/w228_trivial.nut:5:0 function refreshOnWindowActivate(repeatAmount = 1, refreshPeriodSec = 10.0) { ^------- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_trivial.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_trivial.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w228_trivial.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w228_trivial.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w229.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w229.diag.txt similarity index 85% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w229.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w229.diag.txt index 0b8f7501d..7c124c9fa 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w229.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w229.diag.txt @@ -1,5 +1,5 @@ WARNING: w229 (copy-of-expression) Duplicate expression found inside the sequence of operations. 
-testData/static_analyser/w229.nut:4:16 +testData/static_analyzer/w229.nut:4:16 local a ={x=0, y=0, z=1} if (a.x == 0 && a.y == 0 && a.x == 0) diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w229.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w229.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w229.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w229.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w231.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w231.diag.txt similarity index 82% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w231.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w231.diag.txt index bb574ada2..b1e345e7b 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w231.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w231.diag.txt @@ -1,5 +1,5 @@ WARNING: w231 (format-arguments-count) Format string: arguments count mismatch. 
-testData/static_analyser/w231.nut:3:20 +testData/static_analyzer/w231.nut:3:20 local string = require("string") print(string.format("%d%%", 1, x)) diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w231.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w231.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w231.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w231.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_cascade.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_cascade.diag.txt similarity index 73% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_cascade.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_cascade.diag.txt index 82e982e77..60655aced 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_cascade.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_cascade.diag.txt @@ -1,5 +1,5 @@ WARNING: w228 (declared-never-used) let 'foo' was declared but never used. 
-testData/static_analyser/w232_cascade.nut:6:0 +testData/static_analyzer/w232_cascade.nut:6:0 function foo(size, a, b) { ^-- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_cascade.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_cascade.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_cascade.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_cascade.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_false.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_false.diag.txt similarity index 80% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_false.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_false.diag.txt index 093497eac..33e1cb28f 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_false.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_false.diag.txt @@ -1,5 +1,5 @@ WARNING: w232 (always-true-or-false) Expression is always 'false'. 
-testData/static_analyser/w232_false.nut:7:8 +testData/static_analyzer/w232_false.nut:7:8 } ::print(a == ::XEnum.PARAM_A && a == ::XEnum.PARAM_B) diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_false.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_false.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_false.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_false.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_lambda.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_lambda.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_lambda.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_lambda.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_lambda.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_lambda.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_lambda.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_lambda.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_nerge.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_nerge.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_nerge.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_nerge.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_nerge.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_nerge.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_nerge.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_nerge.nut diff --git 
a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_not.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_not.diag.txt similarity index 77% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_not.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_not.diag.txt index 7f2558ec8..4d4bdeee2 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_not.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_not.diag.txt @@ -1,5 +1,5 @@ WARNING: w232 (always-true-or-false) Expression is always 'false'. -testData/static_analyser/w232_not.nut:3:8 +testData/static_analyzer/w232_not.nut:3:8 local i = {x = true} ::print(i.x && !i.x) @@ -8,7 +8,7 @@ local i = {x = true} WARNING: w232 (always-true-or-false) Expression is always 'true'. -testData/static_analyser/w232_not.nut:4:8 +testData/static_analyzer/w232_not.nut:4:8 ::print(i.x && !i.x) ::print(!i.x || i.x) diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_not.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_not.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_not.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_not.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_ter.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_ter.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_ter.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_ter.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_ter.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_ter.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_ter.nut rename to 
prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_ter.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_ternary.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_ternary.diag.txt similarity index 77% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_ternary.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_ternary.diag.txt index bdba29b06..fea8e2227 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_ternary.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_ternary.diag.txt @@ -1,5 +1,5 @@ WARNING: w232 (always-true-or-false) Expression is always 'true'. -testData/static_analyser/w232_ternary.nut:2:12 +testData/static_analyzer/w232_ternary.nut:2:12 //expect:w232 local foo = function() { } ? false : true diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_ternary.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_ternary.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_ternary.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_ternary.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_true.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_true.diag.txt similarity index 80% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_true.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_true.diag.txt index 24330381a..db9704345 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_true.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_true.diag.txt @@ -1,5 +1,5 @@ WARNING: w232 (always-true-or-false) Expression is always 'true'. 
-testData/static_analyser/w232_true.nut:7:8 +testData/static_analyzer/w232_true.nut:7:8 local i = 2 ::print(s.charAt(i) != ' ' || s.charAt(i) != '\t') diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_true.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_true.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w232_true.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w232_true.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233.diag.txt similarity index 80% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233.diag.txt index 89f9fb463..f421fdbd4 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233.diag.txt @@ -1,5 +1,5 @@ WARNING: w233 (const-in-bool-expr) Constant in a boolean expression. 
-testData/static_analyser/w233.nut:4:19 +testData/static_analyzer/w233.nut:4:19 ::flags <- 0x2040 ::aspect_ratio <- (::flags && 0x40) diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_const_key.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_const_key.diag.txt similarity index 73% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_const_key.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_const_key.diag.txt index c03ad5ba4..ae84dc782 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_const_key.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_const_key.diag.txt @@ -2,7 +2,7 @@ AN ERROR HAS OCCURRED [the index 's' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w233_const_key.nut line [4] +*FUNCTION [__main__()] testData/static_analyzer/w233_const_key.nut line [4] LOCALS [o] NULL diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_const_key.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_const_key.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_const_key.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_const_key.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_deep.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_deep.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_deep.diag.txt rename to 
prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_deep.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_deep.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_deep.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_deep.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_deep.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_destruct.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_destruct.diag.txt similarity index 80% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_destruct.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_destruct.diag.txt index bd604fb67..c07d8c17b 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_destruct.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_destruct.diag.txt @@ -1,5 +1,5 @@ WARNING: w228 (declared-never-used) let 'y' was declared but never used. -testData/static_analyser/w233_destruct.nut:3:0 +testData/static_analyzer/w233_destruct.nut:3:0 local x = 10 let y = x > 320 ? 
@() 20 : function() { return 30 } diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_destruct.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_destruct.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_destruct.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_destruct.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_inc.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_inc.diag.txt similarity index 75% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_inc.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_inc.diag.txt index 9f54db8f7..36f2279c3 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_inc.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_inc.diag.txt @@ -2,7 +2,7 @@ AN ERROR HAS OCCURRED [the index '0' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w233_inc.nut line [17] +*FUNCTION [__main__()] testData/static_analyzer/w233_inc.nut line [17] LOCALS [ismasked] CLOSURE=FN:ismasked diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_inc.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_inc.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_inc.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_inc.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_inc_in_for.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_inc_in_for.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_inc_in_for.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_inc_in_for.diag.txt diff --git 
a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_inc_in_for.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_inc_in_for.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w233_inc_in_for.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w233_inc_in_for.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w234.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w234.diag.txt similarity index 78% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w234.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w234.diag.txt index 71da321a5..a70574f97 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w234.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w234.diag.txt @@ -1,5 +1,5 @@ WARNING: w234 (div-by-zero) Integer division by zero. -testData/static_analyser/w234.nut:4:10 +testData/static_analyzer/w234.nut:4:10 let function foo() { //-declared-never-used ::print(1 / (0)) diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w234.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w234.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w234.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w234.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w235.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w235.diag.txt similarity index 77% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w235.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w235.diag.txt index 3814c85a9..5c43b6586 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w235.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w235.diag.txt @@ -1,5 
+1,5 @@ WARNING: w235 (round-to-int) Result of division will be integer. -testData/static_analyser/w235.nut:3:19 +testData/static_analyzer/w235.nut:3:19 ::aspect_ratio <- (1280 / (720)) ^----------- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w235.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w235.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w235.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w235.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w236.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w236.diag.txt similarity index 88% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w236.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w236.diag.txt index 2b96ad4ec..9aee08034 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w236.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w236.diag.txt @@ -1,5 +1,5 @@ WARNING: w236 (shift-priority) Shift operator has lower priority. Perhaps parentheses are missing? 
-testData/static_analyser/w236.nut:4:30 +testData/static_analyzer/w236.nut:4:30 let function foo(berserkFx, state){ //-declared-never-used if (!berserkFx && (state & (1 << ::SCRIPT_STATE_USER_SHIFT + 4))) diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w236.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w236.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w236.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w236.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w238_idname.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w238_idname.diag.txt similarity index 67% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w238_idname.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w238_idname.diag.txt index 67c6a7ff4..2964fb1a5 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w238_idname.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w238_idname.diag.txt @@ -2,7 +2,7 @@ AN ERROR HAS OCCURRED [the index 'foo' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w238_idname.nut line [3] +*FUNCTION [__main__()] testData/static_analyzer/w238_idname.nut line [3] LOCALS [state] TABLE={} diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w238_idname.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w238_idname.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w238_idname.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w238_idname.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w238_isis.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w238_isis.diag.txt similarity index 100% rename from 
prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w238_isis.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w238_isis.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w238_isis.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w238_isis.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w238_isis.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w238_isis.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w238_sqconfig.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w238_sqconfig.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w238_sqconfig.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w238_sqconfig.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w238_sqconfig.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w238_sqconfig.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w238_sqconfig.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w238_sqconfig.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w239.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w239.diag.txt similarity index 85% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w239.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w239.diag.txt index cad4cb234..105c484ea 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w239.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w239.diag.txt @@ -1,5 +1,5 @@ WARNING: w239 (named-like-return-bool) Function name 'isLoggedIn' implies a return boolean type but not all control paths returns boolean. 
-testData/static_analyser/w239.nut:3:4 +testData/static_analyzer/w239.nut:3:4 let function isLoggedIn() { //-declared-never-used ^ diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w239.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w239.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w239.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w239.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w239_sqconfig.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w239_sqconfig.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w239_sqconfig.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w239_sqconfig.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w239_sqconfig.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w239_sqconfig.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w239_sqconfig.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w239_sqconfig.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w240.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w240.diag.txt similarity index 78% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w240.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w240.diag.txt index 65f9721e1..716f1494a 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w240.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w240.diag.txt @@ -1,5 +1,5 @@ WARNING: w240 (null-coalescing-priority) The '??' operator has a lower priority than the '!=' operator (a??b > c == a??(b > c)). Perhaps the '??' operator works in a different way than it was expected. 
-testData/static_analyser/w240.nut:6:13 +testData/static_analyzer/w240.nut:6:13 ::print(a ?? b != 1) // expected boolean ^----- @@ -9,7 +9,7 @@ testData/static_analyser/w240.nut:6:13 AN ERROR HAS OCCURRED [the index 'x' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w240.nut line [3] +*FUNCTION [__main__()] testData/static_analyzer/w240.nut line [3] LOCALS [vargv] ARRAY=[] diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w240.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w240.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w240.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w240.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w241.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w241.diag.txt similarity index 83% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w241.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w241.diag.txt index c540abd91..bbfb024ae 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w241.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w241.diag.txt @@ -1,5 +1,5 @@ WARNING: w241 (already-required) Module 'string' has been required already. 
-testData/static_analyser/w241.nut:4:13 +testData/static_analyzer/w241.nut:4:13 local str1 = require("string") local str2 = require("string") @@ -11,7 +11,7 @@ local str2 = require("string") AN ERROR HAS OCCURRED [wrong number of parameters passed to native closure 'print' (3 passed, 2 required)] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w241.nut line [5] +*FUNCTION [__main__()] testData/static_analyzer/w241.nut line [5] LOCALS [str2] TABLE={startswith=FN:startswith, lstrip=FN:lstrip, escape=FN:escape, ...} (10) diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w241.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w241.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w241.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w241.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w241_conditional.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w241_conditional.diag.txt similarity index 73% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w241_conditional.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w241_conditional.diag.txt index 356db20c0..aa1e2dd19 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w241_conditional.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w241_conditional.diag.txt @@ -2,7 +2,7 @@ AN ERROR HAS OCCURRED [Script file not found: m.nut / m.nut] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w241_conditional.nut line [5] +*FUNCTION [__main__()] testData/static_analyzer/w241_conditional.nut line [5] LOCALS [_m] NULL diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w241_conditional.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w241_conditional.nut similarity index 100% rename from 
prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w241_conditional.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w241_conditional.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w244.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w244.diag.txt similarity index 80% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w244.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w244.diag.txt index 53778c9b2..7de07b336 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w244.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w244.diag.txt @@ -1,5 +1,5 @@ WARNING: w244 (used-from-static) Access 'this.y' from static member function. -testData/static_analyser/w244.nut:16:8 +testData/static_analyzer/w244.nut:16:8 this.sss(); // FP 3 this.y = 30 // EXPECTED 1 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w244.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w244.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w244.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w244.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w247.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w247.diag.txt similarity index 72% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w247.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w247.diag.txt index f4fdca003..408cc07aa 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w247.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w247.diag.txt @@ -1,5 +1,5 @@ WARNING: w247 (func-can-return-null) Function 'indexof' can return null, but its result is used here. 
-testData/static_analyser/w247.nut:3:7 +testData/static_analyzer/w247.nut:3:7 return ::a.b.c.indexof("x") + 6; ^------------------- @@ -9,7 +9,7 @@ return ::a.b.c.indexof("x") + 6; AN ERROR HAS OCCURRED [the index 'a' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w247.nut line [3] +*FUNCTION [__main__()] testData/static_analyzer/w247.nut line [3] LOCALS [vargv] ARRAY=[] diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248.diag.txt similarity index 72% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248.diag.txt index 28dba4fca..b8c54bf3e 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248.diag.txt @@ -1,5 +1,5 @@ WARNING: w248 (access-potentially-nulled) 'a' can be null, but is used as a function without checking. -testData/static_analyser/w248.nut:8:4 +testData/static_analyzer/w248.nut:8:4 } else { a() @@ -8,7 +8,7 @@ testData/static_analyser/w248.nut:8:4 WARNING: w248 (access-potentially-nulled) 'a' can be null, but is used as a function without checking. 
-testData/static_analyser/w248.nut:11:7 +testData/static_analyzer/w248.nut:11:7 return a() ^ @@ -18,7 +18,7 @@ return a() AN ERROR HAS OCCURRED [the index 'x' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w248.nut line [3] +*FUNCTION [__main__()] testData/static_analyzer/w248.nut line [3] LOCALS [vargv] ARRAY=[] diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_andnad1.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_andnad1.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_andnad1.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_andnad1.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_andnad1.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_andnad1.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_andnad1.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_andnad1.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_andor.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_andor.diag.txt similarity index 81% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_andor.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_andor.diag.txt index 510101ddf..25d52e644 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_andor.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_andor.diag.txt 
@@ -1,5 +1,5 @@ WARNING: w248 (access-potentially-nulled) 'expression' can be null, but is used as a container without checking. -testData/static_analyser/w248_andor.nut:4:28 +testData/static_analyzer/w248_andor.nut:4:28 let a = list != null && list.len() == 0 let b = list != null || list.len() == 0 @@ -7,7 +7,7 @@ testData/static_analyser/w248_andor.nut:4:28 WARNING: w228 (declared-never-used) let 'riIsEmptyGroup' was declared but never used. -testData/static_analyser/w248_andor.nut:1:0 +testData/static_analyzer/w248_andor.nut:1:0 let function riIsEmptyGroup(x) { ^-- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_andor.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_andor.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_andor.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_andor.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_assert.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_assert.diag.txt similarity index 69% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_assert.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_assert.diag.txt index 49051bb3f..2425fc47f 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_assert.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_assert.diag.txt @@ -2,7 +2,7 @@ AN ERROR HAS OCCURRED [assertion failed] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w248_assert.nut line [11] +*FUNCTION [__main__()] testData/static_analyzer/w248_assert.nut line [11] LOCALS [c] NULL diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_assert.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_assert.nut similarity index 100% rename from 
prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_assert.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_assert.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_chain.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_chain.diag.txt similarity index 67% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_chain.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_chain.diag.txt index 62300639c..617433d72 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_chain.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_chain.diag.txt @@ -2,7 +2,7 @@ AN ERROR HAS OCCURRED [the index 'w' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w248_chain.nut line [7] +*FUNCTION [__main__()] testData/static_analyzer/w248_chain.nut line [7] LOCALS [a] 10 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_chain.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_chain.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_chain.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_chain.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_complex2.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_complex2.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_complex2.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_complex2.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_complex2.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_complex2.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_complex2.nut 
rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_complex2.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_complex_key.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_complex_key.diag.txt similarity index 74% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_complex_key.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_complex_key.diag.txt index c26a3f7ff..fa127f7ec 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_complex_key.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_complex_key.diag.txt @@ -1,12 +1,12 @@ WARNING: w248 (access-potentially-nulled) 'expression' can be null, but is used as a container without checking. -testData/static_analyser/w248_complex_key.nut:14:4 +testData/static_analyzer/w248_complex_key.nut:14:4 qux(a.z, h.z) ^ WARNING: w248 (access-potentially-nulled) 'expression' can be null, but is used as a container without checking. 
-testData/static_analyser/w248_complex_key.nut:14:9 +testData/static_analyzer/w248_complex_key.nut:14:9 qux(a.z, h.z) ^ @@ -16,7 +16,7 @@ qux(a.z, h.z) AN ERROR HAS OCCURRED [the index 'z' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w248_complex_key.nut line [14] +*FUNCTION [__main__()] testData/static_analyzer/w248_complex_key.nut line [14] LOCALS [h] NULL diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_complex_key.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_complex_key.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_complex_key.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_complex_key.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_complexcond.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_complexcond.diag.txt similarity index 73% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_complexcond.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_complexcond.diag.txt index 48011090b..b802c092c 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_complexcond.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_complexcond.diag.txt @@ -1,5 +1,5 @@ WARNING: w248 (access-potentially-nulled) 'expression' can be null, but is used as a container without checking. 
-testData/static_analyser/w248_complexcond.nut:7:8 +testData/static_analyzer/w248_complexcond.nut:7:8 if (x == null && foo(2)) foo(x.y) @@ -11,7 +11,7 @@ else AN ERROR HAS OCCURRED [the index 'x' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w248_complexcond.nut line [9] +*FUNCTION [__main__()] testData/static_analyzer/w248_complexcond.nut line [9] LOCALS [x] NULL diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_complexcond.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_complexcond.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_complexcond.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_complexcond.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_eq_get.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_eq_get.diag.txt similarity index 70% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_eq_get.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_eq_get.diag.txt index 22bf625b3..a4591f14e 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_eq_get.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_eq_get.diag.txt @@ -2,7 +2,7 @@ AN ERROR HAS OCCURRED [the index 'value' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w248_eq_get.nut line [5] +*FUNCTION [__main__()] testData/static_analyzer/w248_eq_get.nut line [5] LOCALS [o] TABLE={} diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_eq_get.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_eq_get.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_eq_get.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_eq_get.nut diff --git 
a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_evaled.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_evaled.diag.txt similarity index 70% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_evaled.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_evaled.diag.txt index 95a6fd48c..664c7eb67 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_evaled.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_evaled.diag.txt @@ -2,7 +2,7 @@ AN ERROR HAS OCCURRED [the index 'value' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w248_evaled.nut line [6] +*FUNCTION [__main__()] testData/static_analyzer/w248_evaled.nut line [6] LOCALS [foo] CLOSURE=FN:foo diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_evaled.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_evaled.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_evaled.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_evaled.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_getfield.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_getfield.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_getfield.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_getfield.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_getfield.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_getfield.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_getfield.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_getfield.nut diff --git 
a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_in.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_in.diag.txt similarity index 78% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_in.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_in.diag.txt index 0633f3608..f115d592a 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_in.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_in.diag.txt @@ -1,5 +1,5 @@ WARNING: w248 (access-potentially-nulled) 'expression' can be null, but is used as a container without checking. -testData/static_analyser/w248_in.nut:6:11 +testData/static_analyzer/w248_in.nut:6:11 return text.d ^--- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_in.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_in.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_in.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_in.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_in_container.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_in_container.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_in_container.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_in_container.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_in_container.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_in_container.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_in_container.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_in_container.nut diff --git 
a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_not.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_not.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_not.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_not.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_not.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_not.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_not.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_not.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_nullc_2.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_nullc_2.diag.txt similarity index 68% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_nullc_2.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_nullc_2.diag.txt index 3ea8ecb42..b5cb8f49a 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_nullc_2.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_nullc_2.diag.txt @@ -2,7 +2,7 @@ AN ERROR HAS OCCURRED [the index 'value' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w248_nullc_2.nut line [5] +*FUNCTION [__main__()] testData/static_analyzer/w248_nullc_2.nut line [5] LOCALS [i] 10 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_nullc_2.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_nullc_2.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_nullc_2.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_nullc_2.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_nullc_3.diag.txt 
b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_nullc_3.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_nullc_3.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_nullc_3.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_nullc_3.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_nullc_3.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_nullc_3.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_nullc_3.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_oror.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_oror.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_oror.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_oror.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_oror.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_oror.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_oror.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_oror.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_oror2.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_oror2.diag.txt similarity index 83% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_oror2.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_oror2.diag.txt index 6fa23f019..08a35c4c7 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_oror2.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_oror2.diag.txt @@ -1,5 +1,5 @@ WARNING: w248 
(access-potentially-nulled) 'nullNotH' can be null, but is used as a function without checking. -testData/static_analyser/w248_oror2.nut:7:4 +testData/static_analyzer/w248_oror2.nut:7:4 onDoubleClickCb(itemDesc.__merge({ rectOrPos = event.targetRect })) nullNotH() diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_oror2.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_oror2.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_oror2.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_oror2.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_override.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_override.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_override.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_override.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_override.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_override.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_override.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_override.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_relative_pred.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_relative_pred.diag.txt similarity index 70% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_relative_pred.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_relative_pred.diag.txt index 9ac5f5670..0518f89ea 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_relative_pred.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_relative_pred.diag.txt 
@@ -1,5 +1,5 @@ WARNING: w200 (potentially-nulled-ops) Comparison operation with potentially nullable expression. -testData/static_analyser/w248_relative_pred.nut:12:9 +testData/static_analyzer/w248_relative_pred.nut:12:9 if (x && y > 0) { ^ @@ -7,7 +7,7 @@ if (x && y > 0) { WARNING: w228 (declared-never-used) variable 'r' was declared but never used. -testData/static_analyser/w248_relative_pred.nut:8:0 +testData/static_analyzer/w248_relative_pred.nut:8:0 local r = null ^-- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_relative_pred.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_relative_pred.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_relative_pred.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_relative_pred.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_special_name_func.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_special_name_func.diag.txt similarity index 71% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_special_name_func.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_special_name_func.diag.txt index f5e928dca..c5074cdb2 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_special_name_func.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_special_name_func.diag.txt @@ -1,5 +1,5 @@ WARNING: w248 (access-potentially-nulled) 'expression' can be null, but is used as a container without checking. 
-testData/static_analyser/w248_special_name_func.nut:4:0 +testData/static_analyzer/w248_special_name_func.nut:4:0 x.indexof(".").foo() ^------------- @@ -9,7 +9,7 @@ x.indexof(".").foo() AN ERROR HAS OCCURRED [the index 'indexof' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w248_special_name_func.nut line [4] +*FUNCTION [__main__()] testData/static_analyzer/w248_special_name_func.nut line [4] LOCALS [x] TABLE={} diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_special_name_func.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_special_name_func.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_special_name_func.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_special_name_func.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_terminated_branch1.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_terminated_branch1.diag.txt similarity index 71% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_terminated_branch1.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_terminated_branch1.diag.txt index 51d3c8a38..caab40ff9 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_terminated_branch1.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_terminated_branch1.diag.txt @@ -1,5 +1,5 @@ WARNING: w210 (potentially-nulled-index) Potentially nullable expression used as array index. -testData/static_analyser/w248_terminated_branch1.nut:19:12 +testData/static_analyzer/w248_terminated_branch1.nut:19:12 res[t] <- {} res[nu] <- {} @@ -8,7 +8,7 @@ testData/static_analyser/w248_terminated_branch1.nut:19:12 WARNING: w210 (potentially-nulled-index) Potentially nullable expression used as array index. 
-testData/static_analyser/w248_terminated_branch1.nut:23:10 +testData/static_analyzer/w248_terminated_branch1.nut:23:10 res[t][g] <- d res[nu][g] <- d @@ -17,7 +17,7 @@ testData/static_analyser/w248_terminated_branch1.nut:23:10 WARNING: w228 (declared-never-used) let 'bar' was declared but never used. -testData/static_analyser/w248_terminated_branch1.nut:8:0 +testData/static_analyzer/w248_terminated_branch1.nut:8:0 function bar() { ^-- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_terminated_branch1.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_terminated_branch1.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_terminated_branch1.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_terminated_branch1.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_tyopeof1.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_tyopeof1.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_tyopeof1.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_tyopeof1.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_tyopeof1.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_tyopeof1.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_tyopeof1.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_tyopeof1.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_tyopeof2.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_tyopeof2.diag.txt similarity index 86% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_tyopeof2.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_tyopeof2.diag.txt index 
0e420e603..a954d4f5b 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_tyopeof2.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_tyopeof2.diag.txt @@ -1,5 +1,5 @@ WARNING: w248 (access-potentially-nulled) 'f' can be null, but is used as a function without checking. -testData/static_analyser/w248_tyopeof2.nut:6:15 +testData/static_analyzer/w248_tyopeof2.nut:6:15 if ((typeof colorStr != "string" || (colorStr.len() != 8 && colorStr.len() != 6)) && (typeof f == "null") ) return f("first param must be string with len 6 or 8") diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_tyopeof2.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_tyopeof2.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_tyopeof2.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_tyopeof2.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_type_func.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_type_func.diag.txt similarity index 85% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_type_func.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_type_func.diag.txt index 013d5200b..1b1c03438 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_type_func.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_type_func.diag.txt @@ -1,5 +1,5 @@ WARNING: w248 (access-potentially-nulled) 'f' can be null, but is used as a function without checking. 
-testData/static_analyser/w248_type_func.nut:4:15 +testData/static_analyzer/w248_type_func.nut:4:15 if ((type(colorStr) != "string" || (colorStr.len() != 8 && colorStr.len() != 6)) && (type(f) == "null") ) return f("first param must be string with len 6 or 8") diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_type_func.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_type_func.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_type_func.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_type_func.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_while_cond.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_while_cond.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_while_cond.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_while_cond.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_while_cond.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_while_cond.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w248_while_cond.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w248_while_cond.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w249.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w249.diag.txt similarity index 72% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w249.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w249.diag.txt index c44389281..a2396c85f 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w249.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w249.diag.txt @@ -1,5 +1,5 @@ WARNING: w248 
(access-potentially-nulled) 'expression' can be null, but is used as a container without checking. -testData/static_analyser/w249.nut:4:7 +testData/static_analyzer/w249.nut:4:7 local a = ::x?.b return a.b[6] @@ -10,7 +10,7 @@ return a.b[6] AN ERROR HAS OCCURRED [the index 'x' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w249.nut line [3] +*FUNCTION [__main__()] testData/static_analyzer/w249.nut line [3] LOCALS [vargv] ARRAY=[] diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w249.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w249.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w249.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w249.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w249_array.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w249_array.diag.txt similarity index 71% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w249_array.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w249_array.diag.txt index 20d3b5a06..a25b52c4a 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w249_array.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w249_array.diag.txt @@ -1,5 +1,5 @@ WARNING: w248 (access-potentially-nulled) 'expression' can be null, but is used as a container without checking. 
-testData/static_analyser/w249_array.nut:4:7 +testData/static_analyzer/w249_array.nut:4:7 local a = ::x?.b return a[6]; @@ -10,7 +10,7 @@ return a[6]; AN ERROR HAS OCCURRED [the index 'x' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w249_array.nut line [3] +*FUNCTION [__main__()] testData/static_analyzer/w249_array.nut line [3] LOCALS [vargv] ARRAY=[] diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w249_array.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w249_array.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w249_array.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w249_array.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w250.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w250.diag.txt similarity index 74% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w250.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w250.diag.txt index b36506244..e4e217131 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w250.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w250.diag.txt @@ -1,5 +1,5 @@ WARNING: w250 (cmp-with-container) Comparison with a array. -testData/static_analyser/w250.nut:5:10 +testData/static_analyzer/w250.nut:5:10 let _x = (::a != []) ^-------- @@ -7,7 +7,7 @@ let _y = (::a != {}) WARNING: w250 (cmp-with-container) Comparison with a declaration. -testData/static_analyser/w250.nut:6:10 +testData/static_analyzer/w250.nut:6:10 let _x = (::a != []) let _y = (::a != {}) @@ -16,7 +16,7 @@ let _z = (::a != t) WARNING: w250 (cmp-with-container) Comparison with a declaration. 
-testData/static_analyser/w250.nut:8:11 +testData/static_analyzer/w250.nut:8:11 let _z = (::a != t) let _xx = (::a == @ (v) v) @@ -27,7 +27,7 @@ let _xx = (::a == @ (v) v) AN ERROR HAS OCCURRED [the index 'a' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w250.nut line [5] +*FUNCTION [__main__()] testData/static_analyzer/w250.nut line [5] LOCALS [t] ARRAY=[] diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w250.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w250.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w250.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w250.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w251.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w251.diag.txt similarity index 67% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w251.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w251.diag.txt index 3601fc4c9..0696a48f7 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w251.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w251.diag.txt @@ -1,5 +1,5 @@ WARNING: w250 (cmp-with-container) Comparison with a declaration. 
-testData/static_analyser/w251.nut:3:8 +testData/static_analyzer/w251.nut:3:8 return (::a != {}) ^--------- @@ -9,7 +9,7 @@ return (::a != {}) AN ERROR HAS OCCURRED [the index 'a' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w251.nut line [3] +*FUNCTION [__main__()] testData/static_analyzer/w251.nut line [3] LOCALS [vargv] ARRAY=[] diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w251.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w251.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w251.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w251.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w254.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w254.diag.txt similarity index 72% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w254.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w254.diag.txt index 73bd4c46b..d24e07828 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w254.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w254.diag.txt @@ -1,5 +1,5 @@ WARNING: w254 (bool-passed-to-strange) Boolean passed to 'in' operator. 
-testData/static_analyser/w254.nut:3:4 +testData/static_analyzer/w254.nut:3:4 if (!"weapModSlotName" in ::item) ^--------------------------- @@ -10,7 +10,7 @@ if (!"weapModSlotName" in ::item) AN ERROR HAS OCCURRED [the index 'item' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w254.nut line [3] +*FUNCTION [__main__()] testData/static_analyzer/w254.nut line [3] LOCALS [vargv] ARRAY=[] diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w254.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w254.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w254.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w254.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w254_instanceof.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w254_instanceof.diag.txt similarity index 74% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w254_instanceof.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w254_instanceof.diag.txt index 4419553c0..1fed77fe0 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w254_instanceof.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w254_instanceof.diag.txt @@ -1,5 +1,5 @@ WARNING: w254 (bool-passed-to-strange) Boolean passed to 'instanceof' operator. 
-testData/static_analyser/w254_instanceof.nut:3:4 +testData/static_analyzer/w254_instanceof.nut:3:4 local x = 10 if (x instanceof !"weapModSlotName") @@ -11,7 +11,7 @@ if (x instanceof !"weapModSlotName") AN ERROR HAS OCCURRED [cannot apply instanceof between a bool and a integer] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w254_instanceof.nut line [3] +*FUNCTION [__main__()] testData/static_analyzer/w254_instanceof.nut line [3] LOCALS [x] 10 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w254_instanceof.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w254_instanceof.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w254_instanceof.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w254_instanceof.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w254_notin.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w254_notin.diag.txt similarity index 72% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w254_notin.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w254_notin.diag.txt index cedac77fe..362799ef4 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w254_notin.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w254_notin.diag.txt @@ -1,5 +1,5 @@ WARNING: w254 (bool-passed-to-strange) Boolean passed to 'in' operator. 
-testData/static_analyser/w254_notin.nut:3:4 +testData/static_analyzer/w254_notin.nut:3:4 if (!"weapModSlotName" not in ::item) ^------------------------------- @@ -10,7 +10,7 @@ if (!"weapModSlotName" not in ::item) AN ERROR HAS OCCURRED [the index 'item' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w254_notin.nut line [3] +*FUNCTION [__main__()] testData/static_analyzer/w254_notin.nut line [3] LOCALS [vargv] ARRAY=[] diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w254_notin.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w254_notin.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w254_notin.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w254_notin.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w255.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w255.diag.txt similarity index 84% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w255.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w255.diag.txt index d1015659b..213c42ed4 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w255.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w255.diag.txt @@ -1,5 +1,5 @@ WARNING: w255 (duplicate-function) Duplicate function body. Consider functions 'onTimer2' and 'onTimer'. 
-testData/static_analyser/w255.nut:29:21 +testData/static_analyzer/w255.nut:29:21 function onTimer(obj, dt) { ^ diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w255.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w255.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w255.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w255.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w255_2.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w255_2.diag.txt similarity index 83% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w255_2.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w255_2.diag.txt index b51e54c8d..5c7ed5095 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w255_2.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w255_2.diag.txt @@ -1,5 +1,5 @@ WARNING: w255 (duplicate-function) Duplicate function body. Consider functions 'onTimer2' and 'onTimer'. 
-testData/static_analyser/w255_2.nut:31:21 +testData/static_analyzer/w255_2.nut:31:21 function onTimer(obj, dt) { ^ diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w255_2.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w255_2.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w255_2.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w255_2.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w256.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w256.diag.txt similarity index 85% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w256.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w256.diag.txt index a4cae1b68..b70194525 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w256.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w256.diag.txt @@ -1,5 +1,5 @@ WARNING: w256 (key-and-function-name) Key and function name are not the same ('_foo2' and 'bar2'). -testData/static_analyser/w256.nut:8:12 +testData/static_analyzer/w256.nut:8:12 "1foo" : function bar1() {}, // FP 1, not id "_foo2" : function bar2() {}, // EXPECTED 1 @@ -8,7 +8,7 @@ testData/static_analyser/w256.nut:8:12 WARNING: w256 (key-and-function-name) Key and function name are not the same ('foo4' and 'bar5'). -testData/static_analyser/w256.nut:11:9 +testData/static_analyzer/w256.nut:11:9 foo4 = function bar5() {}, // EXPECTED 2 ^------------------ @@ -16,7 +16,7 @@ testData/static_analyser/w256.nut:11:9 WARNING: w256 (key-and-function-name) Key and function name are not the same ('foo7' and 'bar7'). 
-testData/static_analyser/w256.nut:14:13 +testData/static_analyzer/w256.nut:14:13 ["foo7"] = function bar7() {}, // EXPECTED 3 ^------------------ @@ -24,7 +24,7 @@ testData/static_analyser/w256.nut:14:13 WARNING: w256 (key-and-function-name) Key and function name are not the same ('qux' and 'fex'). -testData/static_analyser/w256.nut:21:10 +testData/static_analyzer/w256.nut:21:10 tt.foo <- bar // FP 5 tt.qux <- function fex() {} // EXPECTED 4 @@ -33,7 +33,7 @@ tt["fk"] <- function uyte() {} // EXPECTED 5 WARNING: w256 (key-and-function-name) Key and function name are not the same ('fk' and 'uyte'). -testData/static_analyser/w256.nut:22:12 +testData/static_analyzer/w256.nut:22:12 tt.qux <- function fex() {} // EXPECTED 4 tt["fk"] <- function uyte() {} // EXPECTED 5 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w256.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w256.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w256.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w256.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w257.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w257.diag.txt similarity index 77% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w257.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w257.diag.txt index 35d806b51..b5a691e4f 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w257.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w257.diag.txt @@ -1,5 +1,5 @@ WARNING: w257 (duplicate-assigned-expr) Duplicate of the assigned expression. 
-testData/static_analyser/w257.nut:14:26 +testData/static_analyzer/w257.nut:14:26 ] local numTextAnimations = [ @@ -11,7 +11,7 @@ local numTextAnimations = [ AN ERROR HAS OCCURRED [the index 'aaa' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w257.nut line [5] +*FUNCTION [__main__()] testData/static_analyzer/w257.nut line [5] LOCALS [OutCubic] 2 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w257.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w257.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w257.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w257.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w258.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w258.diag.txt similarity index 87% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w258.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w258.diag.txt index c662b7980..b7380c7ba 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w258.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w258.diag.txt @@ -1,5 +1,5 @@ WARNING: w258 (similar-function) Function bodies are very similar. Consider functions 'updateEventLb' and 'updateEventLbSelfRow'. 
-testData/static_analyser/w258.nut:46:34 +testData/static_analyzer/w258.nut:46:34 function updateEventLbSelfRow(requestData, id) { ^ diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w258.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w258.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w258.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w258.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w258_2.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w258_2.diag.txt similarity index 87% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w258_2.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w258_2.diag.txt index abca848aa..808eb5fc4 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w258_2.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w258_2.diag.txt @@ -1,5 +1,5 @@ WARNING: w258 (similar-function) Function bodies are very similar. Consider functions 'updateEventLb' and 'updateEventLbSelfRow'. 
-testData/static_analyser/w258_2.nut:47:34 +testData/static_analyzer/w258_2.nut:47:34 function updateEventLbSelfRow(requestData, id) { ^ diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w258_2.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w258_2.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w258_2.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w258_2.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w259.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w259.diag.txt similarity index 78% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w259.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w259.diag.txt index 44065762d..02152c82b 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w259.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w259.diag.txt @@ -1,5 +1,5 @@ WARNING: w259 (similar-assigned-expr) Assigned expression is very similar to one of the previous ones. 
-testData/static_analyser/w259.nut:20:26 +testData/static_analyzer/w259.nut:20:26 local numAnimations = ::a + ::b + ::c + ::d - (::a + ::b + ::c + ::d) * ::x + 123 local numTextAnimations = ::a + ::b + ::c + ::d - (::a + ::b + ::c + ::d) * ::x + 124 // EXPECTED @@ -10,16 +10,14 @@ local numTextAnimations = ::a + ::b + ::c + ::d - (::a + ::b + ::c + ::d) * ::x AN ERROR HAS OCCURRED [the index 'a' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w259.nut line [19] +*FUNCTION [__main__()] testData/static_analyzer/w259.nut line [19] LOCALS [_M] CLOSURE=FN:_M [_P] CLOSURE=FN:_P [S] CLOSURE=FN:S [_ttp] NULL -[$ch1] CLOSURE=FN:(w259.nut:12) [_tt] NULL -[$ch0] CLOSURE=FN:(w259.nut:9) [au] TABLE={} [Computed] CLOSURE=FN:Computed [vargv] ARRAY=[] diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w259.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w259.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w259.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w259.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w260_local_function.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w260_local_function.diag.txt similarity index 83% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w260_local_function.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w260_local_function.diag.txt index 26e336191..0b93561c6 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w260_local_function.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w260_local_function.diag.txt @@ -1,5 +1,5 @@ WARNING: w260 (named-like-must-return-result) Function 'getSettings' has name like it should return a value, but not all control paths returns a value. 
-testData/static_analyser/w260_local_function.nut:3:6 +testData/static_analyzer/w260_local_function.nut:3:6 local function getSettings(path) { return } //-declared-never-used ^------------------------------------ diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w260_local_function.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w260_local_function.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w260_local_function.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w260_local_function.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w260_table.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w260_table.diag.txt similarity index 87% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w260_table.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w260_table.diag.txt index 8bd6eae55..07add99da 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w260_table.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w260_table.diag.txt @@ -1,5 +1,5 @@ WARNING: w260 (named-like-must-return-result) Function 'get_setting_by_blk_path' has name like it should return a value, but not all control paths returns a value. 
-testData/static_analyser/w260_table.nut:4:28 +testData/static_analyzer/w260_table.nut:4:28 ::game <- { get_setting_by_blk_path = function(path) { return } //-declared-never-used diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w260_table.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w260_table.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w260_table.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w260_table.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w260_table_sqconfig.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w260_table_sqconfig.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w260_table_sqconfig.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w260_table_sqconfig.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w260_table_sqconfig.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w260_table_sqconfig.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w260_table_sqconfig.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w260_table_sqconfig.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w262.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w262.diag.txt similarity index 75% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w262.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w262.diag.txt index 6f8da67d8..72259243c 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w262.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w262.diag.txt @@ -1,5 +1,5 @@ WARNING: w262 (suspicious-formatting) Suspicious code formatting. 
-testData/static_analyser/w262.nut:8:0 +testData/static_analyzer/w262.nut:8:0 print(1) else // EXPECTED 1 @@ -8,7 +8,7 @@ else // EXPECTED 1 WARNING: w262 (suspicious-formatting) Suspicious code formatting. -testData/static_analyser/w262.nut:13:4 +testData/static_analyzer/w262.nut:13:4 print(3) print(4) // EXPECTED 2 @@ -16,7 +16,7 @@ testData/static_analyser/w262.nut:13:4 WARNING: w262 (suspicious-formatting) Suspicious code formatting. -testData/static_analyser/w262.nut:17:4 +testData/static_analyzer/w262.nut:17:4 print(x) print(y) // EXPECTED 3 @@ -24,7 +24,7 @@ testData/static_analyser/w262.nut:17:4 WARNING: w262 (suspicious-formatting) Suspicious code formatting. -testData/static_analyser/w262.nut:25:4 +testData/static_analyzer/w262.nut:25:4 print(x) print(y) // EXPECTED 4 @@ -32,7 +32,7 @@ print(x) WARNING: w262 (suspicious-formatting) Suspicious code formatting. -testData/static_analyser/w262.nut:29:4 +testData/static_analyzer/w262.nut:29:4 print(x) print(y) // EXPECTED 5 @@ -40,7 +40,7 @@ testData/static_analyser/w262.nut:29:4 WARNING: w262 (suspicious-formatting) Suspicious code formatting. -testData/static_analyser/w262.nut:38:4 +testData/static_analyzer/w262.nut:38:4 print(x) print(y) // EXPECTED 6 @@ -48,7 +48,7 @@ testData/static_analyser/w262.nut:38:4 WARNING: w262 (suspicious-formatting) Suspicious code formatting. -testData/static_analyser/w262.nut:24:0 +testData/static_analyzer/w262.nut:24:0 while (false) print(x) @@ -57,7 +57,7 @@ print(x) WARNING: w262 (suspicious-formatting) Suspicious code formatting. -testData/static_analyser/w262.nut:57:0 +testData/static_analyzer/w262.nut:57:0 if (x) // EXPECTED 7 if (y) @@ -66,7 +66,7 @@ if (y) WARNING: w262 (suspicious-formatting) Suspicious code formatting. -testData/static_analyser/w262.nut:87:0 +testData/static_analyzer/w262.nut:87:0 while (x > 100) print(3) // EXPECTED 8 @@ -74,7 +74,7 @@ print(3) // EXPECTED 8 WARNING: w262 (suspicious-formatting) Suspicious code formatting. 
-testData/static_analyser/w262.nut:91:0 +testData/static_analyzer/w262.nut:91:0 for (;false;) print(3) // EXPECTED 9 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w262.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w262.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w262.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w262.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w263.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w263.diag.txt similarity index 75% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w263.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w263.diag.txt index 4ad6742ac..d0e90d7df 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w263.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w263.diag.txt @@ -1,5 +1,5 @@ WARNING: w263 (egyptian-braces) Indentation style: 'egyptian braces' required. -testData/static_analyser/w263.nut:2:0 +testData/static_analyzer/w263.nut:2:0 for (local i = 0; i < 5; i++) // EXPECTED { @@ -8,7 +8,7 @@ for (local i = 0; i < 5; i++) // EXPECTED WARNING: w263 (egyptian-braces) Indentation style: 'egyptian braces' required. -testData/static_analyser/w263.nut:7:0 +testData/static_analyzer/w263.nut:7:0 function _foo(_p) // EXPECTED { @@ -16,7 +16,7 @@ function _foo(_p) // EXPECTED WARNING: w263 (egyptian-braces) Indentation style: 'egyptian braces' required. -testData/static_analyser/w263.nut:21:0 +testData/static_analyzer/w263.nut:21:0 class _B // EXPECTED { @@ -24,7 +24,7 @@ class _B // EXPECTED WARNING: w263 (egyptian-braces) Indentation style: 'egyptian braces' required. 
-testData/static_analyser/w263.nut:30:0 +testData/static_analyzer/w263.nut:30:0 enum _E2 // EXPECTED { @@ -32,7 +32,7 @@ enum _E2 // EXPECTED WARNING: w263 (egyptian-braces) Indentation style: 'egyptian braces' required. -testData/static_analyser/w263.nut:41:0 +testData/static_analyzer/w263.nut:41:0 try // EXPECTED { @@ -41,7 +41,7 @@ try // EXPECTED WARNING: w263 (egyptian-braces) Indentation style: 'egyptian braces' required. -testData/static_analyser/w263.nut:44:0 +testData/static_analyzer/w263.nut:44:0 } catch (_e) // EXPECTED { @@ -50,7 +50,7 @@ testData/static_analyser/w263.nut:44:0 WARNING: w263 (egyptian-braces) Indentation style: 'egyptian braces' required. -testData/static_analyser/w263.nut:57:0 +testData/static_analyzer/w263.nut:57:0 if (!t) // EXPECTED { @@ -59,7 +59,7 @@ if (!t) // EXPECTED WARNING: w263 (egyptian-braces) Indentation style: 'egyptian braces' required. -testData/static_analyser/w263.nut:60:0 +testData/static_analyzer/w263.nut:60:0 } else // EXPECTED { @@ -68,7 +68,7 @@ testData/static_analyser/w263.nut:60:0 WARNING: w263 (egyptian-braces) Indentation style: 'egyptian braces' required. -testData/static_analyser/w263.nut:69:0 +testData/static_analyzer/w263.nut:69:0 while (t) // EXPECTED { @@ -77,7 +77,7 @@ while (t) // EXPECTED WARNING: w263 (egyptian-braces) Indentation style: 'egyptian braces' required. -testData/static_analyser/w263.nut:78:0 +testData/static_analyzer/w263.nut:78:0 do // EXPECTED { @@ -86,7 +86,7 @@ do // EXPECTED WARNING: w263 (egyptian-braces) Indentation style: 'egyptian braces' required. -testData/static_analyser/w263.nut:89:4 +testData/static_analyzer/w263.nut:89:4 default: // EXPECTED { @@ -95,7 +95,7 @@ testData/static_analyser/w263.nut:89:4 WARNING: w263 (egyptian-braces) Indentation style: 'egyptian braces' required. 
-testData/static_analyser/w263.nut:95:0 +testData/static_analyzer/w263.nut:95:0 switch (t) // EXPECTED { diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w263.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w263.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w263.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w263.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w264.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w264.diag.txt similarity index 71% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w264.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w264.diag.txt index 68f4aef30..bd00ea53b 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w264.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w264.diag.txt @@ -1,5 +1,5 @@ WARNING: w264 (plus-string) Usage of '+' for string concatenation. 
-testData/static_analyser/w264.nut:7:7 +testData/static_analyzer/w264.nut:7:7 return a + 3 ^---- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w264.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w264.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w264.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w264.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w266.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w266.diag.txt similarity index 69% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w266.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w266.diag.txt index e661b4eb0..5e381d183 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w266.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w266.diag.txt @@ -1,5 +1,5 @@ WARNING: w266 (forgotten-do) 'while' after the statement list (forgot 'do' ?) -testData/static_analyser/w266.nut:5:2 +testData/static_analyzer/w266.nut:5:2 ::x++ } while (::x) @@ -8,7 +8,7 @@ testData/static_analyser/w266.nut:5:2 WARNING: w262 (suspicious-formatting) Suspicious code formatting. 
-testData/static_analyser/w266.nut:6:0 +testData/static_analyzer/w266.nut:6:0 } while (::x) ::x-- @@ -19,7 +19,7 @@ testData/static_analyser/w266.nut:6:0 AN ERROR HAS OCCURRED [the index 'x' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w266.nut line [4] +*FUNCTION [__main__()] testData/static_analyzer/w266.nut line [4] LOCALS [vargv] ARRAY=[] diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w266.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w266.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w266.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w266.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w267.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w267.diag.txt similarity index 78% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w267.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w267.diag.txt index b11cef605..c9000d488 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w267.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w267.diag.txt @@ -1,19 +1,19 @@ WARNING: w267 (suspicious-bracket) '(' will be parsed as 'function call' (forgot ',' ?) -testData/static_analyser/w267.nut:4:11 +testData/static_analyzer/w267.nut:4:11 foo("sum=" (6+7)) // EXPECTED 1 ^ WARNING: w267 (suspicious-bracket) '[' will be parsed as 'access to member' (forgot ',' ?) -testData/static_analyser/w267.nut:6:13 +testData/static_analyzer/w267.nut:6:13 foo("array:" [6]) // EXPECTED 2 access ^ WARNING: w267 (suspicious-bracket) '[' will be parsed as 'access to member' (forgot ',' ?) 
-testData/static_analyser/w267.nut:11:8 +testData/static_analyzer/w267.nut:11:8 // [6] // compilation error "t" [7] // EXPECTED 3 access @@ -22,7 +22,7 @@ testData/static_analyser/w267.nut:11:8 WARNING: w267 (suspicious-bracket) '(' will be parsed as 'function call' (forgot ',' ?) -testData/static_analyser/w267.nut:12:8 +testData/static_analyzer/w267.nut:12:8 "t" [7] // EXPECTED 3 access "y" (6+7) // EXPECTED 4 @@ -31,7 +31,7 @@ testData/static_analyser/w267.nut:12:8 WARNING: w267 (suspicious-bracket) '(' will be parsed as 'function call' (forgot ',' ?) -testData/static_analyser/w267.nut:14:4 +testData/static_analyzer/w267.nut:14:4 "z" (6+7) // EXPECTED 5 @@ -43,7 +43,7 @@ testData/static_analyser/w267.nut:14:4 AN ERROR HAS OCCURRED [attempt to call 'string'] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w267.nut line [4] +*FUNCTION [__main__()] testData/static_analyzer/w267.nut line [4] LOCALS [foo] CLOSURE=FN:foo diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w267.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w267.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w267.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w267.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w269.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w269.diag.txt similarity index 75% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w269.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w269.diag.txt index 756770d09..3ba115fd2 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w269.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w269.diag.txt @@ -1,5 +1,5 @@ WARNING: w269 (mixed-separators) Mixed spaces and commas to separate elements of array. 
-testData/static_analyser/w269.nut:4:6 +testData/static_analyzer/w269.nut:4:6 20 30, @@ -8,7 +8,7 @@ testData/static_analyser/w269.nut:4:6 WARNING: w269 (mixed-separators) Mixed spaces and commas to separate elements of array. -testData/static_analyser/w269.nut:13:4 +testData/static_analyzer/w269.nut:13:4 30 40 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w269.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w269.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w269.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w269.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w270.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w270.diag.txt similarity index 81% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w270.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w270.diag.txt index 46b449d6d..6c9d896aa 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w270.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w270.diag.txt @@ -1,5 +1,5 @@ HINT: h270 (extent-to-append) It is better to use 'append(A, B, ...)' instead of 'extend([A, B, ...])'. 
-testData/static_analyser/w270.nut:3:0 +testData/static_analyzer/w270.nut:3:0 ::handlersManager[::PERSISTENT_DATA_PARAMS].extend([ "curControlsAllowMask", "isCurSceneBgBlurred" ]) // -undefined-global ^---------------------------------------------------------------------------------------------------- @@ -9,7 +9,7 @@ testData/static_analyser/w270.nut:3:0 AN ERROR HAS OCCURRED [the index 'handlersManager' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w270.nut line [3] +*FUNCTION [__main__()] testData/static_analyzer/w270.nut line [3] LOCALS [vargv] ARRAY=[] diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w270.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w270.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w270.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w270.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w271.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w271.diag.txt similarity index 80% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w271.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w271.diag.txt index 6128c9753..bc81fac07 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w271.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w271.diag.txt @@ -1,5 +1,5 @@ WARNING: w271 (forgot-subst) '{}' found inside string (forgot 'subst' or '$' ?). 
-testData/static_analyser/w271.nut:9:15 +testData/static_analyzer/w271.nut:9:15 else return "$ xxxx={x}" // EXPECTED diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w271.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w271.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w271.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w271.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w272.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w272.diag.txt similarity index 82% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w272.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w272.diag.txt index f250b492a..5afd638f0 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w272.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w272.diag.txt @@ -1,5 +1,5 @@ WARNING: w272 (not-unary-op) This '-' is not unary operator. Please use ' ' after it or ',' before it for better understandability. -testData/static_analyser/w272.nut:5:4 +testData/static_analyzer/w272.nut:5:4 2 -3 // EXPECTED 1 @@ -8,7 +8,7 @@ testData/static_analyser/w272.nut:5:4 WARNING: w272 (not-unary-op) This '+' is not unary operator. Please use ' ' after it or ',' before it for better understandability. 
-testData/static_analyser/w272.nut:12:5 +testData/static_analyzer/w272.nut:12:5 0 +1 // EXPECTED 2 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w272.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w272.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w272.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w272.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w273.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w273.diag.txt similarity index 56% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w273.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w273.diag.txt index a0d32362e..4ff6b9b7c 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w273.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w273.diag.txt @@ -1,5 +1,5 @@ ERROR: Unknown variable [x] -testData/static_analyser/w273.nut:3:0 +testData/static_analyzer/w273.nut:3:0 x <- 6 ^ diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w273.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w273.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w273.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w273.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w274.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w274.diag.txt similarity index 79% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w274.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w274.diag.txt index a36d684ed..b5b045ccc 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w274.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w274.diag.txt @@ -1,5 +1,5 @@ 
WARNING: w274 (iterator-in-lambda) Iterator 'v' is trying to be captured in closure. -testData/static_analyser/w274.nut:6:17 +testData/static_analyzer/w274.nut:6:17 foreach (v in [1, 2, 3]) tab.append(@() v) diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w274.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w274.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w274.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w274.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w274_2.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w274_2.diag.txt similarity index 82% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w274_2.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w274_2.diag.txt index 7a8494116..f74f55171 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w274_2.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w274_2.diag.txt @@ -1,5 +1,5 @@ WARNING: w274 (iterator-in-lambda) Iterator 'x' is trying to be captured in closure. 
-testData/static_analyser/w274_2.nut:5:23 +testData/static_analyzer/w274_2.nut:5:23 foreach (_, x in a) { m.append(@() print(x)) diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w274_2.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w274_2.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w274_2.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w274_2.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w275.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w275.diag.txt similarity index 82% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w275.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w275.diag.txt index 83db12ed8..30a8d6a4c 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w275.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w275.diag.txt @@ -1,5 +1,5 @@ WARNING: w275 (missed-break) A 'break' statement is probably missing in a 'switch' statement. 
-testData/static_analyser/w275.nut:19:6 +testData/static_analyzer/w275.nut:19:6 case 4: ::print("ccc") diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w275.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w275.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w275.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w275.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w275_all_variants.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w275_all_variants.diag.txt similarity index 75% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w275_all_variants.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w275_all_variants.diag.txt index 57692bb6d..bc5d16d6a 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w275_all_variants.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w275_all_variants.diag.txt @@ -1,5 +1,5 @@ WARNING: w275 (missed-break) A 'break' statement is probably missing in a 'switch' statement. -testData/static_analyser/w275_all_variants.nut:9:8 +testData/static_analyzer/w275_all_variants.nut:9:8 case 1: foo(); // OK @@ -8,7 +8,7 @@ testData/static_analyser/w275_all_variants.nut:9:8 WARNING: w275 (missed-break) A 'break' statement is probably missing in a 'switch' statement. 
-testData/static_analyser/w275_all_variants.nut:20:8 +testData/static_analyzer/w275_all_variants.nut:20:8 case 3: foo() // OK diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w275_all_variants.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w275_all_variants.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w275_all_variants.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w275_all_variants.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w275_complex.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w275_complex.diag.txt similarity index 77% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w275_complex.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w275_complex.diag.txt index 56af04d1c..8782360c0 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w275_complex.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w275_complex.diag.txt @@ -2,7 +2,7 @@ AN ERROR HAS OCCURRED [the index 'a' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w275_complex.nut line [11] +*FUNCTION [__main__()] testData/static_analyzer/w275_complex.nut line [11] LOCALS [someBound] false diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w275_complex.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w275_complex.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w275_complex.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w275_complex.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w277.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w277.diag.txt similarity index 68% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w277.diag.txt rename to 
prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w277.diag.txt index ddb000bf3..b07271478 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w277.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w277.diag.txt @@ -1,5 +1,5 @@ WARNING: w277 (space-at-eol) Whitespace at the end of line. -testData/static_analyser/w277.nut.txt:1:0 +testData/static_analyzer/w277.nut.txt:1:0 ^ @@ -7,7 +7,7 @@ testData/static_analyser/w277.nut.txt:1:0 WARNING: w277 (space-at-eol) Whitespace at the end of line. -testData/static_analyser/w277.nut.txt:3:0 +testData/static_analyzer/w277.nut.txt:3:0 //expect:w277 @@ -16,14 +16,14 @@ testData/static_analyser/w277.nut.txt:3:0 WARNING: w277 (space-at-eol) Whitespace at the end of line. -testData/static_analyser/w277.nut.txt:4:13 +testData/static_analyzer/w277.nut.txt:4:13 //expect:w277 ^ WARNING: w277 (space-at-eol) Whitespace at the end of line. -testData/static_analyser/w277.nut.txt:8:17 +testData/static_analyzer/w277.nut.txt:8:17 function _foo() { ^ @@ -31,7 +31,7 @@ function _foo() { WARNING: w277 (space-at-eol) Whitespace at the end of line. -testData/static_analyser/w277.nut.txt:13:14 +testData/static_analyzer/w277.nut.txt:13:14 //expect:w277 let _z = @"abc @@ -40,7 +40,7 @@ let _z = @"abc WARNING: w277 (space-at-eol) Whitespace at the end of line. -testData/static_analyser/w277.nut.txt:18:9 +testData/static_analyzer/w277.nut.txt:18:9 /* aas ^ @@ -48,7 +48,7 @@ testData/static_analyser/w277.nut.txt:18:9 WARNING: w277 (space-at-eol) Whitespace at the end of line. 
-testData/static_analyser/w277.nut.txt:21:0 +testData/static_analyzer/w277.nut.txt:21:0 ^ diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w277.nut.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w277.nut.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w277.nut.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w277.nut.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w279_1.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w279_1.diag.txt similarity index 81% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w279_1.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w279_1.diag.txt index e2d0aa0b8..7ffb0512b 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w279_1.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w279_1.diag.txt @@ -1,5 +1,5 @@ WARNING: w279 (mismatch-loop-variable) The variable used in for-loop does not match the initialized one. -testData/static_analyser/w279_1.nut:4:18 +testData/static_analyzer/w279_1.nut:4:18 local j; for (local k = 0; j < 5; k++) { @@ -8,7 +8,7 @@ for (local k = 0; j < 5; k++) { WARNING: w200 (potentially-nulled-ops) Comparison operation with potentially nullable expression. 
-testData/static_analyser/w279_1.nut:4:18 +testData/static_analyzer/w279_1.nut:4:18 local j; for (local k = 0; j < 5; k++) { diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w279_1.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w279_1.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w279_1.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w279_1.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w279_2.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w279_2.diag.txt similarity index 82% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w279_2.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w279_2.diag.txt index 7cf079be2..b8bd81a1e 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w279_2.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w279_2.diag.txt @@ -1,5 +1,5 @@ WARNING: w279 (mismatch-loop-variable) The variable used in for-loop does not match the initialized one. 
-testData/static_analyser/w279_2.nut:4:25 +testData/static_analyzer/w279_2.nut:4:25 local j = 0; for (local k = 0; k < 5; j++) { diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w279_2.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w279_2.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w279_2.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w279_2.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w280.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w280.diag.txt similarity index 82% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w280.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w280.diag.txt index 6714444e6..c86d8e5c5 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w280.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w280.diag.txt @@ -1,5 +1,5 @@ WARNING: w280 (forbidden-parent-dir) Access to the parent directory is forbidden in this function. 
-testData/static_analyser/w280.nut:3:9 +testData/static_analyzer/w280.nut:3:9 let _x = require_optional("../a.nut") ^--------------------------- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w280.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w280.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w280.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w280.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w281.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w281.diag.txt similarity index 85% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w281.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w281.diag.txt index 0c5ce232b..b8f34fd00 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w281.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w281.diag.txt @@ -1,5 +1,5 @@ WARNING: w281 (unwanted-modification) Function 'extend' modifies object. You probably didn't want to modify the object here. -testData/static_analyser/w281.nut:9:9 +testData/static_analyzer/w281.nut:9:9 let function fn(arr) { return (arr ?? []).extend(x) @@ -8,7 +8,7 @@ let function fn(arr) { WARNING: w281 (unwanted-modification) Function 'extend' modifies object. You probably didn't want to modify the object here. -testData/static_analyser/w281.nut:14:12 +testData/static_analyzer/w281.nut:14:12 local tab = { watch = (::alertWatched ? ::alertWatched : []).extend(::titleWatched ? ::titleWatched : []) @@ -17,7 +17,7 @@ local tab = { WARNING: w281 (unwanted-modification) Function 'extend' modifies object. You probably didn't want to modify the object here. -testData/static_analyser/w281.nut:17:36 +testData/static_analyzer/w281.nut:17:36 local getSeasonMainPrizesData = @() (::premiumUnlock.value?.meta.promo ?? 
[]) ^----------- @@ -28,7 +28,7 @@ local getSeasonMainPrizesData = @() (::premiumUnlock.value?.meta.promo ?? []) AN ERROR HAS OCCURRED [the index 'alertWatched' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w281.nut line [14] +*FUNCTION [__main__()] testData/static_analyzer/w281.nut line [14] LOCALS [fn] CLOSURE=FN:fn diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w281.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w281.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w281.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w281.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w282.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w282.diag.txt similarity index 73% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w282.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w282.diag.txt index 2e4e24a7d..c85578bf4 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w282.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w282.diag.txt @@ -1,5 +1,5 @@ WARNING: w282 (inexpr-assign-priority) Operator ':=' has lower priority. Perhaps parentheses are missing?. -testData/static_analyser/w282.nut:3:4 +testData/static_analyzer/w282.nut:3:4 local uid = 0 if (uid := invite_info?.leader.id.tostring()!=null) @@ -8,7 +8,7 @@ if (uid := invite_info?.leader.id.tostring()!=null) WARNING: w282 (inexpr-assign-priority) Operator ':=' has lower priority. Perhaps parentheses are missing?. 
-testData/static_analyser/w282.nut:7:9 +testData/static_analyzer/w282.nut:7:9 local a = ::fn() while (a := ::fn() || ::fn2()) @@ -20,32 +20,32 @@ local a = ::fn() AN ERROR HAS OCCURRED [the index 'fn' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w282.nut line [1] +*FUNCTION [__main__()] testData/static_analyzer/w282.nut line [1] LOCALS [vargv] ARRAY=[] [this] TABLE={} WARNING: w295 (undefined-global) Undefined global identifier 'Contact'. -testData/static_analyser/w282.nut:4:23 +testData/static_analyzer/w282.nut:4:23 WARNING: w295 (undefined-global) Undefined global identifier 'addInviteByContact'. -testData/static_analyser/w282.nut:4:2 +testData/static_analyzer/w282.nut:4:2 WARNING: w295 (undefined-global) Undefined global identifier 'fn'. -testData/static_analyser/w282.nut:1:20 +testData/static_analyzer/w282.nut:1:20 WARNING: w295 (undefined-global) Undefined global identifier 'fn'. -testData/static_analyser/w282.nut:6:10 +testData/static_analyzer/w282.nut:6:10 WARNING: w295 (undefined-global) Undefined global identifier 'fn'. -testData/static_analyser/w282.nut:7:14 +testData/static_analyzer/w282.nut:7:14 WARNING: w295 (undefined-global) Undefined global identifier 'fn2'. 
-testData/static_analyser/w282.nut:7:24 +testData/static_analyzer/w282.nut:7:24 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w282.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w282.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w282.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w282.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w283.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w283.diag.txt similarity index 73% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w283.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w283.diag.txt index b65654df8..4329798ac 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w283.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w283.diag.txt @@ -1,5 +1,5 @@ WARNING: w283 (useless-null-coalescing) The expression to the right of the '??' is null. -testData/static_analyser/w283.nut:5:21 +testData/static_analyzer/w283.nut:5:21 let function fn(x) { return ::y.cc ?? x ?? null @@ -8,7 +8,7 @@ let function fn(x) { WARNING: w283 (useless-null-coalescing) The expression to the right of the '??' is null. -testData/static_analyser/w283.nut:9:10 +testData/static_analyzer/w283.nut:9:10 local s = null local x = ::y ?? s @@ -20,7 +20,7 @@ return x AN ERROR HAS OCCURRED [the index 'y' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w283.nut line [9] +*FUNCTION [__main__()] testData/static_analyzer/w283.nut line [9] LOCALS [s] NULL @@ -28,10 +28,10 @@ LOCALS [vargv] ARRAY=[] [this] TABLE={} WARNING: w295 (undefined-global) Undefined global identifier 'y'. -testData/static_analyser/w283.nut:5:11 +testData/static_analyzer/w283.nut:5:11 WARNING: w295 (undefined-global) Undefined global identifier 'y'. 
-testData/static_analyser/w283.nut:9:10 +testData/static_analyzer/w283.nut:9:10 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w283.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w283.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w283.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w283.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w284.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w284.diag.txt similarity index 70% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w284.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w284.diag.txt index acd75b997..879a2c13d 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w284.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w284.diag.txt @@ -1,12 +1,12 @@ WARNING: w284 (can-be-simplified) Expression can be simplified. -testData/static_analyser/w284.nut:5:10 +testData/static_analyzer/w284.nut:5:10 let _c1 = ::a > 2 || ::a > 100 ^------------------- WARNING: w284 (can-be-simplified) Expression can be simplified. -testData/static_analyser/w284.nut:7:10 +testData/static_analyzer/w284.nut:7:10 let _c2 = fn(1) != null ? fn(1) : null ^--------------------------- @@ -16,21 +16,21 @@ let _c2 = fn(1) != null ? fn(1) : null AN ERROR HAS OCCURRED [the index 'a' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w284.nut line [5] +*FUNCTION [__main__()] testData/static_analyzer/w284.nut line [5] LOCALS [fn] CLOSURE=FN:fn [vargv] ARRAY=[] [this] TABLE={} WARNING: w295 (undefined-global) Undefined global identifier 'a'. -testData/static_analyser/w284.nut:5:10 +testData/static_analyzer/w284.nut:5:10 WARNING: w295 (undefined-global) Undefined global identifier 'a'. 
-testData/static_analyser/w284.nut:5:21 +testData/static_analyzer/w284.nut:5:21 WARNING: w295 (undefined-global) Undefined global identifier 'y'. -testData/static_analyser/w284.nut:2:11 +testData/static_analyzer/w284.nut:2:11 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w284.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w284.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w284.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w284.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w285.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w285.diag.txt similarity index 80% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w285.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w285.diag.txt index c2f093110..0d5d52463 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w285.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w285.diag.txt @@ -1,5 +1,5 @@ WARNING: w285 (expr-cannot-be-null) The expression to the left of the 'null coalescing' cannot be null. -testData/static_analyser/w285.nut:6:5 +testData/static_analyzer/w285.nut:6:5 local uu = ::sys.gh("fff") ?? "" if ((uu ?? "") != "") @@ -8,14 +8,14 @@ if ((uu ?? "") != "") WARNING: w285 (expr-cannot-be-null) The expression to the left of the 'null coalescing' cannot be null. -testData/static_analyser/w285.nut:10:42 +testData/static_analyzer/w285.nut:10:42 local regions = ::unlock?.meta.regions ?? [::unlock?.meta.region] ?? [] ^---------------------- WARNING: w285 (expr-cannot-be-null) The expression to the left of the 'null coalescing' cannot be null. -testData/static_analyser/w285.nut:13:9 +testData/static_analyzer/w285.nut:13:9 local regions2 = ::x ? [] : {} let _g = regions2 ?? 123 @@ -23,7 +23,7 @@ let _g = regions2 ?? 
123 WARNING: w285 (expr-cannot-be-null) The expression to the left of the 'equal check' cannot be null. -testData/static_analyser/w285.nut:17:9 +testData/static_analyzer/w285.nut:17:9 local regions3 = ::x ? 2 : 4 let _h = regions3 != null @@ -34,7 +34,7 @@ let _h = regions3 != null AN ERROR HAS OCCURRED [the index 'sys' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w285.nut line [5] +*FUNCTION [__main__()] testData/static_analyzer/w285.nut line [5] LOCALS [vargv] ARRAY=[] diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w285.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w285.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w285.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w285.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w286.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w286.diag.txt similarity index 78% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w286.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w286.diag.txt index a65ea0e9b..0157aadb3 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w286.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w286.diag.txt @@ -1,5 +1,5 @@ WARNING: w286 (decl-in-expression) Declaration used in arith expression as operand. 
-testData/static_analyser/w286.nut:5:7 +testData/static_analyzer/w286.nut:5:7 return fn1 || ::fn2 // -undefined-global ^-- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w286.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w286.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w286.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w286.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w286_2.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w286_2.diag.txt similarity index 79% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w286_2.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w286_2.diag.txt index 80d6c7bc9..ae55f706f 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w286_2.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w286_2.diag.txt @@ -1,5 +1,5 @@ WARNING: w286 (decl-in-expression) Declaration used in arith expression as operand. 
-testData/static_analyser/w286_2.nut:10:7 +testData/static_analyzer/w286_2.nut:10:7 return cls || ::fn2 //-const-in-bool-expr -undefined-global ^-- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w286_2.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w286_2.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w286_2.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w286_2.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w286_oror_andand.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w286_oror_andand.diag.txt similarity index 74% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w286_oror_andand.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w286_oror_andand.diag.txt index b32190241..f37863fd0 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w286_oror_andand.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w286_oror_andand.diag.txt @@ -1,5 +1,5 @@ WARNING: w286 (decl-in-expression) Declaration used in arith expression as operand. -testData/static_analyser/w286_oror_andand.nut:6:9 +testData/static_analyzer/w286_oror_andand.nut:6:9 let _p = t?.x || {} let _x = {} || t?.y @@ -8,7 +8,7 @@ let _y = t?.z && [] WARNING: w233 (const-in-bool-expr) Constant in a boolean expression. -testData/static_analyser/w286_oror_andand.nut:6:9 +testData/static_analyzer/w286_oror_andand.nut:6:9 let _p = t?.x || {} let _x = {} || t?.y @@ -17,7 +17,7 @@ let _y = t?.z && [] WARNING: w233 (const-in-bool-expr) Constant in a boolean expression. -testData/static_analyser/w286_oror_andand.nut:7:9 +testData/static_analyzer/w286_oror_andand.nut:7:9 let _x = {} || t?.y let _y = t?.z && [] @@ -26,7 +26,7 @@ let _z = [] && t?.g WARNING: w286 (decl-in-expression) Declaration used in arith expression as operand. 
-testData/static_analyser/w286_oror_andand.nut:8:9 +testData/static_analyzer/w286_oror_andand.nut:8:9 let _y = t?.z && [] let _z = [] && t?.g @@ -34,7 +34,7 @@ let _z = [] && t?.g WARNING: w233 (const-in-bool-expr) Constant in a boolean expression. -testData/static_analyser/w286_oror_andand.nut:8:9 +testData/static_analyzer/w286_oror_andand.nut:8:9 let _y = t?.z && [] let _z = [] && t?.g diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w286_oror_andand.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w286_oror_andand.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w286_oror_andand.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w286_oror_andand.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w287.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w287.diag.txt similarity index 86% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w287.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w287.diag.txt index 0dc85935b..6ccab13a5 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w287.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w287.diag.txt @@ -1,5 +1,5 @@ WARNING: w287 (range-check) It looks like the range boundaries are not checked correctly. Pay attention to checking with minimum and maximum index. -testData/static_analyser/w287.nut:7:10 +testData/static_analyzer/w287.nut:7:10 let e1 = (curVal < 0 || curVal > x) // EXPECTED 1 ^----------------------- @@ -7,7 +7,7 @@ let c1 = (curVal < 0 || curVal >= x) // FP 1 WARNING: w287 (range-check) It looks like the range boundaries are not checked correctly. Pay attention to checking with minimum and maximum index. 
-testData/static_analyser/w287.nut:13:9 +testData/static_analyzer/w287.nut:13:9 let e2 = (value >= 0) && (value <= cnt) // EXPECTED 2 ^----------------------------- @@ -15,7 +15,7 @@ let c2 = (value >= 0) && (value < cnt) // FP 2 WARNING: w287 (range-check) It looks like the range boundaries are not checked correctly. Pay attention to checking with minimum and maximum index. -testData/static_analyser/w287.nut:16:10 +testData/static_analyzer/w287.nut:16:10 let e3 = (::idx < 0 || ::idx > ::tblObj.childrenCount()) // EXPECTED 3 ^-------------------------------------------- @@ -23,7 +23,7 @@ let c3 = (::idx < 0 || ::idx >= ::tblObj.childrenCount()) // FP 3 WARNING: w287 (range-check) It looks like the range boundaries are not checked correctly. Pay attention to checking with minimum and maximum index. -testData/static_analyser/w287.nut:19:10 +testData/static_analyzer/w287.nut:19:10 let e4 = (0 <= value && value <= ::obj.childrenCount()) // EXPECTED 4 ^------------------------------------------- @@ -34,7 +34,7 @@ let c4 = (0 <= value && value < ::obj.childrenCount()) // FP 4 AN ERROR HAS OCCURRED [the index 'a' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w287.nut line [4] +*FUNCTION [__main__()] testData/static_analyzer/w287.nut line [4] LOCALS [vargv] ARRAY=[] diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w287.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w287.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w287.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w287.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w288.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w288.diag.txt similarity index 65% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w288.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w288.diag.txt index 
682bf5ce4..6aa96dddd 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w288.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w288.diag.txt @@ -1,5 +1,5 @@ WARNING: w288 (param-count) Function '(w288.nut:2)' is called with the wrong number of parameters. -testData/static_analyser/w288.nut:5:7 +testData/static_analyzer/w288.nut:5:7 local b = 1; return fn(b); @@ -7,10 +7,10 @@ return fn(b); -AN ERROR HAS OCCURRED [wrong number of parameters passed to '(w288.nut:2)' testData/static_analyser/w288.nut:2 (2 passed, at least 3 required)] +AN ERROR HAS OCCURRED [wrong number of parameters passed to '(w288.nut:2)' testData/static_analyzer/w288.nut:2 (2 passed, at least 3 required)] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w288.nut line [5] +*FUNCTION [__main__()] testData/static_analyzer/w288.nut line [5] LOCALS [b] 1 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w288.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w288.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w288.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w288.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w288_dp_va.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w288_dp_va.diag.txt similarity index 71% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w288_dp_va.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w288_dp_va.diag.txt index df3b6b604..c39a16afe 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w288_dp_va.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w288_dp_va.diag.txt @@ -1,5 +1,5 @@ WARNING: w288 (param-count) Function 'foo' is called with the wrong number of parameters. 
-testData/static_analyser/w288_dp_va.nut:11:0 +testData/static_analyzer/w288_dp_va.nut:11:0 foo(10, 20, 30, 40) ^------------------ @@ -7,7 +7,7 @@ foo(10, 20, 30) WARNING: w288 (param-count) Function 'foo' is called with the wrong number of parameters. -testData/static_analyser/w288_dp_va.nut:15:0 +testData/static_analyzer/w288_dp_va.nut:15:0 foo(10) foo() @@ -15,7 +15,7 @@ foo() WARNING: w288 (param-count) Function 'bar' is called with the wrong number of parameters. -testData/static_analyser/w288_dp_va.nut:18:0 +testData/static_analyzer/w288_dp_va.nut:18:0 bar() ^---- @@ -23,7 +23,7 @@ bar(10) WARNING: w288 (param-count) Function 'bar' is called with the wrong number of parameters. -testData/static_analyser/w288_dp_va.nut:19:0 +testData/static_analyzer/w288_dp_va.nut:19:0 bar() bar(10) @@ -32,7 +32,7 @@ bar(10, 20) WARNING: w228 (declared-never-used) let 'hflow' was declared but never used. -testData/static_analyser/w288_dp_va.nut:9:0 +testData/static_analyzer/w288_dp_va.nut:9:0 let FlowH = 30 let hflow = @(...) comp(FlowH, vargv) @@ -40,10 +40,10 @@ let hflow = @(...) 
comp(FlowH, vargv) -AN ERROR HAS OCCURRED [wrong number of parameters passed to 'foo' testData/static_analyser/w288_dp_va.nut:4 (5 passed, 4 required)] +AN ERROR HAS OCCURRED [wrong number of parameters passed to 'foo' testData/static_analyzer/w288_dp_va.nut:4 (5 passed, 4 required)] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w288_dp_va.nut line [11] +*FUNCTION [__main__()] testData/static_analyzer/w288_dp_va.nut line [11] LOCALS [hflow] CLOSURE=FN:hflow diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w288_dp_va.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w288_dp_va.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w288_dp_va.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w288_dp_va.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w288_lambdas2.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w288_lambdas2.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w288_lambdas2.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w288_lambdas2.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w288_lambdas2.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w288_lambdas2.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w288_lambdas2.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w288_lambdas2.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w289.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w289.diag.txt similarity index 78% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w289.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w289.diag.txt index c428fd407..a8ea94c09 100644 --- 
a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w289.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w289.diag.txt @@ -1,5 +1,5 @@ WARNING: w289 (param-pos) The function parameter 'aaax' seems to be in the wrong position. -testData/static_analyser/w289.nut:6:13 +testData/static_analyzer/w289.nut:6:13 local b = -1; return fn(b, aaaX); diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w289.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w289.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w289.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w289.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w291.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w291.diag.txt similarity index 84% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w291.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w291.diag.txt index 3ece7ff8f..590856e4d 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w291.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w291.diag.txt @@ -1,5 +1,5 @@ WARNING: w291 (invalid-underscore) The name of parameter '_y' is invalid. The identifier is marked as an unused with a prefix underscore, but it is used. 
-testData/static_analyser/w291.nut:6:4 +testData/static_analyzer/w291.nut:6:4 x, _y, // EXPECTED diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w291.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w291.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w291.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w291.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w292.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w292.diag.txt similarity index 82% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w292.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w292.diag.txt index c644a8f24..87b235e29 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w292.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w292.diag.txt @@ -1,5 +1,5 @@ WARNING: w292 (modified-container) The container was modified within the loop. -testData/static_analyser/w292.nut:13:4 +testData/static_analyzer/w292.nut:13:4 foreach (a in c) { delete c.x // EXPECTED 1 @@ -8,7 +8,7 @@ foreach (a in c) { WARNING: w292 (modified-container) The container was modified within the loop. -testData/static_analyser/w292.nut:14:4 +testData/static_analyzer/w292.nut:14:4 delete c.x // EXPECTED 1 c.rawdelete("y") // EXPECTED 2 @@ -17,7 +17,7 @@ testData/static_analyser/w292.nut:14:4 WARNING: w292 (modified-container) The container was modified within the loop. 
-testData/static_analyser/w292.nut:34:8 +testData/static_analyzer/w292.nut:34:8 if (a < 0) { c.clear() // EXPECTED 3 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w292.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w292.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w292.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w292.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w293.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w293.diag.txt similarity index 82% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w293.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w293.diag.txt index eb0a393b6..af21d0f3c 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w293.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w293.diag.txt @@ -1,5 +1,5 @@ WARNING: w293 (duplicate-persist-id) Duplicate id 'x' passed to 'persist'. -testData/static_analyser/w293.nut:5:17 +testData/static_analyzer/w293.nut:5:17 let _y = persist("y", @() {}) // FP let _z = persist("x", @() {}) // EXPECTED @@ -7,7 +7,7 @@ let _z = persist("x", @() {}) // EXPECTED WARNING: w293 (duplicate-persist-id) Duplicate id 'a' passed to 'persist'. 
-testData/static_analyser/w293.nut:14:28 +testData/static_analyzer/w293.nut:14:28 let _c = mkWatched(foo, "b") // FP let _d = mkWatched(persist, "a") // EXPECTED diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w293.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w293.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w293.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w293.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w295.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w295.diag.txt similarity index 64% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w295.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w295.diag.txt index 34dda0057..7041105a5 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w295.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w295.diag.txt @@ -2,20 +2,20 @@ AN ERROR HAS OCCURRED [the index 'h' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w295.nut line [9] +*FUNCTION [__main__()] testData/static_analyzer/w295.nut line [9] LOCALS [vargv] ARRAY=[] [this] TABLE={} WARNING: w307 (global-id-redef) Redefinition of existed global name 'g'. -testData/static_analyser/w295.nut:5:0 +testData/static_analyzer/w295.nut:5:0 WARNING: w307 (global-id-redef) Redefinition of existed global name 'g'. -testData/static_analyser/w295.nut:7:0 +testData/static_analyzer/w295.nut:7:0 WARNING: w295 (undefined-global) Undefined global identifier 'h'. 
-testData/static_analyser/w295.nut:9:6 +testData/static_analyzer/w295.nut:9:6 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w295.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w295.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w295.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w295.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w297.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w297.diag.txt similarity index 80% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w297.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w297.diag.txt index 5792ad399..7f65550a0 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w297.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w297.diag.txt @@ -1,5 +1,5 @@ WARNING: w297 (call-from-root) Function 'keepref' must be called from the root scope. -testData/static_analyser/w297.nut:9:4 +testData/static_analyzer/w297.nut:9:4 function _foo(p) { keepref(p) // EXPECTED 1 @@ -8,14 +8,14 @@ function _foo(p) { WARNING: w297 (call-from-root) Function 'keepref' must be called from the root scope. -testData/static_analyser/w297.nut:13:14 +testData/static_analyzer/w297.nut:13:14 let _z = @(y) keepref(y) // EXPECTED 2 ^--------- WARNING: w297 (call-from-root) Function 'keepref' must be called from the root scope. 
-testData/static_analyser/w297.nut:17:8 +testData/static_analyzer/w297.nut:17:8 constructor(t){ keepref(t) // EXPECTED 3 diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w297.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w297.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w297.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w297.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w301_closure.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w301_closure.diag.txt similarity index 75% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w301_closure.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w301_closure.diag.txt index 51e903504..38cf5d515 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w301_closure.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w301_closure.diag.txt @@ -2,7 +2,7 @@ AN ERROR HAS OCCURRED [the index 'null' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w301_closure.nut line [8] +*FUNCTION [__main__()] testData/static_analyzer/w301_closure.nut line [8] LOCALS [tree] NULL diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w301_closure.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w301_closure.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w301_closure.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w301_closure.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w301_closure2.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w301_closure2.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w301_closure2.diag.txt rename to 
prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w301_closure2.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w301_closure2.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w301_closure2.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w301_closure2.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w301_closure2.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w302.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w302.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w302.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w302.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w302.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w302.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w302.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w302.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w302_dowhile.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w302_dowhile.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w302_dowhile.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w302_dowhile.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w302_dowhile.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w302_dowhile.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w302_dowhile.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w302_dowhile.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w302_loop.diag.txt 
b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w302_loop.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w302_loop.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w302_loop.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w302_loop.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w302_loop.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w302_loop.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w302_loop.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w302_try_catch.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w302_try_catch.diag.txt similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w302_try_catch.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w302_try_catch.diag.txt diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w302_try_catch.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w302_try_catch.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w302_try_catch.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w302_try_catch.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w303_destruct.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w303_destruct.diag.txt similarity index 73% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w303_destruct.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w303_destruct.diag.txt index 4c8f3c6b6..a35fe5848 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w303_destruct.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w303_destruct.diag.txt @@ -1,5 
+1,5 @@ WARNING: w228 (declared-never-used) parameter 'u' was declared but never used. -testData/static_analyser/w303_destruct.nut:3:13 +testData/static_analyzer/w303_destruct.nut:3:13 function foo() { return "1" } function bar(u) {} @@ -10,7 +10,7 @@ function bar(u) {} AN ERROR HAS OCCURRED [the index 'x' does not exist] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w303_destruct.nut line [5] +*FUNCTION [__main__()] testData/static_analyzer/w303_destruct.nut line [5] LOCALS [x] NULL diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w303_destruct.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w303_destruct.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w303_destruct.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w303_destruct.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w305.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w305.diag.txt similarity index 74% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w305.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w305.diag.txt index 59d23ed3b..0f20d302d 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w305.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w305.diag.txt @@ -1,5 +1,5 @@ WARNING: w223 (compared-with-bool) Comparison with boolean. -testData/static_analyser/w305.nut:10:4 +testData/static_analyzer/w305.nut:10:4 if (B == x > y) ^--------- @@ -7,7 +7,7 @@ if (B == x > y) WARNING: w305 (relative-bool-cmp) Relative comparison non-boolean with boolean. 
It is potential runtime error -testData/static_analyser/w305.nut:13:4 +testData/static_analyzer/w305.nut:13:4 if ((B == x) > y) ^----------- @@ -18,7 +18,7 @@ if ((B == x) > y) AN ERROR HAS OCCURRED [comparison between 'bool' and '1'] CALLSTACK -*FUNCTION [__main__()] testData/static_analyser/w305.nut line [13] +*FUNCTION [__main__()] testData/static_analyzer/w305.nut line [13] LOCALS [b2] false diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w305.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w305.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w305.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w305.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w306.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w306.diag.txt similarity index 79% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w306.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w306.diag.txt index 4654edc33..733443268 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w306.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w306.diag.txt @@ -1,5 +1,5 @@ WARNING: w306 (eq-paren-miss) Suspicious expression, probably parens are missed. 
-testData/static_analyser/w306.nut:6:4 +testData/static_analyzer/w306.nut:6:4 local z = 2 if (x == y != z) diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w306.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w306.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w306.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w306.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w308.diag.txt b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w308.diag.txt similarity index 86% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w308.diag.txt rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w308.diag.txt index 65d3ee6b5..ab67b869a 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w308.diag.txt +++ b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w308.diag.txt @@ -1,5 +1,5 @@ WARNING: w308 (bool-lambda-required) Function 'findindex' reuires lambda which returns boolean. 
-testData/static_analyser/w308.nut:5:28 +testData/static_analyzer/w308.nut:5:28 let _expected = o.findindex(@(t) t.id = 10) // EXPECTED ^-------------- diff --git a/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w308.nut b/prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w308.nut similarity index 100% rename from prog/1stPartyLibs/quirrel/quirrel/testData/static_analyser/w308.nut rename to prog/1stPartyLibs/quirrel/quirrel/testData/static_analyzer/w308.nut diff --git a/prog/1stPartyLibs/quirrel/quirrel/testRunner.py b/prog/1stPartyLibs/quirrel/quirrel/testRunner.py index 9967d03a0..331452c20 100644 --- a/prog/1stPartyLibs/quirrel/quirrel/testRunner.py +++ b/prog/1stPartyLibs/quirrel/quirrel/testRunner.py @@ -151,7 +151,7 @@ def runDiagTest(compiler, workingDir, dirname, name): runTestGeneric(compiler, workingDir, dirname, name, "Diagnostics", '.diag.txt', ["-diag-file"], False) def runSATest(compiler, workingDir, dirname, name): - runTestGeneric(compiler, workingDir, dirname, name, "Static Analyser", '.diag.txt', ["-sa", "-diag-file"], False) + runTestGeneric(compiler, workingDir, dirname, name, "Static Analyzer", '.diag.txt', ["-sa", "-diag-file"], False) def runExecuteTest(compiler, workingDir, dirname, name): runTestGeneric(compiler, workingDir, dirname, name, "Exec", '.out', [], True) @@ -230,7 +230,7 @@ def main(): walkDirectory(Path(computePath('testData', 'exec')), 0, lambda a: runTestForData(a, compiler, workingDir, 'exec')) walkDirectory(Path(computePath('testData', 'diagnostics')), 0, lambda a: runTestForData(a, compiler, workingDir, 'diag')) walkDirectory(Path(computePath('testData', 'ast')), 0, lambda a: runTestForData(a, compiler, workingDir, 'ast')) - walkDirectory(Path(computePath('testData', 'static_analyser')), 0, lambda a: runTestForData(a, compiler, workingDir, 'sa')) + walkDirectory(Path(computePath('testData', 'static_analyzer')), 0, lambda a: runTestForData(a, compiler, workingDir, 'sa')) if numOfFailedTests: 
xprint(f"Failed tests: {numOfFailedTests}", CBOLD + CRED) diff --git a/prog/1stPartyLibs/quirrel/sqrat/include/sqrat/sqratFunction.h b/prog/1stPartyLibs/quirrel/sqrat/include/sqrat/sqratFunction.h index 23287e722..8a5c0cb8c 100644 --- a/prog/1stPartyLibs/quirrel/sqrat/include/sqrat/sqratFunction.h +++ b/prog/1stPartyLibs/quirrel/sqrat/include/sqrat/sqratFunction.h @@ -399,4 +399,6 @@ struct Var : Var {Var(HSQUIRRELVM vm, SQInteger idx) } +DAG_DECLARE_RELOCATABLE(Sqrat::Function); + #endif diff --git a/prog/1stPartyLibs/quirrel/sqrat/include/sqrat/sqratObject.h b/prog/1stPartyLibs/quirrel/sqrat/include/sqrat/sqratObject.h index 4d63ff2a7..8e6f18725 100644 --- a/prog/1stPartyLibs/quirrel/sqrat/include/sqrat/sqratObject.h +++ b/prog/1stPartyLibs/quirrel/sqrat/include/sqrat/sqratObject.h @@ -36,6 +36,7 @@ #include "sqratAllocator.h" #include "sqratTypes.h" #include "sqratUtil.h" +#include namespace Sqrat { @@ -644,6 +645,8 @@ inline bool Object::Next(iterator& iter) const { } } -} +} // namespace Sqrat + +DAG_DECLARE_RELOCATABLE(Sqrat::Object); #endif diff --git a/prog/1stPartyLibs/vecmath/dag_vecMath.h b/prog/1stPartyLibs/vecmath/dag_vecMath.h index d78d18498..0b0c449a3 100644 --- a/prog/1stPartyLibs/vecmath/dag_vecMath.h +++ b/prog/1stPartyLibs/vecmath/dag_vecMath.h @@ -282,6 +282,8 @@ VECTORCALL VECMATH_FINLINE vec4i v_negi(vec4i a); //! fabs(a) VECTORCALL VECMATH_FINLINE vec4f v_abs(vec4f a); VECTORCALL VECMATH_FINLINE vec4i v_absi(vec4i a); +//! check if /a can produce NaN's or inf +VECTORCALL VECMATH_FINLINE vec4f v_is_unsafe_divisor(vec4f a); //! 
LERP a to b using parameter tttt VECTORCALL VECMATH_FINLINE quat4f v_lerp_vec4f(vec4f tttt, quat4f a, quat4f b); diff --git a/prog/1stPartyLibs/vecmath/dag_vecMath_common.h b/prog/1stPartyLibs/vecmath/dag_vecMath_common.h index 82d52109c..f7ae64cd1 100644 --- a/prog/1stPartyLibs/vecmath/dag_vecMath_common.h +++ b/prog/1stPartyLibs/vecmath/dag_vecMath_common.h @@ -65,15 +65,20 @@ VECTORCALL VECMATH_FINLINE vec4i v_clampi(vec4i t, vec4i min_val, vec4i max_val) return v_maxi(v_mini(t, max_val), min_val); } +VECTORCALL VECMATH_FINLINE vec4f v_is_unsafe_divisor(vec4f a) +{ + return v_cmp_lt(v_abs(a), V_C_VERY_SMALL_VAL); +} + VECTORCALL VECMATH_FINLINE vec4f v_safediv(vec4f a, vec4f b, vec4f def) { - vec4f isDiv0 = v_cmp_lt(v_abs(b), V_C_VERY_SMALL_VAL); + vec4f isDiv0 = v_is_unsafe_divisor(b); return v_sel(v_div(a, b), def, isDiv0); } VECTORCALL VECMATH_FINLINE vec4f v_rcp_safe(vec4f a, vec4f def) { - vec4f isDiv0 = v_cmp_lt(v_abs(a), V_C_VERY_SMALL_VAL); + vec4f isDiv0 = v_is_unsafe_divisor(a); return v_sel(v_rcp(a), def, isDiv0); } @@ -1643,7 +1648,7 @@ VECTORCALL VECMATH_FINLINE vec3f three_plane_intersection(plane3f p0, plane3f p1 { vec4f n1_n2 = v_cross3(p1, p2), n2_n0 = v_cross3(p2, p0), n0_n1 = v_cross3(p0, p1); vec4f cosTheta = v_dot3(p0, n1_n2); - invalid = v_cmp_lt(v_abs(cosTheta), V_C_VERY_SMALL_VAL); + invalid = v_is_unsafe_divisor(cosTheta); vec4f secTheta = v_rcp(cosTheta); vec4f intersectPt; diff --git a/prog/1stPartyLibs/vecmath/dag_vecMath_trig.h b/prog/1stPartyLibs/vecmath/dag_vecMath_trig.h index 209cff2ec..c86e62af9 100644 --- a/prog/1stPartyLibs/vecmath/dag_vecMath_trig.h +++ b/prog/1stPartyLibs/vecmath/dag_vecMath_trig.h @@ -300,41 +300,35 @@ VECTORCALL VECMATH_FINLINE vec4f v_atan2(vec4f y, vec4f x) tmp2 = v_and(maskYlt0, V_C_PI); vec4f offs = v_sub(tmp1, tmp2); - vec4f maskXeq0 = v_cmp_gt(v_cast_vec4f(v_splatsi(/* FLT_MIN */ 0x00800000)), v_abs(x)); + vec4f maskXeq0 = v_is_unsafe_divisor(x); vec4f atan = v_atan(v_div(y, x)); atan = v_add(atan, 
offs); - atan = v_andnot(maskXeq0, atan); - val = v_and(maskXeq0, val); - return v_add(atan, val); + return v_sel(atan, val, maskXeq0); } -// fast approx atan version. |error| is < 0.0004 -// ~40% faster then v_atan -// NOTE: does not handle any of the following inputs: -// (+0, +0), (+0, -0), (-0, +0), (-0, -0) -// could be fixed to handle -// calculates 4 in 2x speed of win libc implementation for 1, with same precision +// fast approx atan2 version. |error| is < 0.0004 +// calculates 4 in ~1.47x+ (untested, faster than v_atan2) speed of win libc implementation for 1 VECTORCALL VECMATH_INLINE vec4f v_atan2_est(vec4f y, vec4f x) { - // compute the atan - vec4f raw_atan = v_atan_est(v_div_est(y, x)); - - vec4f neg_x = is_neg_special(x); - vec4f neg_y = is_neg_special(y); + vec4f maskYgt0 = v_cmp_ge(y, v_zero()); + vec4f maskYlt0 = v_cmp_ge(v_zero(), y); + vec4f tmp1 = v_and(maskYgt0, V_C_HALFPI); + vec4f tmp2 = v_and(maskYlt0, V_C_HALFPI); + vec4f val = v_sub(tmp1, tmp2); - vec4f in_quad2 = v_andnot(neg_y, neg_x); - vec4f quad2_fixed = v_sel(raw_atan, v_add(raw_atan, V_C_PI), in_quad2); + vec4f maskXlt0 = v_cmp_ge(v_zero(), x); + maskYgt0 = v_andnot(maskYlt0, maskXlt0); + maskYlt0 = v_and(maskYlt0, maskXlt0); + tmp1 = v_and(maskYgt0, V_C_PI); + tmp2 = v_and(maskYlt0, V_C_PI); + vec4f offs = v_sub(tmp1, tmp2); - // move from quadrant 1 to 3 by subtracting PI - vec4f in_quad3 = v_and(neg_x, neg_y); - vec4f quad23_fixed = v_sel(quad2_fixed, v_sub(raw_atan, V_C_PI), in_quad3); + vec4f maskXeq0 = v_is_unsafe_divisor(x); + vec4f atan = v_atan_est(v_div(y, x)); - vec4f y_zero = v_cmp_eq(x, v_zero()); - vec4f halfpi = v_cast_vec4f(v_splatsi(0x3fc90fdb)); - vec4f yzeropos_fixed = v_sel(quad23_fixed, halfpi, v_and(y_zero, v_cmp_gt(y, v_zero()))); - vec4f yzeroneg_fixed = v_sel(yzeropos_fixed, v_neg(halfpi), v_and(y_zero, v_cmp_ge(v_zero(), y))); - return yzeroneg_fixed; + atan = v_add(atan, offs); + return v_sel(atan, val, maskXeq0); } VECTORCALL VECMATH_FINLINE void 
v_sincos_x(vec4f ang, vec4f& s, vec4f& c) diff --git a/prog/3rdPartyLibs/codecs/dav1d/include/common/attributes.h b/prog/3rdPartyLibs/codecs/dav1d/include/common/attributes.h index 2af42ef78..71c34f257 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/include/common/attributes.h +++ b/prog/3rdPartyLibs/codecs/dav1d/include/common/attributes.h @@ -43,7 +43,11 @@ #ifdef __GNUC__ #define ATTR_ALIAS __attribute__((may_alias)) +#if defined(__MINGW32__) && !defined(__clang__) +#define ATTR_FORMAT_PRINTF(fmt, attr) __attribute__((__format__(__gnu_printf__, fmt, attr))) +#else #define ATTR_FORMAT_PRINTF(fmt, attr) __attribute__((__format__(__printf__, fmt, attr))) +#endif #define COLD __attribute__((cold)) #else #define ATTR_ALIAS diff --git a/prog/3rdPartyLibs/codecs/dav1d/include/common/bitdepth.h b/prog/3rdPartyLibs/codecs/dav1d/include/common/bitdepth.h index 88a822aab..8a8f5740b 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/include/common/bitdepth.h +++ b/prog/3rdPartyLibs/codecs/dav1d/include/common/bitdepth.h @@ -34,7 +34,7 @@ #include "common/attributes.h" #if !defined(BITDEPTH) -typedef void pixel; +typedef uint8_t pixel; /* can't be void due to pointer-to-array usage */ typedef void coef; #define HIGHBD_DECL_SUFFIX /* nothing */ #define HIGHBD_CALL_SUFFIX /* nothing */ diff --git a/prog/3rdPartyLibs/codecs/dav1d/include/common/validate.h b/prog/3rdPartyLibs/codecs/dav1d/include/common/validate.h index 3096f3db8..3aaed5bb9 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/include/common/validate.h +++ b/prog/3rdPartyLibs/codecs/dav1d/include/common/validate.h @@ -32,24 +32,26 @@ #include #if defined(NDEBUG) -#define debug_abort() +#define debug_print(...) do {} while (0) +#define debug_abort() do {} while (0) #else +#define debug_print(...) fprintf(stderr, __VA_ARGS__) #define debug_abort abort #endif #define validate_input_or_ret_with_msg(x, r, ...) 
\ if (!(x)) { \ - fprintf(stderr, "Input validation check \'%s\' failed in %s!\n", \ - #x, __func__); \ - fprintf(stderr, __VA_ARGS__); \ + debug_print("Input validation check \'%s\' failed in %s!\n", \ + #x, __func__); \ + debug_print(__VA_ARGS__); \ debug_abort(); \ return r; \ } #define validate_input_or_ret(x, r) \ if (!(x)) { \ - fprintf(stderr, "Input validation check \'%s\' failed in %s!\n", \ - #x, __func__); \ + debug_print("Input validation check \'%s\' failed in %s!\n", \ + #x, __func__); \ debug_abort(); \ return r; \ } diff --git a/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/common.h b/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/common.h index 8685b4f07..290e6ace3 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/common.h +++ b/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/common.h @@ -32,6 +32,10 @@ #include #include +#ifdef __cplusplus +extern "C" { +#endif + #ifndef DAV1D_API #if defined _WIN32 #if defined DAV1D_BUILDING_DLL @@ -83,4 +87,8 @@ typedef struct Dav1dDataProps { */ DAV1D_API void dav1d_data_props_unref(Dav1dDataProps *props); +#ifdef __cplusplus +} /* extern "C" */ +#endif + #endif /* DAV1D_COMMON_H */ diff --git a/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/data.h b/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/data.h index f945a0424..e551ad650 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/data.h +++ b/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/data.h @@ -33,6 +33,10 @@ #include "common.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct Dav1dData { const uint8_t *data; ///< data pointer size_t sz; ///< data size @@ -106,4 +110,8 @@ DAV1D_API int dav1d_data_wrap_user_data(Dav1dData *data, */ DAV1D_API void dav1d_data_unref(Dav1dData *data); +#ifdef __cplusplus +} /* extern "C" */ +#endif + #endif /* DAV1D_DATA_H */ diff --git a/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/dav1d.h b/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/dav1d.h index 237ce3eab..e8f070577 100644 --- 
a/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/dav1d.h +++ b/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/dav1d.h @@ -28,10 +28,6 @@ #ifndef DAV1D_H #define DAV1D_H -#ifdef __cplusplus -extern "C" { -#endif - #include #include @@ -40,6 +36,10 @@ extern "C" { #include "data.h" #include "version.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct Dav1dContext Dav1dContext; typedef struct Dav1dRef Dav1dRef; @@ -68,6 +68,13 @@ enum Dav1dInloopFilterType { DAV1D_INLOOPFILTER_RESTORATION, }; +enum Dav1dDecodeFrameType { + DAV1D_DECODEFRAMETYPE_ALL = 0, ///< decode and return all frames + DAV1D_DECODEFRAMETYPE_REFERENCE = 1,///< decode and return frames referenced by other frames only + DAV1D_DECODEFRAMETYPE_INTRA = 2, ///< decode and return intra frames only (includes keyframes) + DAV1D_DECODEFRAMETYPE_KEY = 3, ///< decode and return keyframes only +}; + typedef struct Dav1dSettings { int n_threads; ///< number of threads (0 = number of logical cores in host system, default 0) int max_frame_delay; ///< Set to 1 for low-latency decoding (0 = ceil(sqrt(n_threads)), default 0) @@ -86,7 +93,9 @@ typedef struct Dav1dSettings { ///< once when shown, default 0) enum Dav1dInloopFilterType inloop_filters; ///< postfilters to enable during decoding (default ///< DAV1D_INLOOPFILTER_ALL) - uint8_t reserved[20]; ///< reserved for future use + enum Dav1dDecodeFrameType decode_frame_type; ///< frame types to decode (default + ///< DAV1D_DECODEFRAMETYPE_ALL) + uint8_t reserved[16]; ///< reserved for future use } Dav1dSettings; /** @@ -94,6 +103,15 @@ typedef struct Dav1dSettings { */ DAV1D_API const char *dav1d_version(void); +/** + * Get library API version. + * + * @return A value in the format 0x00XXYYZZ, where XX is the major version, + * YY the minor version, and ZZ the patch version. + * @see DAV1D_API_MAJOR, DAV1D_API_MINOR, DAV1D_API_PATCH + */ +DAV1D_API unsigned dav1d_version_api(void); + /** * Initialize settings to default values. 
* @@ -304,8 +322,8 @@ DAV1D_API int dav1d_get_decode_error_data_props(Dav1dContext *c, Dav1dDataProps */ DAV1D_API int dav1d_get_frame_delay(const Dav1dSettings *s); -# ifdef __cplusplus -} -# endif +#ifdef __cplusplus +} /* extern "C" */ +#endif #endif /* DAV1D_H */ diff --git a/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/headers.h b/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/headers.h index e2f7aa07f..b9037f23d 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/headers.h +++ b/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/headers.h @@ -31,6 +31,10 @@ #include #include +#ifdef __cplusplus +extern "C" { +#endif + // Constants from Section 3. "Symbols and abbreviated terms" #define DAV1D_MAX_CDEF_STRENGTHS 8 #define DAV1D_MAX_OPERATING_POINTS 32 @@ -178,8 +182,8 @@ enum Dav1dChromaSamplePosition { }; typedef struct Dav1dContentLightLevel { - int max_content_light_level; - int max_frame_average_light_level; + uint16_t max_content_light_level; + uint16_t max_frame_average_light_level; } Dav1dContentLightLevel; typedef struct Dav1dMasteringDisplay { @@ -206,7 +210,7 @@ typedef struct Dav1dSequenceHeader { * 1 for 8-10 bits/component 4:4:4; 2 for 4:2:2 at any bits/component, * or 12 bits/component at any chroma subsampling. */ - int profile; + uint8_t profile; /** * Maximum dimensions for this stream. In non-scalable streams, these * are often the actual dimensions of the stream, although that is not @@ -225,60 +229,60 @@ typedef struct Dav1dSequenceHeader { * (twelve_bit) to distinguish between 10 and 12 bits/component. To get * the spec's hbd, use !!our_hbd, and to get twelve_bit, use hbd == 2. */ - int hbd; + uint8_t hbd; /** * Pixel data uses JPEG pixel range ([0,255] for 8bits) instead of * MPEG pixel range ([16,235] for 8bits luma, [16,240] for 8bits chroma). 
*/ - int color_range; + uint8_t color_range; - int num_operating_points; + uint8_t num_operating_points; struct Dav1dSequenceHeaderOperatingPoint { - int major_level, minor_level; - int initial_display_delay; - int idc; - int tier; - int decoder_model_param_present; - int display_model_param_present; + uint8_t major_level, minor_level; + uint8_t initial_display_delay; + uint16_t idc; + uint8_t tier; + uint8_t decoder_model_param_present; + uint8_t display_model_param_present; } operating_points[DAV1D_MAX_OPERATING_POINTS]; - int still_picture; - int reduced_still_picture_header; - int timing_info_present; - int num_units_in_tick; - int time_scale; - int equal_picture_interval; - unsigned num_ticks_per_picture; - int decoder_model_info_present; - int encoder_decoder_buffer_delay_length; - int num_units_in_decoding_tick; - int buffer_removal_delay_length; - int frame_presentation_delay_length; - int display_model_info_present; - int width_n_bits, height_n_bits; - int frame_id_numbers_present; - int delta_frame_id_n_bits; - int frame_id_n_bits; - int sb128; - int filter_intra; - int intra_edge_filter; - int inter_intra; - int masked_compound; - int warped_motion; - int dual_filter; - int order_hint; - int jnt_comp; - int ref_frame_mvs; + uint8_t still_picture; + uint8_t reduced_still_picture_header; + uint8_t timing_info_present; + uint32_t num_units_in_tick; + uint32_t time_scale; + uint8_t equal_picture_interval; + uint32_t num_ticks_per_picture; + uint8_t decoder_model_info_present; + uint8_t encoder_decoder_buffer_delay_length; + uint32_t num_units_in_decoding_tick; + uint8_t buffer_removal_delay_length; + uint8_t frame_presentation_delay_length; + uint8_t display_model_info_present; + uint8_t width_n_bits, height_n_bits; + uint8_t frame_id_numbers_present; + uint8_t delta_frame_id_n_bits; + uint8_t frame_id_n_bits; + uint8_t sb128; + uint8_t filter_intra; + uint8_t intra_edge_filter; + uint8_t inter_intra; + uint8_t masked_compound; + uint8_t warped_motion; + 
uint8_t dual_filter; + uint8_t order_hint; + uint8_t jnt_comp; + uint8_t ref_frame_mvs; enum Dav1dAdaptiveBoolean screen_content_tools; enum Dav1dAdaptiveBoolean force_integer_mv; - int order_hint_n_bits; - int super_res; - int cdef; - int restoration; - int ss_hor, ss_ver, monochrome; - int color_description_present; - int separate_uv_delta_q; - int film_grain_present; + uint8_t order_hint_n_bits; + uint8_t super_res; + uint8_t cdef; + uint8_t restoration; + uint8_t ss_hor, ss_ver, monochrome; + uint8_t color_description_present; + uint8_t separate_uv_delta_q; + uint8_t film_grain_present; // Dav1dSequenceHeaders of the same sequence are required to be // bit-identical until this offset. See 7.5 "Ordering of OBUs": @@ -287,29 +291,29 @@ typedef struct Dav1dSequenceHeader { // sequence header appears except for the contents of // operating_parameters_info. struct Dav1dSequenceHeaderOperatingParameterInfo { - int decoder_buffer_delay; - int encoder_buffer_delay; - int low_delay_mode; + uint32_t decoder_buffer_delay; + uint32_t encoder_buffer_delay; + uint8_t low_delay_mode; } operating_parameter_info[DAV1D_MAX_OPERATING_POINTS]; } Dav1dSequenceHeader; typedef struct Dav1dSegmentationData { - int delta_q; - int delta_lf_y_v, delta_lf_y_h, delta_lf_u, delta_lf_v; - int ref; - int skip; - int globalmv; + int16_t delta_q; + int8_t delta_lf_y_v, delta_lf_y_h, delta_lf_u, delta_lf_v; + int8_t ref; + uint8_t skip; + uint8_t globalmv; } Dav1dSegmentationData; typedef struct Dav1dSegmentationDataSet { Dav1dSegmentationData d[DAV1D_MAX_SEGMENTS]; - int preskip; - int last_active_segid; + uint8_t preskip; + int8_t last_active_segid; } Dav1dSegmentationDataSet; typedef struct Dav1dLoopfilterModeRefDeltas { - int mode_delta[2 /* is_zeromv */]; - int ref_delta[DAV1D_TOTAL_REFS_PER_FRAME]; + int8_t mode_delta[2 /* is_zeromv */]; + int8_t ref_delta[DAV1D_TOTAL_REFS_PER_FRAME]; } Dav1dLoopfilterModeRefDeltas; typedef struct Dav1dFilmGrainData { @@ -335,101 +339,106 @@ typedef struct 
Dav1dFilmGrainData { typedef struct Dav1dFrameHeader { struct { Dav1dFilmGrainData data; - int present, update; + uint8_t present, update; } film_grain; ///< film grain parameters enum Dav1dFrameType frame_type; ///< type of the picture int width[2 /* { coded_width, superresolution_upscaled_width } */], height; - int frame_offset; ///< frame number - int temporal_id; ///< temporal id of the frame for SVC - int spatial_id; ///< spatial id of the frame for SVC - - int show_existing_frame; - int existing_frame_idx; - int frame_id; - int frame_presentation_delay; - int show_frame; - int showable_frame; - int error_resilient_mode; - int disable_cdf_update; - int allow_screen_content_tools; - int force_integer_mv; - int frame_size_override; - int primary_ref_frame; - int buffer_removal_time_present; + uint8_t frame_offset; ///< frame number + uint8_t temporal_id; ///< temporal id of the frame for SVC + uint8_t spatial_id; ///< spatial id of the frame for SVC + + uint8_t show_existing_frame; + uint8_t existing_frame_idx; + uint32_t frame_id; + uint32_t frame_presentation_delay; + uint8_t show_frame; + uint8_t showable_frame; + uint8_t error_resilient_mode; + uint8_t disable_cdf_update; + uint8_t allow_screen_content_tools; + uint8_t force_integer_mv; + uint8_t frame_size_override; + uint8_t primary_ref_frame; + uint8_t buffer_removal_time_present; struct Dav1dFrameHeaderOperatingPoint { - int buffer_removal_time; + uint32_t buffer_removal_time; } operating_points[DAV1D_MAX_OPERATING_POINTS]; - int refresh_frame_flags; + uint8_t refresh_frame_flags; int render_width, render_height; struct { - int width_scale_denominator; - int enabled; + uint8_t width_scale_denominator; + uint8_t enabled; } super_res; - int have_render_size; - int allow_intrabc; - int frame_ref_short_signaling; - int refidx[DAV1D_REFS_PER_FRAME]; - int hp; + uint8_t have_render_size; + uint8_t allow_intrabc; + uint8_t frame_ref_short_signaling; + int8_t refidx[DAV1D_REFS_PER_FRAME]; + uint8_t hp; enum 
Dav1dFilterMode subpel_filter_mode; - int switchable_motion_mode; - int use_ref_frame_mvs; - int refresh_context; + uint8_t switchable_motion_mode; + uint8_t use_ref_frame_mvs; + uint8_t refresh_context; struct { - int uniform; - unsigned n_bytes; - int min_log2_cols, max_log2_cols, log2_cols, cols; - int min_log2_rows, max_log2_rows, log2_rows, rows; + uint8_t uniform; + uint8_t n_bytes; + uint8_t min_log2_cols, max_log2_cols, log2_cols, cols; + uint8_t min_log2_rows, max_log2_rows, log2_rows, rows; uint16_t col_start_sb[DAV1D_MAX_TILE_COLS + 1]; uint16_t row_start_sb[DAV1D_MAX_TILE_ROWS + 1]; - int update; + uint16_t update; } tiling; struct { - int yac; - int ydc_delta; - int udc_delta, uac_delta, vdc_delta, vac_delta; - int qm, qm_y, qm_u, qm_v; + uint8_t yac; + int8_t ydc_delta; + int8_t udc_delta, uac_delta, vdc_delta, vac_delta; + uint8_t qm, qm_y, qm_u, qm_v; } quant; struct { - int enabled, update_map, temporal, update_data; + uint8_t enabled, update_map, temporal, update_data; Dav1dSegmentationDataSet seg_data; - int lossless[DAV1D_MAX_SEGMENTS], qidx[DAV1D_MAX_SEGMENTS]; + uint8_t lossless[DAV1D_MAX_SEGMENTS], qidx[DAV1D_MAX_SEGMENTS]; } segmentation; struct { struct { - int present; - int res_log2; + uint8_t present; + uint8_t res_log2; } q; struct { - int present; - int res_log2; - int multi; + uint8_t present; + uint8_t res_log2; + uint8_t multi; } lf; } delta; - int all_lossless; + uint8_t all_lossless; struct { - int level_y[2 /* dir */]; - int level_u, level_v; - int mode_ref_delta_enabled; - int mode_ref_delta_update; + uint8_t level_y[2 /* dir */]; + uint8_t level_u, level_v; + uint8_t mode_ref_delta_enabled; + uint8_t mode_ref_delta_update; Dav1dLoopfilterModeRefDeltas mode_ref_deltas; - int sharpness; + uint8_t sharpness; } loopfilter; struct { - int damping; - int n_bits; - int y_strength[DAV1D_MAX_CDEF_STRENGTHS]; - int uv_strength[DAV1D_MAX_CDEF_STRENGTHS]; + uint8_t damping; + uint8_t n_bits; + uint8_t y_strength[DAV1D_MAX_CDEF_STRENGTHS]; 
+ uint8_t uv_strength[DAV1D_MAX_CDEF_STRENGTHS]; } cdef; struct { enum Dav1dRestorationType type[3 /* plane */]; - int unit_size[2 /* y, uv */]; + uint8_t unit_size[2 /* y, uv */]; } restoration; enum Dav1dTxfmMode txfm_mode; - int switchable_comp_refs; - int skip_mode_allowed, skip_mode_enabled, skip_mode_refs[2]; - int warp_motion; - int reduced_txtp_set; + uint8_t switchable_comp_refs; + uint8_t skip_mode_allowed, skip_mode_enabled; + int8_t skip_mode_refs[2]; + uint8_t warp_motion; + uint8_t reduced_txtp_set; Dav1dWarpedMotionParams gmv[DAV1D_REFS_PER_FRAME]; } Dav1dFrameHeader; +#ifdef __cplusplus +} /* extern "C" */ +#endif + #endif /* DAV1D_HEADERS_H */ diff --git a/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/picture.h b/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/picture.h index 2eb0b62e3..cc291a4ab 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/picture.h +++ b/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/picture.h @@ -34,6 +34,10 @@ #include "common.h" #include "headers.h" +#ifdef __cplusplus +extern "C" { +#endif + /* Number of bytes to align AND pad picture memory buffers by, so that SIMD * implementations can over-read by a few bytes, and use aligned read/write * instructions. 
*/ @@ -78,10 +82,15 @@ typedef struct Dav1dPicture { */ Dav1dMasteringDisplay *mastering_display; /** - * ITU-T T.35 metadata as defined in section 5.8.2 and 6.7.2 + * Array of ITU-T T.35 metadata as defined in section 5.8.2 and 6.7.2 */ Dav1dITUTT35 *itut_t35; + /** + * Number of ITU-T T35 metadata entries in the array + */ + size_t n_itut_t35; + uintptr_t reserved[4]; ///< reserved for future use struct Dav1dRef *frame_hdr_ref; ///< Dav1dFrameHeader allocation origin @@ -141,4 +150,8 @@ typedef struct Dav1dPicAllocator { */ DAV1D_API void dav1d_picture_unref(Dav1dPicture *p); +#ifdef __cplusplus +} /* extern "C" */ +#endif + #endif /* DAV1D_PICTURE_H */ diff --git a/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/version.h b/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/version.h index b25f9cb1c..a6115fb1b 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/version.h +++ b/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/version.h @@ -27,8 +27,24 @@ #ifndef DAV1D_VERSION_H #define DAV1D_VERSION_H -#define DAV1D_API_VERSION_MAJOR 6 -#define DAV1D_API_VERSION_MINOR 7 +#ifdef __cplusplus +extern "C" { +#endif + +#define DAV1D_API_VERSION_MAJOR 7 +#define DAV1D_API_VERSION_MINOR 0 #define DAV1D_API_VERSION_PATCH 0 +/** + * Extract version components from the value returned by + * dav1d_version_int() + */ +#define DAV1D_API_MAJOR(v) (((v) >> 16) & 0xFF) +#define DAV1D_API_MINOR(v) (((v) >> 8) & 0xFF) +#define DAV1D_API_PATCH(v) (((v) >> 0) & 0xFF) + +#ifdef __cplusplus +} /* extern "C" */ +#endif + #endif /* DAV1D_VERSION_H */ diff --git a/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/version.h.in b/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/version.h.in new file mode 100644 index 000000000..4fa420ded --- /dev/null +++ b/prog/3rdPartyLibs/codecs/dav1d/include/dav1d/version.h.in @@ -0,0 +1,50 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_VERSION_H +#define DAV1D_VERSION_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define DAV1D_API_VERSION_MAJOR @DAV1D_API_VERSION_MAJOR@ +#define DAV1D_API_VERSION_MINOR @DAV1D_API_VERSION_MINOR@ +#define DAV1D_API_VERSION_PATCH @DAV1D_API_VERSION_PATCH@ + +/** + * Extract version components from the value returned by + * dav1d_version_int() + */ +#define DAV1D_API_MAJOR(v) (((v) >> 16) & 0xFF) +#define DAV1D_API_MINOR(v) (((v) >> 8) & 0xFF) +#define DAV1D_API_PATCH(v) (((v) >> 0) & 0xFF) + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* DAV1D_VERSION_H */ diff --git a/prog/3rdPartyLibs/codecs/dav1d/include/vcs_version.h b/prog/3rdPartyLibs/codecs/dav1d/include/vcs_version.h index 068b9edbc..6b64edee4 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/include/vcs_version.h +++ b/prog/3rdPartyLibs/codecs/dav1d/include/vcs_version.h @@ -1,2 +1,2 @@ /* auto-generated, do not edit */ -#define DAV1D_VERSION "1.0.0-90-g4b9f5b7" +#define DAV1D_VERSION "1.3.0" diff --git a/prog/3rdPartyLibs/codecs/dav1d/include/vcs_version.h.in b/prog/3rdPartyLibs/codecs/dav1d/include/vcs_version.h.in new file mode 100644 index 000000000..71ed2f698 --- /dev/null +++ b/prog/3rdPartyLibs/codecs/dav1d/include/vcs_version.h.in @@ -0,0 +1,2 @@ +/* auto-generated, do not edit */ +#define DAV1D_VERSION "@VCS_TAG@" diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/filmgrain.S b/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/filmgrain.S index d1f83efb9..9d59d5d5e 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/filmgrain.S +++ b/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/filmgrain.S @@ -1481,8 +1481,8 @@ function fgy_32x32_8bpc_neon, export=1 calc_offset r6, lr, r6, 0, 0 add_offset r5, r6, lr, r5, r9 - add r4, r4, #32 // grain_lut += BLOCK_SIZE * bx - add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add r4, r4, #32 // grain_lut += FG_BLOCK_SIZE * bx + add r6, r11, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by ldr r10, [sp, 
#120] // type adr r11, L(fgy_loop_tbl) @@ -1490,8 +1490,8 @@ function fgy_32x32_8bpc_neon, export=1 tst r10, #1 ldr r10, [r11, r10, lsl #2] - add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by - add r8, r8, #32 // grain_lut += BLOCK_SIZE * bx + add r8, r8, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by + add r8, r8, #32 // grain_lut += FG_BLOCK_SIZE * bx add r11, r11, r10 @@ -1695,10 +1695,10 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1 calc_offset r8, r12, r8, \sx, \sy add_offset r5, r8, r12, r5, r10 - add r4, r4, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx - add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by - add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by - add r11, r11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add r4, r4, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx + add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by + add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by + add r11, r11, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx movrel_local r12, overlap_coeffs_\sx ldr lr, [sp, #132] // type diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/filmgrain16.S b/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/filmgrain16.S index 6c36cacea..d10bffff2 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/filmgrain16.S +++ b/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/filmgrain16.S @@ -1353,8 +1353,8 @@ function fgy_32x32_16bpc_neon, export=1 calc_offset r6, lr, r6, 0, 0 add_offset r5, r6, lr, r5, r9 - add r4, r4, #32*2 // grain_lut += BLOCK_SIZE * bx - add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add r4, r4, #32*2 // grain_lut += FG_BLOCK_SIZE * bx + add r6, r11, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by ldr r10, [sp, #120] // type adr r11, L(fgy_loop_tbl) @@ -1362,8 +1362,8 @@ function fgy_32x32_16bpc_neon, export=1 tst r10, #1 ldr r10, [r11, 
r10, lsl #2] - add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by - add r8, r8, #32*2 // grain_lut += BLOCK_SIZE * bx + add r8, r8, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by + add r8, r8, #32*2 // grain_lut += FG_BLOCK_SIZE * bx add r11, r11, r10 @@ -1651,10 +1651,10 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1 vmov.16 d31[3], r7 // overlap y [1] - add r4, r4, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx - add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by - add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by - add r11, r11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add r4, r4, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx + add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by + add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by + add r11, r11, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx movrel_local r12, overlap_coeffs_\sx ldr lr, [sp, #132] // type diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/ipred.S b/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/ipred.S index ff55d95d4..8c6d539a4 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/ipred.S +++ b/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/ipred.S @@ -1576,17 +1576,17 @@ L(ipred_filter_tbl): endfunc // void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride, -// const uint16_t *const pal, const uint8_t *idx, +// const pixel *const pal, const uint8_t *idx, // const int w, const int h); function pal_pred_8bpc_neon, export=1 push {r4-r5, lr} ldrd r4, r5, [sp, #12] - vld1.16 {q0}, [r2, :128] + vld1.8 {d0}, [r2, :64] clz lr, r4 adr r12, L(pal_pred_tbl) sub lr, lr, #25 + vmov.i8 q15, #7 ldr lr, [r12, lr, lsl #2] - vmovn.i16 d0, q0 add r12, r12, lr add r2, r0, r1 bx r12 @@ -1602,8 +1602,11 @@ L(pal_pred_tbl): 40: lsl r1, r1, #1 4: - vld1.8 {q1}, [r3, :128]! + vld1.8 {d2}, [r3, :64]! 
subs r5, r5, #4 + vshr.u8 d3, d2, #4 + vand.u8 d2, d2, d30 + vzip.8 d2, d3 vtbl.8 d2, {d0}, d2 vtbl.8 d3, {d0}, d3 vst1.32 {d2[0]}, [r0, :32], r1 @@ -1615,8 +1618,11 @@ L(pal_pred_tbl): 80: lsl r1, r1, #1 8: - vld1.8 {q1, q2}, [r3, :128]! + vld1.8 {q1}, [r3, :64]! subs r5, r5, #4 + vshr.u8 q2, q1, #4 + vand.u8 q1, q1, q15 + vzip.8 q1, q2 vtbl.8 d2, {d0}, d2 vtbl.8 d3, {d0}, d3 vst1.8 {d2}, [r0, :64], r1 @@ -1630,9 +1636,14 @@ L(pal_pred_tbl): 160: lsl r1, r1, #1 16: - vld1.8 {q8, q9}, [r3, :128]! + vld1.8 {q10, q11}, [r3, :64]! subs r5, r5, #4 - vld1.8 {q10, q11}, [r3, :128]! + vand.u8 q8, q10, q15 + vshr.u8 q9, q10, #4 + vand.u8 q10, q11, q15 + vshr.u8 q11, q11, #4 + vzip.8 q8, q9 + vzip.8 q10, q11 vtbl.8 d16, {d0}, d16 vtbl.8 d17, {d0}, d17 vtbl.8 d18, {d0}, d18 @@ -1650,9 +1661,14 @@ L(pal_pred_tbl): 320: lsl r1, r1, #1 32: - vld1.8 {q8, q9}, [r3, :128]! + vld1.8 {q10, q11}, [r3, :64]! subs r5, r5, #2 - vld1.8 {q10, q11}, [r3, :128]! + vand.u8 q8, q10, q15 + vshr.u8 q9, q10, #4 + vand.u8 q10, q11, q15 + vshr.u8 q11, q11, #4 + vzip.8 q8, q9 + vzip.8 q10, q11 vtbl.8 d16, {d0}, d16 vtbl.8 d17, {d0}, d17 vtbl.8 d18, {d0}, d18 @@ -1668,9 +1684,14 @@ L(pal_pred_tbl): 640: sub r1, r1, #32 64: - vld1.8 {q8, q9}, [r3, :128]! + vld1.8 {q10, q11}, [r3, :64]! subs r5, r5, #1 - vld1.8 {q10, q11}, [r3, :128]! 
+ vand.u8 q8, q10, q15 + vshr.u8 q9, q10, #4 + vand.u8 q10, q11, q15 + vshr.u8 q11, q11, #4 + vzip.8 q8, q9 + vzip.8 q10, q11 vtbl.8 d16, {d0}, d16 vtbl.8 d17, {d0}, d17 vtbl.8 d18, {d0}, d18 diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/ipred16.S b/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/ipred16.S index 993d9500a..fa7804976 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/ipred16.S +++ b/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/ipred16.S @@ -1732,7 +1732,7 @@ function ipred_filter_16bpc_neon, export=1 endfunc // void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride, -// const uint16_t *const pal, const uint8_t *idx, +// const pixel *const pal, const uint8_t *idx, // const int w, const int h); function pal_pred_16bpc_neon, export=1 push {r4-r5, lr} @@ -1742,6 +1742,7 @@ function pal_pred_16bpc_neon, export=1 clz lr, r4 adr r12, L(pal_pred_tbl) sub lr, lr, #25 + vmov.i8 q13, #7 ldr lr, [r12, lr, lsl #2] vmov.i16 q15, #0x100 add r12, r12, lr @@ -1759,8 +1760,11 @@ L(pal_pred_tbl): 40: lsl r1, r1, #1 4: - vld1.8 {q1}, [r3, :128]! + vld1.8 {d2}, [r3, :64]! subs r5, r5, #4 + vshr.u8 d3, d2, #4 + vand.u8 d2, d2, d26 + vzip.8 d2, d3 // Restructure q1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ... vadd.i8 q0, q1, q1 vadd.i8 q1, q1, q1 @@ -1780,8 +1784,11 @@ L(pal_pred_tbl): 80: lsl r1, r1, #1 8: - vld1.8 {q1, q2}, [r3, :128]! + vld1.8 {q1}, [r3, :64]! subs r5, r5, #4 + vshr.u8 q2, q1, #4 + vand.u8 q1, q1, q13 + vzip.8 q1, q2 // Prefer doing the adds twice, instead of chaining a vmov after // the add. vadd.i8 q0, q1, q1 @@ -1811,9 +1818,14 @@ L(pal_pred_tbl): 160: lsl r1, r1, #1 16: - vld1.8 {q2, q3}, [r3, :128]! + vld1.8 {q10, q11}, [r3, :64]! subs r5, r5, #4 - vld1.8 {q10, q11}, [r3, :128]! 
+ vand.u8 q2, q10, q13 + vshr.u8 q3, q10, #4 + vand.u8 q10, q11, q13 + vshr.u8 q11, q11, #4 + vzip.8 q2, q3 + vzip.8 q10, q11 vadd.i8 q0, q2, q2 vadd.i8 q1, q2, q2 vadd.i8 q2, q3, q3 @@ -1860,9 +1872,14 @@ L(pal_pred_tbl): lsl r1, r1, #1 sub r1, r1, #32 32: - vld1.8 {q2, q3}, [r3, :128]! + vld1.8 {q10, q11}, [r3, :64]! subs r5, r5, #2 - vld1.8 {q10, q11}, [r3, :128]! + vand.u8 q2, q10, q13 + vshr.u8 q3, q10, #4 + vand.u8 q10, q11, q13 + vshr.u8 q11, q11, #4 + vzip.8 q2, q3 + vzip.8 q10, q11 vadd.i8 q0, q2, q2 vadd.i8 q1, q2, q2 vadd.i8 q2, q3, q3 @@ -1908,9 +1925,14 @@ L(pal_pred_tbl): 640: sub r1, r1, #96 64: - vld1.8 {q2, q3}, [r3, :128]! + vld1.8 {q10, q11}, [r3, :64]! subs r5, r5, #1 - vld1.8 {q10, q11}, [r3, :128]! + vand.u8 q2, q10, q13 + vshr.u8 q3, q10, #4 + vand.u8 q10, q11, q13 + vshr.u8 q11, q11, #4 + vzip.8 q2, q3 + vzip.8 q10, q11 vadd.i8 q0, q2, q2 vadd.i8 q1, q2, q2 vadd.i8 q2, q3, q3 diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/refmvs.S b/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/refmvs.S index e16c5448d..7f31db11e 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/refmvs.S +++ b/prog/3rdPartyLibs/codecs/dav1d/src/arm/32/refmvs.S @@ -95,3 +95,209 @@ L(splat_tbl): bgt 1b pop {r4, pc} endfunc + +const mv_tbls, align=4 + .byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 + .byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0 + .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4 + .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4 +endconst + +const mask_mult, align=4 + .byte 1, 2, 1, 2, 0, 0, 0, 0 +endconst + +// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride, +// refmvs_block **rr, const uint8_t *ref_sign, +// int col_end8, int row_end8, +// int col_start8, int row_start8) +function save_tmvs_neon, export=1 + push {r4-r11,lr} + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] + + vmov.i8 d30, #0 + vld1.8 {d31}, [r3] + adr r8, L(save_tmvs_tbl) + movrel_local lr, mask_mult + 
movrel_local r12, mv_tbls + vld1.8 {d29}, [lr] + vext.8 d31, d30, d31, #7 // [0, ref_sign] + mov r3, #5 + mul r1, r1, r3 // stride *= 5 + sub r5, r5, r7 // h = row_end8 - row_start8 + lsl r7, r7, #1 // row_start8 <<= 1 +1: + mov r3, #5 + mov r11, #12*2 + and r9, r7, #30 // (y & 15) * 2 + ldr r9, [r2, r9, lsl #2] // b = rr[(y & 15) * 2] + add r9, r9, #12 // &b[... + 1] + mla r10, r4, r11, r9 // end_cand_b = &b[col_end8*2 + 1] + mla r9, r6, r11, r9 // cand_b = &b[x*2 + 1] + + mla r3, r6, r3, r0 // &rp[x] + + push {r2,r4,r6} + +2: + ldrb r11, [r9, #10] // cand_b->bs + add lr, r9, #8 + vld1.8 {d0, d1}, [r9] // cand_b->mv + add r11, r8, r11, lsl #3 + vld1.16 {d2[]}, [lr] // cand_b->ref + ldrh lr, [r11] // bw8 + mov r2, r8 + add r9, r9, lr, lsl #1 // cand_b += bw8*2 + cmp r9, r10 + vmov d4, d0 + bge 3f + + ldrb r2, [r9, #10] // cand_b->bs + add lr, r9, #8 + vld1.8 {d6, d7}, [r9] // cand_b->mv + add r2, r8, r2, lsl #3 + vld1.16 {d2[1]}, [lr] // cand_b->ref + ldrh lr, [r2] // bw8 + add r9, r9, lr, lsl #1 // cand_b += bw8*2 + vmov d5, d6 + +3: + vabs.s16 q2, q2 // abs(mv[].xy) + vtbl.8 d2, {d31}, d2 // ref_sign[ref] + vshr.u16 q2, q2, #12 // abs(mv[].xy) >> 12 + vmull.u8 q1, d2, d29 // ref_sign[ref] * {1, 2} + vceq.i32 q2, q2, #0 // abs(mv[].xy) <= 4096 + vmovn.i32 d4, q2 // abs() condition to 16 bit + vand d2, d2, d4 // h[0-3] contains conditions for mv[0-1] + vpadd.i16 d2, d2, d2 // Combine condition for [1] and [0] + vmov.u16 r4, d2[0] // Extract case for first block + vmov.u16 r6, d2[1] + ldr r11, [r11, #4] // Fetch jump table entry + ldr r2, [r2, #4] + add r4, r12, r4, lsl #4 + add r6, r12, r6, lsl #4 + vld1.8 {d2, d3}, [r4] // Load permutation table base on case + vld1.8 {d4, d5}, [r6] + add r11, r8, r11 // Find jump table target + add r2, r8, r2 + vtbl.8 d16, {d0, d1}, d2 // Permute cand_b to output refmvs_temporal_block + vtbl.8 d17, {d0, d1}, d3 + vtbl.8 d18, {d6, d7}, d4 + vtbl.8 d19, {d6, d7}, d5 + vmov q0, q8 + + // q1 follows on q0 (q8), with another 3 full 
repetitions of the pattern. + vext.8 q1, q8, q8, #1 + vext.8 q10, q9, q9, #1 + // q2 ends with 3 complete repetitions of the pattern. + vext.8 q2, q8, q1, #4 + vext.8 q11, q9, q10, #4 + + blx r11 + bge 4f // if (cand_b >= end) + vmov q0, q9 + vmov q1, q10 + vmov q2, q11 + cmp r9, r10 + blx r2 + blt 2b // if (cand_b < end) + +4: + pop {r2,r4,r6} + + subs r5, r5, #1 // h-- + add r7, r7, #2 // y += 2 + add r0, r0, r1 // rp += stride + bgt 1b + + pop {r4-r11,pc} + + .align 2 +L(save_tmvs_tbl): + .word 16 * 12 + .word 160f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 16 * 12 + .word 160f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 8 * 12 + .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 8 * 12 + .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 8 * 12 + .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 8 * 12 + .word 80f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 4 * 12 + .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 4 * 12 + .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 4 * 12 + .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 4 * 12 + .word 40f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 2 * 12 + .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 2 * 12 + .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 2 * 12 + .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 2 * 12 + .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 2 * 12 + .word 20f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 1 * 12 + .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 1 * 12 + .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 1 * 12 + .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 1 * 12 + .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 1 * 12 + .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 1 * 12 + .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB + .word 1 * 12 + .word 10f - L(save_tmvs_tbl) + CONFIG_THUMB + +10: + add r4, r3, #4 + vst1.32 {d0[0]}, [r3] + vst1.8 {d0[4]}, [r4] + add r3, r3, #5 + bx lr +20: + add r4, r3, #8 + vst1.8 {d0}, [r3] + vst1.16 {d1[0]}, 
[r4] + add r3, r3, #2*5 + bx lr +40: + add r4, r3, #16 + vst1.8 {q0}, [r3] + vst1.32 {d2[0]}, [r4] + add r3, r3, #4*5 + bx lr +80: + add r4, r3, #(8*5-16) + // This writes 6 full entries plus 2 extra bytes + vst1.8 {q0, q1}, [r3] + // Write the last few, overlapping with the first write. + vst1.8 {q2}, [r4] + add r3, r3, #8*5 + bx lr +160: + add r4, r3, #6*5 + add r6, r3, #12*5 + // This writes 6 full entries plus 2 extra bytes + vst1.8 {q0, q1}, [r3] + // Write another 6 full entries, slightly overlapping with the first set + vst1.8 {q0, q1}, [r4] + add r4, r3, #(16*5-16) + // Write 8 bytes (one full entry) after the first 12 + vst1.8 {d0}, [r6] + // Write the last 3 entries + vst1.8 {q2}, [r4] + add r3, r3, #16*5 + bx lr +endfunc diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/filmgrain.S b/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/filmgrain.S index 6cdd7ec5f..aa7f18bf3 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/filmgrain.S +++ b/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/filmgrain.S @@ -1409,14 +1409,14 @@ function fgy_32x32_8bpc_neon, export=1 ldr w11, [sp, #24] // type adr x13, L(fgy_loop_tbl) - add x4, x12, #32 // grain_lut += BLOCK_SIZE * bx - add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add x4, x12, #32 // grain_lut += FG_BLOCK_SIZE * bx + add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by tst w11, #1 ldrh w11, [x13, w11, uxtw #1] - add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by - add x8, x8, #32 // grain_lut += BLOCK_SIZE * bx + add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by + add x8, x8, #32 // grain_lut += FG_BLOCK_SIZE * bx sub x11, x13, w11, uxtw @@ -1638,10 +1638,10 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1 add_offset x17, w16, x17, x5, x10 add_offset x5, w8, x11, x5, x10 - add x4, x13, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx - add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by - add x11, 
x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by - add x11, x11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add x4, x13, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx + add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by + add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by + add x11, x11, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx ldr w13, [sp, #64] // type diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/filmgrain16.S b/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/filmgrain16.S index 7c4ff6dda..75252acfb 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/filmgrain16.S +++ b/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/filmgrain16.S @@ -1308,14 +1308,14 @@ function fgy_32x32_16bpc_neon, export=1 ldr w11, [sp, #88] // type adr x13, L(fgy_loop_tbl) - add x4, x12, #32*2 // grain_lut += BLOCK_SIZE * bx - add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by + add x4, x12, #32*2 // grain_lut += FG_BLOCK_SIZE * bx + add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by tst w11, #1 ldrh w11, [x13, w11, uxtw #1] - add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by - add x8, x8, #32*2 // grain_lut += BLOCK_SIZE * bx + add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by + add x8, x8, #32*2 // grain_lut += FG_BLOCK_SIZE * bx sub x11, x13, w11, uxtw @@ -1581,10 +1581,10 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1 add_offset x17, w16, x17, x5, x10 add_offset x5, w8, x11, x5, x10 - add x4, x13, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx - add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by - add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by - add x11, x11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx + add x4, x13, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx + add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride 
* FG_BLOCK_SIZE * by + add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by + add x11, x11, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx ldr w13, [sp, #112] // type diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/ipred.S b/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/ipred.S index 84c34b8f5..709238e2f 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/ipred.S +++ b/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/ipred.S @@ -1375,6 +1375,2390 @@ L(ipred_smooth_h_tbl): .hword L(ipred_smooth_h_tbl) - 40b endfunc +const padding_mask_buf + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +padding_mask: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +endconst + +// void ipred_z1_upsample_edge_8bpc_neon(pixel *out, const int hsz, +// const pixel *const in, const int end); +function ipred_z1_upsample_edge_8bpc_neon, export=1 + movrel x4, padding_mask + ld1 {v0.16b}, [x2] // in[] + add x5, x2, w3, uxtw // in[end] + sub x4, x4, w3, uxtw + + ld1r {v1.16b}, [x5] // padding + ld1 {v3.16b}, [x4] // padding_mask + + movi v31.8h, #9 + + bit v0.16b, v1.16b, v3.16b // padded in[] + + ext v4.16b, v0.16b, v1.16b, #1 + ext v5.16b, v0.16b, v1.16b, #2 + ext v6.16b, v0.16b, v1.16b, #3 + + uaddl v16.8h, v4.8b, v5.8b // in[i+1] + in[i+2] + uaddl2 v17.8h, v4.16b, v5.16b + uaddl v18.8h, v0.8b, v6.8b // in[i+0] + in[i+3] + uaddl2 v19.8h, v0.16b, v6.16b + mul v16.8h, v16.8h, v31.8h // 9*(in[i+1] + in[i+2]) + mul v17.8h, v17.8h, v31.8h + sub v16.8h, v16.8h, v18.8h + sub v17.8h, v17.8h, v19.8h + + sqrshrun v16.8b, v16.8h, #4 + sqrshrun2 v16.16b, v17.8h, #4 + + zip1 v0.16b, v4.16b, v16.16b + zip2 v1.16b, v4.16b, v16.16b + + 
st1 {v0.16b, v1.16b}, [x0] + + ret +endfunc + +// void ipred_z2_upsample_edge_8bpc_neon(pixel *out, const int sz, +// const pixel *const in); +function ipred_z2_upsample_edge_8bpc_neon, export=1 + // Here, sz is 4 or 8, and we produce 2*sz+1 output elements. + movrel x4, padding_mask + ld1 {v0.16b}, [x2] // in[] + add x5, x2, w1, uxtw // in[sz] + sub x4, x4, w1, uxtw + + ld1r {v2.16b}, [x2] // in[0] for padding + ld1r {v1.16b}, [x5] // padding + ld1 {v3.16b}, [x4] // padding_mask + + movi v31.8h, #9 + + bit v0.16b, v1.16b, v3.16b // padded in[] + + ext v4.16b, v2.16b, v0.16b, #15 + ext v5.16b, v0.16b, v1.16b, #1 + ext v6.16b, v0.16b, v1.16b, #2 + + uaddl v16.8h, v0.8b, v5.8b // in[i+0] + in[i+1] + uaddl v18.8h, v4.8b, v6.8b // in[i-1] + in[i+2] + mul v16.8h, v16.8h, v31.8h // 9*(in[i+1] + in[i+2]) + sub v16.8h, v16.8h, v18.8h + + sqrshrun v16.8b, v16.8h, #4 + + add x5, x0, #16 + + zip1 v2.16b, v0.16b, v16.16b + + st1 {v1.b}[0], [x5] + // In case sz=8, output one single pixel in out[16]. + st1 {v2.16b}, [x0] + + ret +endfunc + +const edge_filter + .byte 0, 4, 8, 0 + .byte 0, 5, 6, 0 +// Leaving out the coeffs for strength=3 +// .byte 2, 4, 4, 0 +endconst + +// void ipred_z1_filter_edge_8bpc_neon(pixel *out, const int sz, +// const pixel *const in, const int end, +// const int strength); +function ipred_z1_filter_edge_8bpc_neon, export=1 + cmp w4, #3 + b.eq L(fivetap) // if (strength == 3) goto fivetap + + movrel x5, edge_filter, -3 + add x5, x5, w4, uxtw #2 // edge_filter + (strength - 1)*4 + 1 + + ld1 {v31.h}[0], [x5] // kernel[1-2] + + ld1 {v0.16b}, [x2], #16 + + dup v30.16b, v31.b[0] + dup v31.16b, v31.b[1] +1: + // in[end], is the last valid pixel. We produce 16 pixels out by + // using 18 pixels in - the last pixel used is [17] of the ones + // read/buffered. 
+ cmp w3, #17 + ld1 {v1.16b}, [x2], #16 + b.lt 2f + ext v2.16b, v0.16b, v1.16b, #1 + ext v3.16b, v0.16b, v1.16b, #2 + umull v4.8h, v0.8b, v30.8b + umlal v4.8h, v2.8b, v31.8b + umlal v4.8h, v3.8b, v30.8b + umull2 v5.8h, v0.16b, v30.16b + umlal2 v5.8h, v2.16b, v31.16b + umlal2 v5.8h, v3.16b, v30.16b + subs w1, w1, #16 + mov v0.16b, v1.16b + rshrn v4.8b, v4.8h, #4 + rshrn2 v4.16b, v5.8h, #4 + sub w3, w3, #16 + st1 {v4.16b}, [x0], #16 + b.gt 1b + ret +2: + // Right padding + + // x2[w3-32] is the padding pixel (x2 points 32 bytes ahead) + movrel x5, padding_mask + sub w6, w3, #32 + sub x5, x5, w3, uxtw + add x6, x2, w6, sxtw + + ld1 {v2.16b}, [x5] // padding_mask + + ld1r {v1.16b}, [x6] + bit v0.16b, v1.16b, v2.16b // Pad v0-v1 + + // Filter one block + ext v2.16b, v0.16b, v1.16b, #1 + ext v3.16b, v0.16b, v1.16b, #2 + umull v4.8h, v0.8b, v30.8b + umlal v4.8h, v2.8b, v31.8b + umlal v4.8h, v3.8b, v30.8b + umull2 v5.8h, v0.16b, v30.16b + umlal2 v5.8h, v2.16b, v31.16b + umlal2 v5.8h, v3.16b, v30.16b + subs w1, w1, #16 + rshrn v4.8b, v4.8h, #4 + rshrn2 v4.16b, v5.8h, #4 + st1 {v4.16b}, [x0], #16 + b.le 9f +5: + // After one block, any remaining output would only be filtering + // padding - thus just store the padding. + subs w1, w1, #16 + st1 {v1.16b}, [x0], #16 + b.gt 5b +9: + ret + +L(fivetap): + sub x2, x2, #1 // topleft -= 1 + movi v29.16b, #2 + ld1 {v0.16b}, [x2], #16 + movi v30.16b, #4 + movi v31.16b, #4 + ins v0.b[0], v0.b[1] +1: + // in[end+1], is the last valid pixel. We produce 16 pixels out by + // using 20 pixels in - the last pixel used is [19] of the ones + // read/buffered. 
+ cmp w3, #18 + ld1 {v1.16b}, [x2], #16 + b.lt 2f // if (end + 1 < 19) + ext v2.16b, v0.16b, v1.16b, #1 + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v0.16b, v1.16b, #3 + ext v5.16b, v0.16b, v1.16b, #4 + umull v6.8h, v0.8b, v29.8b + umlal v6.8h, v2.8b, v30.8b + umlal v6.8h, v3.8b, v31.8b + umlal v6.8h, v4.8b, v30.8b + umlal v6.8h, v5.8b, v29.8b + umull2 v7.8h, v0.16b, v29.16b + umlal2 v7.8h, v2.16b, v30.16b + umlal2 v7.8h, v3.16b, v31.16b + umlal2 v7.8h, v4.16b, v30.16b + umlal2 v7.8h, v5.16b, v29.16b + subs w1, w1, #16 + mov v0.16b, v1.16b + rshrn v6.8b, v6.8h, #4 + rshrn2 v6.16b, v7.8h, #4 + sub w3, w3, #16 + st1 {v6.16b}, [x0], #16 + b.gt 1b + ret +2: + // Right padding + + // x2[w3+1-32] is the padding pixel (x2 points 32 bytes ahead) + movrel x5, padding_mask, -1 + sub w6, w3, #31 + sub x5, x5, w3, uxtw + add x6, x2, w6, sxtw + + ld1 {v2.16b, v3.16b}, [x5] // padding_mask + + ld1r {v28.16b}, [x6] + bit v0.16b, v28.16b, v2.16b // Pad v0-v1 + bit v1.16b, v28.16b, v3.16b +4: + // Filter one block + ext v2.16b, v0.16b, v1.16b, #1 + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v0.16b, v1.16b, #3 + ext v5.16b, v0.16b, v1.16b, #4 + umull v6.8h, v0.8b, v29.8b + umlal v6.8h, v2.8b, v30.8b + umlal v6.8h, v3.8b, v31.8b + umlal v6.8h, v4.8b, v30.8b + umlal v6.8h, v5.8b, v29.8b + umull2 v7.8h, v0.16b, v29.16b + umlal2 v7.8h, v2.16b, v30.16b + umlal2 v7.8h, v3.16b, v31.16b + umlal2 v7.8h, v4.16b, v30.16b + umlal2 v7.8h, v5.16b, v29.16b + subs w1, w1, #16 + mov v0.16b, v1.16b + mov v1.16b, v28.16b + rshrn v6.8b, v6.8h, #4 + rshrn2 v6.16b, v7.8h, #4 + sub w3, w3, #16 + st1 {v6.16b}, [x0], #16 + b.le 9f + // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to + // filter properly once more - aka (w3 >= 0). + cmp w3, #0 + b.ge 4b +5: + // When w3 <= 0, all remaining pixels in v0-v1 are equal to the + // last valid pixel - thus just output that without filtering. 
+ subs w1, w1, #16 + st1 {v1.16b}, [x0], #16 + b.gt 5b +9: + ret +endfunc + +// void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px, +// const int n); +function ipred_pixel_set_8bpc_neon, export=1 + dup v0.16b, w1 +1: + subs w2, w2, #16 + st1 {v0.16b}, [x0], #16 + b.gt 1b + ret +endfunc + +// void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const top, +// const int width, const int height, +// const int dx, const int max_base_x); +function ipred_z1_fill1_8bpc_neon, export=1 + clz w9, w3 + adr x8, L(ipred_z1_fill1_tbl) + sub w9, w9, #25 + ldrh w9, [x8, w9, uxtw #1] + add x10, x2, w6, uxtw // top[max_base_x] + sub x8, x8, w9, uxtw + ld1r {v31.16b}, [x10] // padding + mov w7, w5 + mov w15, #64 + br x8 +40: + AARCH64_VALID_JUMP_TARGET +4: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 49f + ldr d0, [x2, w8, uxtw] // top[base] + ldr d2, [x2, w10, uxtw] + dup v4.4h, w9 // frac + dup v5.4h, w11 + ext v1.8b, v0.8b, v0.8b, #1 // top[base+1] + ext v3.8b, v2.8b, v2.8b, #1 + usubl v6.8h, v1.8b, v0.8b // top[base+1]-top[base] + usubl v7.8h, v3.8b, v2.8b + ushll v16.8h, v0.8b, #6 // top[base]*64 + ushll v17.8h, v2.8b, #6 + mla v16.4h, v6.4h, v4.4h // + top[base+1]*frac + mla v17.4h, v7.4h, v5.4h + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + st1 {v16.s}[0], [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.s}[0], [x0], x1 + b.gt 4b + ret + +49: + st1 {v31.s}[0], [x0], x1 + subs w4, w4, #2 + st1 {v31.s}[0], [x0], x1 + b.gt 49b + ret + +80: + AARCH64_VALID_JUMP_TARGET +8: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 89f + ldr q0, [x2, w8, uxtw] // top[base] + ldr q2, [x2, w10, uxtw] + dup v4.8b, w9 // frac + dup v5.8b, w11 + sub w9, w15, w9 // 64 - 
frac + sub w11, w15, w11 + dup v6.8b, w9 // 64 - frac + dup v7.8b, w11 + ext v1.16b, v0.16b, v0.16b, #1 // top[base+1] + ext v3.16b, v2.16b, v2.16b, #1 + umull v16.8h, v0.8b, v6.8b // top[base]*(64-frac) + umlal v16.8h, v1.8b, v4.8b // + top[base+1]*frac + umull v17.8h, v2.8b, v7.8b + umlal v17.8h, v3.8b, v5.8b + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + st1 {v16.8b}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.8b}, [x0], x1 + b.gt 8b + ret + +89: + st1 {v31.8b}, [x0], x1 + subs w4, w4, #2 + st1 {v31.8b}, [x0], x1 + b.gt 89b + ret + +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + + mov w12, w3 + + add x13, x0, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw +1: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 169f + add x8, x2, w8, uxtw + add x10, x2, w10, uxtw + dup v4.16b, w9 // frac + dup v5.16b, w11 + ld1 {v0.16b, v1.16b}, [x8], #32 // top[base] + ld1 {v2.16b, v3.16b}, [x10], #32 + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v6.16b, w9 // 64 - frac + dup v7.16b, w11 + add w7, w7, w5 // xpos += dx +2: + ext v16.16b, v0.16b, v1.16b, #1 // top[base+1] + ext v17.16b, v2.16b, v3.16b, #1 + subs w3, w3, #16 + umull v18.8h, v0.8b, v6.8b // top[base]*(64-frac) + umlal v18.8h, v16.8b, v4.8b // + top[base+1]*frac + umull2 v19.8h, v0.16b, v6.16b + umlal2 v19.8h, v16.16b, v4.16b + umull v20.8h, v2.8b, v7.8b + umlal v20.8h, v17.8b, v5.8b + umull2 v21.8h, v2.16b, v7.16b + umlal2 v21.8h, v17.16b, v5.16b + rshrn v16.8b, v18.8h, #6 + rshrn2 v16.16b, v19.8h, #6 + rshrn v17.8b, v20.8h, #6 + rshrn2 v17.16b, v21.8h, #6 + st1 {v16.16b}, [x0], #16 + st1 {v17.16b}, [x13], #16 + b.le 3f + mov v0.16b, v1.16b + ld1 {v1.16b}, [x8], #16 // top[base] + mov v2.16b, v3.16b + ld1 {v3.16b}, [x10], #16 + b 2b + +3: + subs w4, w4, #2 + b.le 9f + add x0, x0, x1 + add x13, x13, x1 + mov w3, w12 + b 1b +9: + ret + +169: + st1 
{v31.16b}, [x0], #16 + subs w3, w3, #16 + st1 {v31.16b}, [x13], #16 + b.gt 169b + subs w4, w4, #2 + b.le 9b + add x0, x0, x1 + add x13, x13, x1 + mov w3, w12 + b 169b + +L(ipred_z1_fill1_tbl): + .hword L(ipred_z1_fill1_tbl) - 640b + .hword L(ipred_z1_fill1_tbl) - 320b + .hword L(ipred_z1_fill1_tbl) - 160b + .hword L(ipred_z1_fill1_tbl) - 80b + .hword L(ipred_z1_fill1_tbl) - 40b +endfunc + +function ipred_z1_fill2_8bpc_neon, export=1 + cmp w3, #8 + add x10, x2, w6, uxtw // top[max_base_x] + ld1r {v31.16b}, [x10] // padding + mov w7, w5 + mov w15, #64 + b.eq 8f + +4: // w == 4 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 49f + ldr d0, [x2, w8, uxtw] // top[base] + ldr d2, [x2, w10, uxtw] + dup v4.4h, w9 // frac + dup v5.4h, w11 + uzp2 v1.8b, v0.8b, v0.8b // top[base+1] + uzp1 v0.8b, v0.8b, v0.8b // top[base] + uzp2 v3.8b, v2.8b, v2.8b + uzp1 v2.8b, v2.8b, v2.8b + usubl v6.8h, v1.8b, v0.8b // top[base+1]-top[base] + usubl v7.8h, v3.8b, v2.8b + ushll v16.8h, v0.8b, #6 // top[base]*64 + ushll v17.8h, v2.8b, #6 + mla v16.4h, v6.4h, v4.4h // + top[base+1]*frac + mla v17.4h, v7.4h, v5.4h + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + st1 {v16.s}[0], [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.s}[0], [x0], x1 + b.gt 4b + ret + +49: + st1 {v31.s}[0], [x0], x1 + subs w4, w4, #2 + st1 {v31.s}[0], [x0], x1 + b.gt 49b + ret + +8: // w == 8 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 89f + ldr q0, [x2, w8, uxtw] // top[base] + ldr q2, [x2, w10, uxtw] + dup v4.8b, w9 // frac + dup v5.8b, w11 + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v6.8b, w9 // 64 - frac + dup v7.8b, w11 + uzp2 v1.16b, v0.16b, v0.16b // top[base+1] + uzp1 v0.16b, v0.16b, v0.16b // top[base] + uzp2 
v3.16b, v2.16b, v2.16b + uzp1 v2.16b, v2.16b, v2.16b + umull v16.8h, v1.8b, v4.8b // top[base+1]*frac + umlal v16.8h, v0.8b, v6.8b // + top[base]*(64-frac) + umull v17.8h, v3.8b, v5.8b + umlal v17.8h, v2.8b, v7.8b + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + st1 {v16.8b}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.8b}, [x0], x1 + b.gt 8b + ret + +89: + st1 {v31.8b}, [x0], x1 + subs w4, w4, #2 + st1 {v31.8b}, [x0], x1 + b.gt 89b + ret +endfunc + +// void ipred_reverse_8bpc_neon(pixel *dst, const pixel *const src, +// const int n); +function ipred_reverse_8bpc_neon, export=1 + sub x1, x1, #16 + add x3, x0, #8 + mov x4, #16 +1: + ld1 {v0.16b}, [x1] + subs w2, w2, #16 + rev64 v0.16b, v0.16b + sub x1, x1, #16 + st1 {v0.d}[1], [x0], x4 + st1 {v0.d}[0], [x3], x4 + b.gt 1b + ret +endfunc + +const increments + .short 0, 1, 2, 3, 4, 5, 6, 7 + .short 8, 9, 10, 11, 12, 13, 14, 15 +endconst + +// void ipred_z2_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const top, +// const pixel *const left, +// const int width, const int height, +// const int dx, const int dy); +function ipred_z2_fill1_8bpc_neon, export=1 + clz w10, w4 + adr x9, L(ipred_z2_fill1_tbl) + sub w10, w10, #25 + ldrh w10, [x9, w10, uxtw #1] + mov w8, #(1 << 6) // xpos = 1 << 6 + sub x9, x9, w10, uxtw + sub w8, w8, w6 // xpos -= dx + + movrel x11, increments + ld1 {v31.8h}, [x11] // increments + neg w7, w7 // -dy + + br x9 +40: + AARCH64_VALID_JUMP_TARGET + + dup v30.4h, w7 // -dy + movi v17.8b, #1 + + mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy + movi v25.16b, #0x3e + add v30.4h, v16.4h, v30.4h // -= dy + + xtn v31.8b, v31.8h // {0,1,2,3} + + // Worst case height for w=4 is 16, but we need at least h+1 elements + ld1 {v0.16b, v1.16b}, [x3] // left[] + + movi v26.16b, #64 + movi v19.16b, #2 + + xtn v27.8b, v30.8h // (uint8_t)ypos + shrn v29.8b, v30.8h, #6 // ypos >> 6 + and v27.8b, v27.8b, v25.8b // frac_y + + add v29.8b, v29.8b, v17.8b // base_y = (ypos 
>> 6) + 1 + + add v30.8b, v29.8b, v17.8b // base_y + 1 + add v28.8b, v29.8b, v19.8b // base_y + 2 + + tbl v16.8b, {v0.16b}, v29.8b // left[base_y] + + trn1 v30.2s, v30.2s, v28.2s // base_y + 1, base_y + 2 + + sub v28.8b, v26.8b, v27.8b // 64 - frac_y + + trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3} + + trn1 v27.2s, v27.2s, v27.2s // frac_y + trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y + + movi v29.8b, #2 +4: + asr w9, w8, #6 // base_x + dup v6.4h, w8 // xpos + sub w8, w8, w6 // xpos -= dx + cmp w9, #-4 // base_x <= -4 + asr w11, w8, #6 // base_x + b.le 49f + + dup v7.4h, w8 // xpos + + ldr d2, [x2, w9, sxtw] // top[base_x] + ldr d4, [x2, w11, sxtw] + + trn1 v6.2d, v6.2d, v7.2d // xpos + + // Cut corners here; only doing tbl over v0 here; we only + // seem to need the last pixel, from v1, after skipping to the + // left-only codepath below. + tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2] + + shrn v20.8b, v6.8h, #6 // first base_x for each row + xtn v6.8b, v6.8h // (uint8_t)xpos + + ext v3.8b, v2.8b, v2.8b, #1 // top[base_x+1] + ext v5.8b, v4.8b, v4.8b, #1 + + and v6.8b, v6.8b, v25.8b // frac_x + + trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1] + + trn1 v2.2s, v2.2s, v4.2s // top[base_x] + trn1 v3.2s, v3.2s, v5.2s // top[base_x+1] + + sub v7.8b, v26.8b, v6.8b // 64 - frac_x + + add v20.8b, v20.8b, v31.8b // actual base_x + + umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y) + umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y + + umull v22.8h, v2.8b, v7.8b // top[base_x]-*(64-frac_x) + umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x + + cmge v20.8b, v20.8b, #0 + + rshrn v16.8b, v16.8h, #6 + rshrn v22.8b, v22.8h, #6 + + bit v16.8b, v22.8b, v20.8b + + st1 {v16.s}[0], [x0], x1 + sub w8, w8, w6 // xpos -= dx + subs w5, w5, #2 + st1 {v16.s}[1], [x0], x1 + b.le 9f + + ext v16.8b, v17.8b, v17.8b, #4 + add v30.8b, v30.8b, v29.8b // base_y += 2 + b 4b + +49: + tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], 
left[base_y+2] + + trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1] + + umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_t) + umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y + rshrn v18.8b, v18.8h, #6 + + st1 {v18.s}[0], [x0], x1 + subs w5, w5, #2 + st1 {v18.s}[1], [x0], x1 + b.le 9f + + ext v16.8b, v17.8b, v17.8b, #4 + add v30.8b, v30.8b, v29.8b // base_y += 2 + b 49b + +9: + ret + +80: + AARCH64_VALID_JUMP_TARGET + + dup v30.8h, w7 // -dy + movi v17.8b, #1 + + mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy + movi v25.16b, #0x3e + add v30.8h, v16.8h, v30.8h // -= dy + + xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} + + // Worst case height for w=8 is 32, but we need at least h+1 elements + ld1 {v0.16b, v1.16b, v2.16b}, [x3] // left[] + + movi v26.16b, #64 + movi v19.16b, #2 + + xtn v27.8b, v30.8h // (uint8_t)ypos + shrn v29.8b, v30.8h, #6 // ypos >> 6 + and v27.8b, v27.8b, v25.8b // frac_y + + add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 + + // Cut corners here; for the first row we don't expect to need to + // read outside of v0. + tbl v18.8b, {v0.16b}, v29.8b // left[base_y] + + add v30.8b, v29.8b, v19.8b // base_y + 2 + add v29.8b, v29.8b, v17.8b // base_y + 1 + + sub v28.8b, v26.8b, v27.8b // 64 - frac_y + + trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7} + + movi v24.8b, #2 // 2 +8: + asr w9, w8, #6 // base_x + dup v16.8h, w8 // xpos + sub w8, w8, w6 // xpos -= dx + cmp w9, #-8 // base_x <= -8 + asr w11, w8, #6 // base_x + b.le 89f + + dup v17.8h, w8 // xpos + + ldr q4, [x2, w9, sxtw] // top[base_x] + ldr q6, [x2, w11, sxtw] + + // Cut corners here; only doing tbl over v0-v1 here; we only + // seem to need the last pixel, from v2, after skipping to the + // left-only codepath below. 
+ tbl v19.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+1] + + shrn v21.8b, v16.8h, #6 // first base_x + shrn2 v21.16b, v17.8h, #6 + xtn v16.8b, v16.8h // (uint8_t)xpos + xtn2 v16.16b, v17.8h + + tbl v20.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+2] + + ext v5.16b, v4.16b, v4.16b, #1 // top[base_x+1] + ext v7.16b, v6.16b, v6.16b, #1 + + and v16.16b, v16.16b, v25.16b // frac_x + + trn1 v4.2d, v4.2d, v6.2d // top[base_x] + trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] + + sub v7.16b, v26.16b, v16.16b // 64 - frac_x + + add v21.16b, v21.16b, v31.16b // actual base_x + + umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) + umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y + umull v17.8h, v19.8b, v28.8b + umlal v17.8h, v20.8b, v27.8b + + umull v22.8h, v4.8b, v7.8b // top[base_x]-*(64-frac_x) + umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x + umull2 v23.8h, v4.16b, v7.16b + umlal2 v23.8h, v5.16b, v16.16b + + cmge v21.16b, v21.16b, #0 + + rshrn v6.8b, v6.8h, #6 + rshrn2 v6.16b, v17.8h, #6 + rshrn v22.8b, v22.8h, #6 + rshrn2 v22.16b, v23.8h, #6 + + bit v6.16b, v22.16b, v21.16b + + st1 {v6.d}[0], [x0], x1 + sub w8, w8, w6 // xpos -= dx + subs w5, w5, #2 + st1 {v6.d}[1], [x0], x1 + b.le 9f + + mov v18.8b, v20.8b + add v29.8b, v29.8b, v24.8b // base_y += 2 + add v30.8b, v30.8b, v24.8b // base_y += 2 + b 8b + +89: + tbl v19.8b, {v0.16b, v1.16b, v2.16b}, v29.8b // left[base_y+1] + tbl v20.8b, {v0.16b, v1.16b, v2.16b}, v30.8b // left[base_y+2] + + umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) + umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y + umull v17.8h, v19.8b, v28.8b + umlal v17.8h, v20.8b, v27.8b + + rshrn v6.8b, v6.8h, #6 + rshrn2 v6.16b, v17.8h, #6 + + st1 {v6.d}[0], [x0], x1 + subs w5, w5, #2 + st1 {v6.d}[1], [x0], x1 + b.le 9f + + mov v18.8b, v20.8b + add v29.8b, v29.8b, v24.8b // base_y += 2 + add v30.8b, v30.8b, v24.8b // base_y += 2 + b 89b + +9: + ret + +160: + AARCH64_VALID_JUMP_TARGET + + stp d8, d9, [sp, #-0x40]! 
+ stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + + add x11, x11, #16 // increments + + dup v18.8h, w7 // -dy + movi v17.16b, #1 + add x3, x3, #1 // Skip past left[0] + + ld1 {v14.8h}, [x11] // {8,9,10,11,12,13,14,15} + + mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy + mul v19.8h, v14.8h, v18.8h // {8,9,10,11,12,13,14,15}* -dy + movi v25.16b, #0x3e + add v16.8h, v16.8h, v18.8h // -= dy + add v18.8h, v19.8h, v18.8h + + xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} + xtn2 v31.16b, v14.8h // {8,9,10,11,12,13,14,15} + + // Worst case height is 64. + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[] + ld1r {v15.16b}, [x2] // left[0] == top[0] + + movi v26.16b, #64 + movi v19.16b, #2 + + xtn v27.8b, v16.8h // (uint8_t)ypos + xtn2 v27.16b, v18.8h + shrn v29.8b, v16.8h, #6 // ypos >> 6 + shrn2 v29.16b, v18.8h, #6 + mov v18.16b, v15.16b // left[0] + and v27.16b, v27.16b, v25.16b // frac_y + + // Cut corners here; for the first row we don't expect to need to + // read outside of v0. 
+ tbx v18.16b, {v0.16b}, v29.16b // left[base_y] + + add v30.16b, v29.16b, v19.16b // base_y + 2 + add v29.16b, v29.16b, v17.16b // base_y + 1 + + sub v28.16b, v26.16b, v27.16b // 64 - frac_y + + movi v24.16b, #2 // 2 +16: + asr w9, w8, #6 // base_x + dup v16.8h, w8 // xpos + sub w8, w8, w6 // xpos -= dx + cmp w9, #-16 // base_x <= -16 + asr w11, w8, #6 // base_x + b.le 169f + + dup v17.8h, w8 // xpos + + add x9, x2, w9, sxtw + add x11, x2, w11, sxtw + + ld1 {v4.16b, v5.16b}, [x9] // top[base_x] + mov v19.16b, v15.16b // left[0] + ld1 {v6.16b, v7.16b}, [x11] + + tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] + + mov v20.16b, v15.16b // left[0] + + shrn v21.8b, v16.8h, #6 // first base_x + shrn v22.8b, v17.8h, #6 + xtn v16.8b, v16.8h // (uint8_t)xpos + xtn v17.8b, v17.8h + + tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] + + trn1 v21.2d, v21.2d, v21.2d // first base_x + trn1 v22.2d, v22.2d, v22.2d + trn1 v16.2d, v16.2d, v16.2d // (uint8_t)xpos + trn1 v17.2d, v17.2d, v17.2d + + ext v5.16b, v4.16b, v5.16b, #1 // top[base_x+1] + ext v7.16b, v6.16b, v7.16b, #1 + + and v16.16b, v16.16b, v25.16b // frac_x + and v17.16b, v17.16b, v25.16b + + umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) + umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y + + sub v8.16b, v26.16b, v16.16b // 64 - frac_x + sub v9.16b, v26.16b, v17.16b + + umull2 v11.8h, v18.16b, v28.16b + umlal2 v11.8h, v19.16b, v27.16b + + add v21.16b, v21.16b, v31.16b // actual base_x + add v22.16b, v22.16b, v31.16b + + umull v12.8h, v19.8b, v28.8b + umlal v12.8h, v20.8b, v27.8b + umull2 v13.8h, v19.16b, v28.16b + umlal2 v13.8h, v20.16b, v27.16b + + rshrn v10.8b, v10.8h, #6 + rshrn2 v10.16b, v11.8h, #6 + rshrn v11.8b, v12.8h, #6 + rshrn2 v11.16b, v13.8h, #6 + + umull v12.8h, v4.8b, v8.8b // top[base_x]-*(64-frac_x) + umlal v12.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x + umull2 v13.8h, v4.16b, v8.16b + umlal2 v13.8h, v5.16b, v16.16b + umull v14.8h, v6.8b, 
v9.8b + umlal v14.8h, v7.8b, v17.8b + umull2 v18.8h, v6.16b, v9.16b + umlal2 v18.8h, v7.16b, v17.16b + + cmge v21.16b, v21.16b, #0 + cmge v22.16b, v22.16b, #0 + + rshrn v12.8b, v12.8h, #6 + rshrn2 v12.16b, v13.8h, #6 + rshrn v13.8b, v14.8h, #6 + rshrn2 v13.16b, v18.8h, #6 + + bit v10.16b, v12.16b, v21.16b + bit v11.16b, v13.16b, v22.16b + + st1 {v10.16b}, [x0], x1 + subs w5, w5, #2 + sub w8, w8, w6 // xpos -= dx + st1 {v11.16b}, [x0], x1 + b.le 9f + + mov v18.16b, v20.16b + add v29.16b, v29.16b, v24.16b // base_y += 2 + add v30.16b, v30.16b, v24.16b // base_y += 2 + b 16b + +169: + mov v19.16b, v15.16b + mov v20.16b, v15.16b + tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] + tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] + + umull v4.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) + umlal v4.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y + umull2 v5.8h, v18.16b, v28.16b + umlal2 v5.8h, v19.16b, v27.16b + umull v6.8h, v19.8b, v28.8b + umlal v6.8h, v20.8b, v27.8b + umull2 v7.8h, v19.16b, v28.16b + umlal2 v7.8h, v20.16b, v27.16b + + rshrn v4.8b, v4.8h, #6 + rshrn2 v4.16b, v5.8h, #6 + rshrn v5.8b, v6.8h, #6 + rshrn2 v5.16b, v7.8h, #6 + + st1 {v4.16b}, [x0], x1 + subs w5, w5, #2 + st1 {v5.16b}, [x0], x1 + b.le 9f + + mov v18.16b, v20.16b + add v29.16b, v29.16b, v24.16b // base_y += 2 + add v30.16b, v30.16b, v24.16b // base_y += 2 + b 169b + +9: + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + ret + +320: +640: + AARCH64_VALID_JUMP_TARGET + + stp d8, d9, [sp, #-0x40]! 
+ stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + + add x11, x11, #16 // increments + + dup v25.8h, w7 // -dy + add x3, x3, #1 // Skip past left[0] + + ld1 {v14.8h}, [x11] // {8,9,10,11,12,13,14,15} + + add x13, x0, x1 // alternating row + lsl x1, x1, #1 // stride *= 2 + sub x1, x1, w4, uxtw // stride -= width + + movi v11.8h, #8 + mul v26.8h, v31.8h, v25.8h // {0,1,2,3,4,5,6,7}* -dy + add v26.8h, v26.8h, v25.8h // -= dy + mul v25.8h, v25.8h, v11.8h // -8*dy + + xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} + xtn2 v31.16b, v14.8h // {8,9,10,11,12,13,14,15} + + // Worst case height is 64. + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[] + ld1r {v15.16b}, [x2] // left[0] == top[0] + + mov w12, w4 // orig w + neg w14, w4 // -w + +1: + mov v23.16b, v26.16b // reset ypos + + asr w9, w8, #6 // base_x + dup v16.8h, w8 // xpos + sub w8, w8, w6 // xpos -= dx + cmp w9, w14 // base_x <= -w + asr w11, w8, #6 // base_x + b.le 329f + + dup v17.8h, w8 // xpos + sub w8, w8, w6 // xpos -= dx + + add x9, x2, w9, sxtw + add x11, x2, w11, sxtw + + sqshrn v21.8b, v16.8h, #6 // first base_x + sqshrn v22.8b, v17.8h, #6 + xtn v16.8b, v16.8h // (uint8_t)xpos + xtn v17.8b, v17.8h + + ld1 {v4.16b}, [x9], #16 // top[base_x] + ld1 {v6.16b}, [x11], #16 + + trn1 v21.2d, v21.2d, v21.2d // first base_x + trn1 v22.2d, v22.2d, v22.2d + trn1 v16.2d, v16.2d, v16.2d // (uint8_t)xpos + trn1 v17.2d, v17.2d, v17.2d + + movi v10.16b, #0x3e + movi v11.16b, #64 + + and v16.16b, v16.16b, v10.16b // frac_x + and v17.16b, v17.16b, v10.16b + + sub v8.16b, v11.16b, v16.16b // 64 - frac_x + sub v9.16b, v11.16b, v17.16b + + add v21.16b, v21.16b, v31.16b // actual base_x + add v22.16b, v22.16b, v31.16b + +2: + add v13.8h, v23.8h, v25.8h // ypos -= 8*dy + movi v12.16b, #64 + movi v20.16b, #2 + movi v10.16b, #0x3e + + smov w10, v22.b[0] + + xtn v27.8b, v23.8h // (uint8_t)ypos + xtn2 v27.16b, v13.8h + shrn v29.8b, v23.8h, #6 // ypos >> 6 + shrn2 v29.16b, v13.8h, #6 + cmp w10, #0 // 
base_x (bottom left) >= 0 + and v27.16b, v27.16b, v10.16b // frac_y + + mov v18.16b, v15.16b // left[0] + + b.ge 4f + + add v23.8h, v13.8h, v25.8h // ypos -= 8*dy + movi v13.16b, #1 + + tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] + add v29.16b, v29.16b, v13.16b // base_y + 1 + mov v19.16b, v15.16b // left[0] + + sub v28.16b, v12.16b, v27.16b // 64 - frac_y + + ld1 {v5.16b}, [x9], #16 // top[base_x] + ld1 {v7.16b}, [x11], #16 + + tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] + add v29.16b, v29.16b, v13.16b // base_y + 2 + + mov v20.16b, v15.16b // left[0] + tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] + + umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) + umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y + umull2 v11.8h, v18.16b, v28.16b + umlal2 v11.8h, v19.16b, v27.16b + umull v12.8h, v19.8b, v28.8b + umlal v12.8h, v20.8b, v27.8b + umull2 v13.8h, v19.16b, v28.16b + umlal2 v13.8h, v20.16b, v27.16b + + ext v18.16b, v4.16b, v5.16b, #1 // top[base_x+1] + ext v19.16b, v6.16b, v7.16b, #1 + + rshrn v10.8b, v10.8h, #6 + rshrn2 v10.16b, v11.8h, #6 + rshrn v11.8b, v12.8h, #6 + rshrn2 v11.16b, v13.8h, #6 + + umull v12.8h, v4.8b, v8.8b // top[base_x]-*(64-frac_x) + umlal v12.8h, v18.8b, v16.8b // + top[base_x+1]*frac_x + umull2 v13.8h, v4.16b, v8.16b + umlal2 v13.8h, v18.16b, v16.16b + umull v14.8h, v6.8b, v9.8b + umlal v14.8h, v19.8b, v17.8b + umull2 v20.8h, v6.16b, v9.16b + umlal2 v20.8h, v19.16b, v17.16b + + cmge v18.16b, v21.16b, #0 + cmge v19.16b, v22.16b, #0 + + rshrn v12.8b, v12.8h, #6 + rshrn2 v12.16b, v13.8h, #6 + rshrn v13.8b, v14.8h, #6 + rshrn2 v13.16b, v20.8h, #6 + + bit v10.16b, v12.16b, v18.16b + bit v11.16b, v13.16b, v19.16b + + st1 {v10.16b}, [x0], #16 + subs w4, w4, #16 + st1 {v11.16b}, [x13], #16 + b.le 3f + + movi v10.16b, #16 + mov v4.16b, v5.16b + mov v6.16b, v7.16b + add v21.16b, v21.16b, v10.16b // base_x += 16 + add v22.16b, v22.16b, v10.16b + b 2b + +3: + 
subs w5, w5, #2 + b.le 9f + movi v10.8h, #128 + add x0, x0, x1 + add x13, x13, x1 + mov w4, w12 // reset w + add v26.8h, v26.8h, v10.8h // ypos += 2*(1<<6) + b 1b + +4: // The rest of the row only predicted from top[] + ld1 {v5.16b}, [x9], #16 // top[base_x] + ld1 {v7.16b}, [x11], #16 + + ext v18.16b, v4.16b, v5.16b, #1 // top[base_x+1] + ext v19.16b, v6.16b, v7.16b, #1 + + umull v12.8h, v4.8b, v8.8b // top[base_x]-*(64-frac_x) + umlal v12.8h, v18.8b, v16.8b // + top[base_x+1]*frac_x + umull2 v13.8h, v4.16b, v8.16b + umlal2 v13.8h, v18.16b, v16.16b + umull v14.8h, v6.8b, v9.8b + umlal v14.8h, v19.8b, v17.8b + umull2 v20.8h, v6.16b, v9.16b + umlal2 v20.8h, v19.16b, v17.16b + + rshrn v12.8b, v12.8h, #6 + rshrn2 v12.16b, v13.8h, #6 + rshrn v13.8b, v14.8h, #6 + rshrn2 v13.16b, v20.8h, #6 + + st1 {v12.16b}, [x0], #16 + subs w4, w4, #16 + st1 {v13.16b}, [x13], #16 + b.le 3b + + mov v4.16b, v5.16b + mov v6.16b, v7.16b + b 4b + +329: // The rest of the block only predicted from left[] + add x1, x1, w4, uxtw // restore stride + mov w12, w5 // orig remaining h +1: + add v13.8h, v23.8h, v25.8h // ypos -= 8*dy + movi v12.16b, #64 + movi v10.16b, #0x3e + + xtn v27.8b, v23.8h // (uint8_t)ypos + xtn2 v27.16b, v13.8h + shrn v29.8b, v23.8h, #6 // ypos >> 6 + shrn2 v29.16b, v13.8h, #6 + and v27.16b, v27.16b, v10.16b // frac_y + + mov v18.16b, v15.16b // left[0] + add v23.8h, v13.8h, v25.8h // ypos -= 8*dy + movi v21.16b, #1 + + tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] + add v29.16b, v29.16b, v21.16b // base_y + 1 + + sub v28.16b, v12.16b, v27.16b // 64 - frac_y +2: + mov v19.16b, v15.16b // left[0] + tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] + add v29.16b, v29.16b, v21.16b // base_y + 2 + mov v20.16b, v15.16b // left[0] + tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] + add v29.16b, v29.16b, v21.16b // next base_y + + umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) + umlal v10.8h, v19.8b, 
v27.8b // + left[base_y+1]*frac_y + umull2 v11.8h, v18.16b, v28.16b + umlal2 v11.8h, v19.16b, v27.16b + umull v12.8h, v19.8b, v28.8b + umlal v12.8h, v20.8b, v27.8b + umull2 v13.8h, v19.16b, v28.16b + umlal2 v13.8h, v20.16b, v27.16b + + rshrn v10.8b, v10.8h, #6 + rshrn2 v10.16b, v11.8h, #6 + rshrn v11.8b, v12.8h, #6 + rshrn2 v11.16b, v13.8h, #6 + + st1 {v10.16b}, [x0], x1 + subs w5, w5, #2 + st1 {v11.16b}, [x13], x1 + b.le 3f + mov v18.16b, v20.16b + b 2b + +3: + subs w4, w4, #16 + b.le 9f + + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + lsl x1, x1, #1 + add x0, x0, #16 + add x13, x13, #16 + mov w5, w12 // reset h + b 1b + +9: + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + ret + +L(ipred_z2_fill1_tbl): + .hword L(ipred_z2_fill1_tbl) - 640b + .hword L(ipred_z2_fill1_tbl) - 320b + .hword L(ipred_z2_fill1_tbl) - 160b + .hword L(ipred_z2_fill1_tbl) - 80b + .hword L(ipred_z2_fill1_tbl) - 40b +endfunc + +function ipred_z2_fill2_8bpc_neon, export=1 + cmp w4, #8 + mov w8, #(2 << 6) // xpos = 2 << 6 + sub w8, w8, w6 // xpos -= dx + + movrel x11, increments + ld1 {v31.8h}, [x11] // increments + neg w7, w7 // -dy + b.eq 80f + +40: + dup v30.4h, w7 // -dy + movi v17.8b, #1 + + mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy + movi v25.16b, #0x3e + add v30.4h, v16.4h, v30.4h // -= dy + + xtn v31.8b, v31.8h // {0,1,2,3} + + // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements + // from left. 
+ ld1 {v0.16b}, [x3] // left[] + + movi v26.16b, #64 + movi v19.16b, #2 + + xtn v27.8b, v30.8h // (uint8_t)ypos + shrn v29.8b, v30.8h, #6 // ypos >> 6 + and v27.8b, v27.8b, v25.8b // frac_y + + add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 + + add v30.8b, v29.8b, v17.8b // base_y + 1 + add v28.8b, v29.8b, v19.8b // base_y + 2 + + tbl v16.8b, {v0.16b}, v29.8b // left[base_y] + + trn1 v30.2s, v30.2s, v28.2s // base_y + 1, base_y + 2 + + sub v28.8b, v26.8b, v27.8b // 64 - frac_y + + trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3} + + trn1 v27.2s, v27.2s, v27.2s // frac_y + trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y + + movi v29.8b, #2 + add v31.8b, v31.8b, v31.8b // {0,2,4,6,0,2,4,6} +4: + asr w9, w8, #6 // base_x + dup v6.4h, w8 // xpos + sub w8, w8, w6 // xpos -= dx + cmp w9, #-8 // base_x <= -8 + asr w11, w8, #6 // base_x + b.le 49f + + dup v7.4h, w8 // xpos + + ldr d2, [x2, w9, sxtw] // top[base_x] + ldr d4, [x2, w11, sxtw] + + trn1 v6.2d, v6.2d, v7.2d // xpos + + tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2] + + shrn v20.8b, v6.8h, #6 // first base_x for each row + xtn v6.8b, v6.8h // (uint8_t)xpos + + uzp2 v3.8b, v2.8b, v4.8b // top[base_x+1] + uzp1 v2.8b, v2.8b, v4.8b // top[base_x] + + and v6.8b, v6.8b, v25.8b // frac_x + + trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1] + + sub v7.8b, v26.8b, v6.8b // 64 - frac_x + + add v20.8b, v20.8b, v31.8b // actual base_x + + umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y) + umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y + + umull v22.8h, v2.8b, v7.8b // top[base_x]-*(64-frac_x) + umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x + + cmge v20.8b, v20.8b, #0 + + rshrn v16.8b, v16.8h, #6 + rshrn v22.8b, v22.8h, #6 + + bit v16.8b, v22.8b, v20.8b + + st1 {v16.s}[0], [x0], x1 + sub w8, w8, w6 // xpos -= dx + subs w5, w5, #2 + st1 {v16.s}[1], [x0], x1 + b.le 9f + + ext v16.8b, v17.8b, v17.8b, #4 + add v30.8b, v30.8b, v29.8b // base_y += 2 + b 4b + +49: + tbl v17.8b, 
{v0.16b}, v30.8b // left[base_y+1], left[base_y+2] + + trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1] + + umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_t) + umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y + rshrn v18.8b, v18.8h, #6 + + st1 {v18.s}[0], [x0], x1 + subs w5, w5, #2 + st1 {v18.s}[1], [x0], x1 + b.le 9f + + ext v16.8b, v17.8b, v17.8b, #4 + add v30.8b, v30.8b, v29.8b // base_y += 2 + b 49b + +9: + ret + +80: + dup v30.8h, w7 // -dy + movi v17.8b, #1 + + mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy + movi v25.16b, #0x3e + add v30.8h, v16.8h, v30.8h // -= dy + + xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} + + // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements + // from left. + ld1 {v0.16b}, [x3] // left[] + + movi v26.16b, #64 + movi v19.16b, #2 + + xtn v27.8b, v30.8h // (uint8_t)ypos + shrn v29.8b, v30.8h, #6 // ypos >> 6 + and v27.8b, v27.8b, v25.8b // frac_y + + add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 + + tbl v18.8b, {v0.16b}, v29.8b // left[base_y] + + add v30.8b, v29.8b, v19.8b // base_y + 2 + add v29.8b, v29.8b, v17.8b // base_y + 1 + + sub v28.8b, v26.8b, v27.8b // 64 - frac_y + + trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7} + + movi v24.8b, #2 // 2 + add v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14} +8: + asr w9, w8, #6 // base_x + dup v16.8h, w8 // xpos + sub w8, w8, w6 // xpos -= dx + cmp w9, #-16 // base_x <= -16 + asr w11, w8, #6 // base_x + b.le 89f + + dup v17.8h, w8 // xpos + + ldr q4, [x2, w9, sxtw] // top[base_x] + ldr q6, [x2, w11, sxtw] + + tbl v19.8b, {v0.16b}, v29.8b // left[base_y+1] + + shrn v21.8b, v16.8h, #6 // first base_x + shrn2 v21.16b, v17.8h, #6 + xtn v16.8b, v16.8h // (uint8_t)xpos + xtn2 v16.16b, v17.8h + + tbl v20.8b, {v0.16b}, v30.8b // left[base_y+2] + + uzp2 v5.16b, v4.16b, v6.16b // top[base_x+1] + uzp1 v4.16b, v4.16b, v6.16b // top[base_x] + + and v16.16b, v16.16b, v25.16b // frac_x + + sub v7.16b, v26.16b, 
v16.16b // 64 - frac_x + + add v21.16b, v21.16b, v31.16b // actual base_x + + umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) + umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y + umull v17.8h, v19.8b, v28.8b + umlal v17.8h, v20.8b, v27.8b + + umull v22.8h, v4.8b, v7.8b // top[base_x]-*(64-frac_x) + umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x + umull2 v23.8h, v4.16b, v7.16b + umlal2 v23.8h, v5.16b, v16.16b + + cmge v21.16b, v21.16b, #0 + + rshrn v6.8b, v6.8h, #6 + rshrn2 v6.16b, v17.8h, #6 + rshrn v22.8b, v22.8h, #6 + rshrn2 v22.16b, v23.8h, #6 + + bit v6.16b, v22.16b, v21.16b + + st1 {v6.d}[0], [x0], x1 + sub w8, w8, w6 // xpos -= dx + subs w5, w5, #2 + st1 {v6.d}[1], [x0], x1 + b.le 9f + + mov v18.8b, v20.8b + add v29.8b, v29.8b, v24.8b // base_y += 2 + add v30.8b, v30.8b, v24.8b // base_y += 2 + b 8b + +89: + tbl v19.8b, {v0.16b}, v29.8b // left[base_y+1] + tbl v20.8b, {v0.16b}, v30.8b // left[base_y+2] + + umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) + umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y + umull v17.8h, v19.8b, v28.8b + umlal v17.8h, v20.8b, v27.8b + + rshrn v6.8b, v6.8h, #6 + rshrn2 v6.16b, v17.8h, #6 + + st1 {v6.d}[0], [x0], x1 + subs w5, w5, #2 + st1 {v6.d}[1], [x0], x1 + b.le 9f + + mov v18.8b, v20.8b + add v29.8b, v29.8b, v24.8b // base_y += 2 + add v30.8b, v30.8b, v24.8b // base_y += 2 + b 89b + +9: + ret +endfunc + +function ipred_z2_fill3_8bpc_neon, export=1 + cmp w4, #8 + mov w8, #(1 << 6) // xpos = 1 << 6 + sub w8, w8, w6 // xpos -= dx + + movrel x11, increments + ld1 {v31.8h}, [x11] // increments + neg w7, w7 // -dy + b.eq 80f + +40: + dup v30.4h, w7 // -dy + movi v17.8b, #1 + + mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy + movi v25.16b, #0x3e + add v30.4h, v16.4h, v30.4h // -= dy + + xtn v31.8b, v31.8h // {0,1,2,3} + + // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements. 
+ ld1 {v0.16b, v1.16b}, [x3] // left[] + + movi v26.16b, #64 + movi v19.16b, #2 + + xtn v27.8b, v30.8h // (uint8_t)ypos + shrn v29.8b, v30.8h, #6 // ypos >> 6 + and v27.8b, v27.8b, v25.8b // frac_y + + add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2 + + add v30.8b, v29.8b, v17.8b // base_y + 1 + add v28.8b, v29.8b, v19.8b // base_y + 2 + + trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3} + + add v24.8b, v30.8b, v19.8b // base_y + 3 + + trn1 v29.2s, v29.2s, v28.2s // base_y + 0, base_y + 2 + trn1 v30.2s, v30.2s, v24.2s // base_y + 1, base_y + 3 + + sub v28.8b, v26.8b, v27.8b // 64 - frac_y + + trn1 v27.2s, v27.2s, v27.2s // frac_y + trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y + + movi v24.8b, #4 +4: + asr w9, w8, #6 // base_x + dup v6.4h, w8 // xpos + sub w8, w8, w6 // xpos -= dx + cmp w9, #-4 // base_x <= -4 + asr w11, w8, #6 // base_x + b.le 49f + + dup v7.4h, w8 // xpos + + ldr d2, [x2, w9, sxtw] // top[base_x] + ldr d4, [x2, w11, sxtw] + + trn1 v6.2d, v6.2d, v7.2d // xpos + + tbl v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2] + tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3] + + shrn v20.8b, v6.8h, #6 // first base_x for each row + xtn v6.8b, v6.8h // (uint8_t)xpos + + ext v3.8b, v2.8b, v2.8b, #1 // top[base_x+1] + ext v5.8b, v4.8b, v4.8b, #1 + + and v6.8b, v6.8b, v25.8b // frac_x + + trn1 v2.2s, v2.2s, v4.2s // top[base_x] + trn1 v3.2s, v3.2s, v5.2s // top[base_x+1] + + sub v7.8b, v26.8b, v6.8b // 64 - frac_x + + add v20.8b, v20.8b, v31.8b // actual base_x + + umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y) + umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y + + umull v22.8h, v2.8b, v7.8b // top[base_x]-*(64-frac_x) + umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x + + cmge v20.8b, v20.8b, #0 + + rshrn v16.8b, v16.8h, #6 + rshrn v22.8b, v22.8h, #6 + + bit v16.8b, v22.8b, v20.8b + + st1 {v16.s}[0], [x0], x1 + sub w8, w8, w6 // xpos -= dx + subs w5, w5, #2 + st1 {v16.s}[1], [x0], x1 + b.le 9f + 
+ add v29.8b, v29.8b, v24.8b // base_y += 4 + add v30.8b, v30.8b, v24.8b // base_y += 4 + b 4b + +49: + tbl v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2] + tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3] + + umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_t) + umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y + rshrn v18.8b, v18.8h, #6 + + st1 {v18.s}[0], [x0], x1 + subs w5, w5, #2 + st1 {v18.s}[1], [x0], x1 + b.le 9f + + add v29.8b, v29.8b, v24.8b // base_y += 4 + add v30.8b, v30.8b, v24.8b // base_y += 4 + b 49b + +9: + ret + +80: + dup v30.8h, w7 // -dy + movi v17.8b, #1 + + mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy + movi v25.16b, #0x3e + add v30.8h, v16.8h, v30.8h // -= dy + + xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} + + // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements. + ld1 {v0.16b, v1.16b, v2.16b}, [x3] // left[] + + movi v26.16b, #64 + movi v19.16b, #2 + + xtn v27.8b, v30.8h // (uint8_t)ypos + shrn v29.8b, v30.8h, #6 // ypos >> 6 + and v27.8b, v27.8b, v25.8b // frac_y + + add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2 + + add v28.8b, v29.8b, v17.8b // base_y + 1 + add v30.8b, v29.8b, v19.8b // base_y + 2 + + trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7} + add v24.8b, v28.8b, v19.8b // base_y + 3 + + trn1 v29.2d, v29.2d, v30.2d // base_y + 0, base_y + 2 + trn1 v30.2d, v28.2d, v24.2d // base_y + 1, base_y + 3 + + sub v28.8b, v26.8b, v27.8b // 64 - frac_y + + movi v24.16b, #4 + + trn1 v27.2d, v27.2d, v27.2d // frac_y + trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y +8: + asr w9, w8, #6 // base_x + dup v16.8h, w8 // xpos + sub w8, w8, w6 // xpos -= dx + cmp w9, #-8 // base_x <= -8 + asr w11, w8, #6 // base_x + b.le 89f + + dup v17.8h, w8 // xpos + + ldr q4, [x2, w9, sxtw] // top[base_x] + ldr q6, [x2, w11, sxtw] + + tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2] + tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // 
left[base_y+1], left[base_y+3] + + shrn v21.8b, v16.8h, #6 // first base_x + shrn2 v21.16b, v17.8h, #6 + xtn v16.8b, v16.8h // (uint8_t)xpos + xtn2 v16.16b, v17.8h + + ext v5.16b, v4.16b, v4.16b, #1 // top[base_x+1] + ext v7.16b, v6.16b, v6.16b, #1 + + and v16.16b, v16.16b, v25.16b // frac_x + + trn1 v4.2d, v4.2d, v6.2d // top[base_x] + trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] + + sub v7.16b, v26.16b, v16.16b // 64 - frac_x + + add v21.16b, v21.16b, v31.16b // actual base_x + + umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) + umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y + umull2 v17.8h, v18.16b, v28.16b + umlal2 v17.8h, v19.16b, v27.16b + + umull v22.8h, v4.8b, v7.8b // top[base_x]-*(64-frac_x) + umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x + umull2 v23.8h, v4.16b, v7.16b + umlal2 v23.8h, v5.16b, v16.16b + + cmge v21.16b, v21.16b, #0 + + rshrn v6.8b, v6.8h, #6 + rshrn2 v6.16b, v17.8h, #6 + rshrn v22.8b, v22.8h, #6 + rshrn2 v22.16b, v23.8h, #6 + + bit v6.16b, v22.16b, v21.16b + + st1 {v6.d}[0], [x0], x1 + sub w8, w8, w6 // xpos -= dx + subs w5, w5, #2 + st1 {v6.d}[1], [x0], x1 + b.le 9f + + add v29.16b, v29.16b, v24.16b // base_y += 4 + add v30.16b, v30.16b, v24.16b // base_y += 4 + b 8b + +89: + tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2] + tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3] + + umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) + umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y + umull2 v17.8h, v18.16b, v28.16b + umlal2 v17.8h, v19.16b, v27.16b + + rshrn v6.8b, v6.8h, #6 + rshrn2 v6.16b, v17.8h, #6 + + st1 {v6.d}[0], [x0], x1 + subs w5, w5, #2 + st1 {v6.d}[1], [x0], x1 + b.le 9f + + add v29.16b, v29.16b, v24.16b // base_y += 4 + add v30.16b, v30.16b, v24.16b // base_y += 4 + b 89b + +9: + ret +endfunc + + +// void ipred_z3_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const left, +// const int width, const int height, 
+// const int dy, const int max_base_y); +function ipred_z3_fill1_8bpc_neon, export=1 + cmp w6, #64 + clz w9, w3 + adr x8, L(ipred_z3_fill1_tbl) + sub w9, w9, #25 + ldrh w9, [x8, w9, uxtw #1] + add x10, x2, w6, uxtw // left[max_base_y] + sub x8, x8, w9, uxtw + movrel x11, increments + ld1r {v31.16b}, [x10] // padding + ld1 {v30.8h}, [x11] // increments + mov w7, w5 + b.gt L(ipred_z3_fill1_large_h16) + br x8 + +40: + AARCH64_VALID_JUMP_TARGET + dup v29.4h, w5 // dy + + mul v30.4h, v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy + movi v23.16b, #0x3e + + // Worst case max_base_y is width+height-1, for w=4, h=16, <= 32 + ld1 {v0.16b, v1.16b}, [x2] // left[] + add v30.4h, v29.4h, v30.4h // ypos + + movi v22.16b, #64 + movi v20.16b, #1 + movi v21.16b, #2 + + xtn v24.8b, v30.8h // (uint8_t)ypos + uqshrn v26.8b, v30.8h, #6 // base + and v24.8b, v24.8b, v23.8b // frac + + mov v4.8b, v31.8b + uqadd v27.8b, v26.8b, v20.8b // base + 1 + uqadd v28.8b, v26.8b, v21.8b // base + 2 + sub v25.8b, v22.8b, v24.8b // 64 - frac + + tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base] + + trn1 v27.2s, v27.2s, v28.2s // base + 1, base + 2 + trn1 v24.2s, v24.2s, v24.2s // frac + trn1 v25.2s, v25.2s, v25.2s // 64 - frac +1: + mov v5.8b, v31.8b + tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+2] + + trn1 v4.2s, v4.2s, v5.2s // left[base], left[base+1] + + umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) + umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac + rshrn v16.8b, v16.8h, #6 + st1 {v16.s}[0], [x0], x1 + subs w4, w4, #2 + st1 {v16.s}[1], [x0], x1 + b.le 9f + + ext v4.8b, v5.8b, v5.8b, #4 + uqadd v27.8b, v27.8b, v21.8b // base += 2 + b 1b + +9: + ret + +80: + AARCH64_VALID_JUMP_TARGET + dup v29.8h, w5 // dy + + mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy + movi v23.16b, #0x3e + + // Worst case max_base_y is width+height-1, for w=8, h=32, <= 48 + ld1 {v0.16b, v1.16b, v2.16b}, [x2] // left[] + add v30.8h, v29.8h, v30.8h // ypos + + movi v22.16b, #64 + movi v20.16b, #1 + 
movi v21.16b, #2 + + xtn v24.8b, v30.8h // (uint8_t)ypos + uqshrn v26.8b, v30.8h, #6 // base + and v24.8b, v24.8b, v23.8b // frac + + mov v4.8b, v31.8b + uqadd v27.8b, v26.8b, v20.8b // base + 1 + uqadd v28.8b, v26.8b, v21.8b // base + 2 + sub v25.8b, v22.8b, v24.8b // 64 - frac + + tbx v4.8b, {v0.16b, v1.16b, v2.16b}, v26.8b // left[base] +1: + mov v5.8b, v31.8b + mov v6.8b, v31.8b + tbx v5.8b, {v0.16b, v1.16b, v2.16b}, v27.8b // left[base+1] + tbx v6.8b, {v0.16b, v1.16b, v2.16b}, v28.8b // left[base+2] + + umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) + umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac + umull v17.8h, v5.8b, v25.8b + umlal v17.8h, v6.8b, v24.8b + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + st1 {v16.8b}, [x0], x1 + subs w4, w4, #2 + st1 {v17.8b}, [x0], x1 + b.le 9f + + mov v4.8b, v6.8b + uqadd v27.8b, v27.8b, v21.8b // base += 2 + uqadd v28.8b, v28.8b, v21.8b // base += 2 + b 1b + +9: + ret + +160: + AARCH64_VALID_JUMP_TARGET + dup v28.8h, w5 // dy + + shl v29.8h, v28.8h, #3 // 8*dy + mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy + movi v23.16b, #0x3e + + // This is only executed if we've checked that max_base_y <= 64. 
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] + add v28.8h, v28.8h, v30.8h // ypos + + movi v22.16b, #64 + movi v20.16b, #1 + movi v21.16b, #2 + + add v29.8h, v28.8h, v29.8h // ypos + 8*dy + + xtn v24.8b, v28.8h // (uint8_t)ypos + xtn2 v24.16b, v29.8h + uqshrn v26.8b, v28.8h, #6 // base + uqshrn2 v26.16b, v29.8h, #6 + and v24.16b, v24.16b, v23.16b // frac + + mov v4.16b, v31.16b + uqadd v27.16b, v26.16b, v20.16b // base + 1 + uqadd v28.16b, v26.16b, v21.16b // base + 2 + sub v25.16b, v22.16b, v24.16b // 64 - frac + + tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base] +1: + mov v5.16b, v31.16b + mov v6.16b, v31.16b + tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1] + tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.16b // left[base+2] + + umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) + umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac + umull2 v17.8h, v4.16b, v25.16b + umlal2 v17.8h, v5.16b, v24.16b + umull v18.8h, v5.8b, v25.8b + umlal v18.8h, v6.8b, v24.8b + umull2 v19.8h, v5.16b, v25.16b + umlal2 v19.8h, v6.16b, v24.16b + rshrn v16.8b, v16.8h, #6 + rshrn2 v16.16b, v17.8h, #6 + rshrn v17.8b, v18.8h, #6 + rshrn2 v17.16b, v19.8h, #6 + st1 {v16.16b}, [x0], x1 + subs w4, w4, #2 + st1 {v17.16b}, [x0], x1 + b.le 9f + + mov v4.16b, v6.16b + uqadd v27.16b, v27.16b, v21.16b // base += 2 + uqadd v28.16b, v28.16b, v21.16b // base += 2 + b 1b + +9: + ret +320: +640: + AARCH64_VALID_JUMP_TARGET + dup v28.8h, w5 // dy + mov w12, w3 + + add x13, x0, x1 + + shl v29.8h, v28.8h, #3 // 8*dy + mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy + movi v23.16b, #0x3e + + lsl x1, x1, #1 + sub x1, x1, w3, uxtw + add v30.8h, v28.8h, v30.8h // ypos + + // This is only executed if we've checked that max_base_y <= 64. 
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] + + movi v22.16b, #64 + movi v20.16b, #1 + movi v21.16b, #2 + +1: + mov v26.16b, v30.16b // reset ypos + +2: + add v27.8h, v26.8h, v29.8h // ypos + 8*dy + uqshrn v16.8b, v26.8h, #6 // base + uqshrn2 v16.16b, v27.8h, #6 + xtn v24.8b, v26.8h // (uint8_t)ypos + xtn2 v24.16b, v27.8h + umov w14, v16.b[0] + and v24.16b, v24.16b, v23.16b // frac + + uqadd v17.16b, v16.16b, v20.16b // base + 1 + cmp w14, w6 // base >= max_base_y + uqadd v18.16b, v16.16b, v21.16b // base + 2 + sub v25.16b, v22.16b, v24.16b // 64 - frac + + b.ge 4f + + mov v4.16b, v31.16b + mov v5.16b, v31.16b + mov v6.16b, v31.16b + tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v16.16b // left[base] + tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v17.16b // left[base+1] + tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v18.16b // left[base+2] + + subs w3, w3, #16 + umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) + umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac + umull2 v17.8h, v4.16b, v25.16b + umlal2 v17.8h, v5.16b, v24.16b + umull v18.8h, v5.8b, v25.8b + umlal v18.8h, v6.8b, v24.8b + umull2 v19.8h, v5.16b, v25.16b + umlal2 v19.8h, v6.16b, v24.16b + rshrn v16.8b, v16.8h, #6 + rshrn2 v16.16b, v17.8h, #6 + rshrn v17.8b, v18.8h, #6 + rshrn2 v17.16b, v19.8h, #6 + st1 {v16.16b}, [x0], #16 + st1 {v17.16b}, [x13], #16 + b.le 3f + add v26.8h, v27.8h, v29.8h // ypos += 16*dy + b 2b + +3: + subs w4, w4, #2 + b.le 9f + movi v16.8h, #128 + add x0, x0, x1 + add x13, x13, x1 + add v30.8h, v30.8h, v16.8h // ypos = dy + y*(1<<6)*2 + mov w3, w12 + b 1b + +4: + subs w3, w3, #16 + st1 {v31.16b}, [x0], #16 + st1 {v31.16b}, [x13], #16 + b.gt 4b + b 3b + +9: + ret + +L(ipred_z3_fill1_large_h16): + // Fallback case for max_base_y > 64; similar to the z1 + // implementation. This does the filtering vertically, filling out + // a 2x pixel column at a time. 
+ mov w15, #64 + add x13, x0, x1 + lsl x1, x1, #1 + + mov w12, w4 +1: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // ypos += dy + cmp w8, w6 // base >= max_base_y + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + add x8, x2, w8, uxtw + add x10, x2, w10, uxtw + dup v4.16b, w9 // frac + dup v5.16b, w11 + ld1 {v0.16b, v1.16b}, [x8], #32 // left[base] + ld1 {v2.16b, v3.16b}, [x10], #32 + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v6.16b, w9 // 64 - frac + dup v7.16b, w11 + add w7, w7, w5 // ypos += dy +2: + ext v16.16b, v0.16b, v1.16b, #1 // left[base+1] + ext v17.16b, v2.16b, v3.16b, #1 + subs w4, w4, #16 + umull v18.8h, v16.8b, v4.8b // left[base+1]*frac + umlal v18.8h, v0.8b, v6.8b // + left[base]*(64-frac) + umull2 v19.8h, v16.16b, v4.16b + umlal2 v19.8h, v0.16b, v6.16b + umull v20.8h, v17.8b, v5.8b + umlal v20.8h, v2.8b, v7.8b + umull2 v21.8h, v17.16b, v5.16b + umlal2 v21.8h, v2.16b, v7.16b + rshrn v16.8b, v18.8h, #6 + rshrn2 v16.16b, v19.8h, #6 + rshrn v17.8b, v20.8h, #6 + rshrn2 v17.16b, v21.8h, #6 + zip1 v18.16b, v16.16b, v17.16b + zip2 v19.16b, v16.16b, v17.16b + st1 {v18.h}[0], [x0], x1 + st1 {v18.h}[1], [x13], x1 + st1 {v18.h}[2], [x0], x1 + st1 {v18.h}[3], [x13], x1 + st1 {v18.h}[4], [x0], x1 + st1 {v18.h}[5], [x13], x1 + st1 {v18.h}[6], [x0], x1 + st1 {v18.h}[7], [x13], x1 + st1 {v19.h}[0], [x0], x1 + st1 {v19.h}[1], [x13], x1 + st1 {v19.h}[2], [x0], x1 + st1 {v19.h}[3], [x13], x1 + st1 {v19.h}[4], [x0], x1 + st1 {v19.h}[5], [x13], x1 + st1 {v19.h}[6], [x0], x1 + st1 {v19.h}[7], [x13], x1 + b.le 3f + mov v0.16b, v1.16b + ld1 {v1.16b}, [x8], #16 // left[base] + mov v2.16b, v3.16b + ld1 {v3.16b}, [x10], #16 + b 2b + +3: + subs w3, w3, #2 + b.le 9f + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + lsl x1, x1, #1 + add x0, x0, #2 + add x13, x13, #2 + mov w4, w12 + b 1b +9: + ret + +L(ipred_z3_fill1_tbl): + .hword L(ipred_z3_fill1_tbl) - 640b + 
.hword L(ipred_z3_fill1_tbl) - 320b + .hword L(ipred_z3_fill1_tbl) - 160b + .hword L(ipred_z3_fill1_tbl) - 80b + .hword L(ipred_z3_fill1_tbl) - 40b +endfunc + +function ipred_z3_fill_padding_neon, export=0 + cmp w3, #16 + adr x8, L(ipred_z3_fill_padding_tbl) + b.gt L(ipred_z3_fill_padding_wide) + // w3 = remaining width, w4 = constant height + mov w12, w4 + +1: + // Fill a WxH rectangle with padding. W can be any number; + // this fills the exact width by filling in the largest + // power of two in the remaining width, and repeating. + clz w9, w3 + sub w9, w9, #25 + ldrh w9, [x8, w9, uxtw #1] + sub x9, x8, w9, uxtw + br x9 + +2: + AARCH64_VALID_JUMP_TARGET + st1 {v31.h}[0], [x0], x1 + subs w4, w4, #4 + st1 {v31.h}[0], [x13], x1 + st1 {v31.h}[0], [x0], x1 + st1 {v31.h}[0], [x13], x1 + b.gt 2b + subs w3, w3, #2 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f + lsl x1, x1, #1 + add x0, x0, #2 + add x13, x13, #2 + mov w4, w12 + b 1b + +4: + AARCH64_VALID_JUMP_TARGET + st1 {v31.s}[0], [x0], x1 + subs w4, w4, #4 + st1 {v31.s}[0], [x13], x1 + st1 {v31.s}[0], [x0], x1 + st1 {v31.s}[0], [x13], x1 + b.gt 4b + subs w3, w3, #4 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f + lsl x1, x1, #1 + add x0, x0, #4 + add x13, x13, #4 + mov w4, w12 + b 1b + +8: + AARCH64_VALID_JUMP_TARGET + st1 {v31.8b}, [x0], x1 + subs w4, w4, #4 + st1 {v31.8b}, [x13], x1 + st1 {v31.8b}, [x0], x1 + st1 {v31.8b}, [x13], x1 + b.gt 8b // next 4 rows, still 8 bytes wide + subs w3, w3, #8 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f + lsl x1, x1, #1 + add x0, x0, #8 + add x13, x13, #8 + mov w4, w12 + b 1b + +16: +32: +64: + AARCH64_VALID_JUMP_TARGET + st1 {v31.16b}, [x0], x1 + subs w4, w4, #4 + st1 {v31.16b}, [x13], x1 + st1 {v31.16b}, [x0], x1 + st1 {v31.16b}, [x13], x1 + b.gt 16b // next 4 rows, still 16 bytes wide + subs w3, w3, #16 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f +
lsl x1, x1, #1 + add x0, x0, #16 + add x13, x13, #16 + mov w4, w12 + b 1b + +9: + ret + +L(ipred_z3_fill_padding_tbl): + .hword L(ipred_z3_fill_padding_tbl) - 64b + .hword L(ipred_z3_fill_padding_tbl) - 32b + .hword L(ipred_z3_fill_padding_tbl) - 16b + .hword L(ipred_z3_fill_padding_tbl) - 8b + .hword L(ipred_z3_fill_padding_tbl) - 4b + .hword L(ipred_z3_fill_padding_tbl) - 2b + +L(ipred_z3_fill_padding_wide): + // Fill a WxH rectangle with padding, with W > 16. + lsr x1, x1, #1 + mov w12, w3 + sub x1, x1, w3, uxtw +1: + ands w5, w3, #15 + b.eq 2f + // If the width isn't aligned to 16, first do one 16 byte write + // and align the start pointer. + sub w3, w3, w5 + st1 {v31.16b}, [x0] + add x0, x0, w5, uxtw +2: + // Fill the rest of the line with aligned 16 byte writes. + subs w3, w3, #16 + st1 {v31.16b}, [x0], #16 + b.gt 2b + subs w4, w4, #1 + add x0, x0, x1 + b.le 9f + mov w3, w12 + b 1b +9: + ret +endfunc + +function ipred_z3_fill2_8bpc_neon, export=1 + cmp w3, #8 + add x10, x2, w6, uxtw // left[max_base_y] + movrel x11, increments + ld1r {v31.16b}, [x10] // padding + ld1 {v30.8h}, [x11] // increments + b.eq 80f + +40: // w == 4 + dup v29.4h, w5 // dy + + mul v30.4h, v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy + movi v23.16b, #0x3e + + // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16, + // so max_base_y <= 32.
+ ld1 {v0.16b, v1.16b}, [x2] // left[] + add v30.4h, v29.4h, v30.4h // ypos + + movi v22.16b, #64 + movi v20.16b, #1 + movi v21.16b, #2 + + xtn v24.8b, v30.8h // (uint8_t)ypos + uqshrn v26.8b, v30.8h, #6 // base + and v24.8b, v24.8b, v23.8b // frac + + uqadd v27.8b, v26.8b, v20.8b // base + 1 + uqadd v28.8b, v26.8b, v21.8b // base + 2 + sub v25.8b, v22.8b, v24.8b // 64 - frac + uqadd v29.8b, v27.8b, v21.8b // base + 3 + + trn1 v24.2s, v24.2s, v24.2s // frac + trn1 v26.2s, v26.2s, v28.2s // base + 0, base + 2 + trn1 v27.2s, v27.2s, v29.2s // base + 1, base + 3 + trn1 v25.2s, v25.2s, v25.2s // 64 - frac + + movi v21.16b, #4 +1: + mov v4.8b, v31.8b + mov v5.8b, v31.8b + tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base], left[base+2] + tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+3] + + umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) + umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac + rshrn v16.8b, v16.8h, #6 + st1 {v16.s}[0], [x0], x1 + subs w4, w4, #2 + st1 {v16.s}[1], [x0], x1 + b.le 9f + + uqadd v26.8b, v26.8b, v21.8b // base += 4 + uqadd v27.8b, v27.8b, v21.8b // base += 4 + b 1b + +9: + ret + +80: // w == 8 + dup v29.8h, w5 // dy + + mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy + movi v23.16b, #0x3e + + // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16, + // so max_base_y <= 32. 
+ ld1 {v0.16b, v1.16b}, [x2] // left[] + add v30.8h, v29.8h, v30.8h // ypos + + movi v22.16b, #64 + movi v20.16b, #1 + movi v21.16b, #2 + + xtn v24.8b, v30.8h // (uint8_t)ypos + uqshrn v26.8b, v30.8h, #6 // base + and v24.8b, v24.8b, v23.8b // frac + + uqadd v27.8b, v26.8b, v20.8b // base + 1 + uqadd v28.8b, v26.8b, v21.8b // base + 2 + sub v25.8b, v22.8b, v24.8b // 64 - frac + uqadd v29.8b, v27.8b, v21.8b // base + 3 + + trn1 v24.2d, v24.2d, v24.2d // frac + trn1 v26.2d, v26.2d, v28.2d // base + 0, base + 2 + trn1 v27.2d, v27.2d, v29.2d // base + 1, base + 3 + trn1 v25.2d, v25.2d, v25.2d // 64 - frac + + movi v21.16b, #4 +1: + mov v4.16b, v31.16b + mov v5.16b, v31.16b + tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base], left[base+2] + tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1], left[base+3] + + umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) + umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac + umull2 v17.8h, v4.16b, v25.16b + umlal2 v17.8h, v5.16b, v24.16b + rshrn v16.8b, v16.8h, #6 + rshrn v17.8b, v17.8h, #6 + st1 {v16.8b}, [x0], x1 + subs w4, w4, #2 + st1 {v17.8b}, [x0], x1 + b.le 9f + + uqadd v26.16b, v26.16b, v21.16b // base += 4 + uqadd v27.16b, v27.16b, v21.16b // base += 4 + b 1b + +9: + ret +endfunc + + // void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int filt_idx, @@ -1541,23 +3925,26 @@ L(ipred_filter_tbl): endfunc // void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride, -// const uint16_t *const pal, const uint8_t *idx, +// const pixel *const pal, const uint8_t *idx, // const int w, const int h); function pal_pred_8bpc_neon, export=1 - ld1 {v0.8h}, [x2] + ld1 {v0.8b}, [x2] clz w9, w4 adr x6, L(pal_pred_tbl) sub w9, w9, #25 + movi v31.16b, #7 ldrh w9, [x6, w9, uxtw #1] - xtn v0.8b, v0.8h sub x6, x6, w9, uxtw add x2, x0, x1 lsl x1, x1, #1 br x6 4: AARCH64_VALID_JUMP_TARGET - ld1 {v1.16b}, [x3], #16 + ld1 
{v1.8b}, [x3], #8 subs w5, w5, #4 + ushr v3.8b, v1.8b, #4 + and v2.8b, v1.8b, v31.8b + zip1 v1.16b, v2.16b, v3.16b tbl v1.16b, {v0.16b}, v1.16b st1 {v1.s}[0], [x0], x1 st1 {v1.s}[1], [x2], x1 @@ -1567,8 +3954,12 @@ function pal_pred_8bpc_neon, export=1 ret 8: AARCH64_VALID_JUMP_TARGET - ld1 {v1.16b, v2.16b}, [x3], #32 + ld1 {v1.16b}, [x3], #16 subs w5, w5, #4 + ushr v4.16b, v1.16b, #4 + and v3.16b, v1.16b, v31.16b + zip1 v1.16b, v3.16b, v4.16b + zip2 v2.16b, v3.16b, v4.16b tbl v1.16b, {v0.16b}, v1.16b st1 {v1.d}[0], [x0], x1 tbl v2.16b, {v0.16b}, v2.16b @@ -1579,9 +3970,17 @@ function pal_pred_8bpc_neon, export=1 ret 16: AARCH64_VALID_JUMP_TARGET - ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64 + ld1 {v1.16b, v2.16b}, [x3], #32 subs w5, w5, #4 + ushr v5.16b, v1.16b, #4 + and v4.16b, v1.16b, v31.16b + ushr v7.16b, v2.16b, #4 + and v6.16b, v2.16b, v31.16b + zip1 v1.16b, v4.16b, v5.16b + zip2 v2.16b, v4.16b, v5.16b + zip1 v3.16b, v6.16b, v7.16b tbl v1.16b, {v0.16b}, v1.16b + zip2 v4.16b, v6.16b, v7.16b tbl v2.16b, {v0.16b}, v2.16b st1 {v1.16b}, [x0], x1 tbl v3.16b, {v0.16b}, v3.16b @@ -1594,10 +3993,25 @@ function pal_pred_8bpc_neon, export=1 32: AARCH64_VALID_JUMP_TARGET ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 - ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64 subs w5, w5, #4 + ushr v21.16b, v16.16b, #4 + and v20.16b, v16.16b, v31.16b + ushr v23.16b, v17.16b, #4 + and v22.16b, v17.16b, v31.16b + ushr v25.16b, v18.16b, #4 + and v24.16b, v18.16b, v31.16b + ushr v27.16b, v19.16b, #4 + and v26.16b, v19.16b, v31.16b + zip1 v16.16b, v20.16b, v21.16b + zip2 v17.16b, v20.16b, v21.16b + zip1 v18.16b, v22.16b, v23.16b + zip2 v19.16b, v22.16b, v23.16b + zip1 v20.16b, v24.16b, v25.16b + zip2 v21.16b, v24.16b, v25.16b tbl v16.16b, {v0.16b}, v16.16b + zip1 v22.16b, v26.16b, v27.16b tbl v17.16b, {v0.16b}, v17.16b + zip2 v23.16b, v26.16b, v27.16b tbl v18.16b, {v0.16b}, v18.16b tbl v19.16b, {v0.16b}, v19.16b tbl v20.16b, {v0.16b}, v20.16b @@ -1613,10 +4027,25 @@ 
function pal_pred_8bpc_neon, export=1 64: AARCH64_VALID_JUMP_TARGET ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 - ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64 subs w5, w5, #2 + ushr v21.16b, v16.16b, #4 + and v20.16b, v16.16b, v31.16b + ushr v23.16b, v17.16b, #4 + and v22.16b, v17.16b, v31.16b + ushr v25.16b, v18.16b, #4 + and v24.16b, v18.16b, v31.16b + ushr v27.16b, v19.16b, #4 + and v26.16b, v19.16b, v31.16b + zip1 v16.16b, v20.16b, v21.16b + zip2 v17.16b, v20.16b, v21.16b + zip1 v18.16b, v22.16b, v23.16b + zip2 v19.16b, v22.16b, v23.16b + zip1 v20.16b, v24.16b, v25.16b + zip2 v21.16b, v24.16b, v25.16b tbl v16.16b, {v0.16b}, v16.16b + zip1 v22.16b, v26.16b, v27.16b tbl v17.16b, {v0.16b}, v17.16b + zip2 v23.16b, v26.16b, v27.16b tbl v18.16b, {v0.16b}, v18.16b tbl v19.16b, {v0.16b}, v19.16b st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1 diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/ipred16.S b/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/ipred16.S index 6a2866167..3f8cff986 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/ipred16.S +++ b/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/ipred16.S @@ -1405,6 +1405,2471 @@ L(ipred_smooth_h_tbl): .hword L(ipred_smooth_h_tbl) - 40b endfunc +const padding_mask_buf + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +padding_mask: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +endconst + +// void ipred_z1_upsample_edge_16bpc_neon(pixel *out, 
const int hsz, +// const pixel *const in, const int end, +// const int bitdepth_max); +function ipred_z1_upsample_edge_16bpc_neon, export=1 + dup v30.8h, w4 // bitdepth_max + movrel x4, padding_mask + ld1 {v0.8h, v1.8h}, [x2] // in[] + add x5, x2, w3, uxtw #1 // in[end] + sub x4, x4, w3, uxtw #1 + + ld1r {v2.8h}, [x5] // padding + ld1 {v3.8h, v4.8h}, [x4] // padding_mask + + movi v31.8h, #9 + + bit v0.16b, v2.16b, v3.16b // padded in[] + bit v1.16b, v2.16b, v4.16b + + ext v4.16b, v0.16b, v1.16b, #2 + ext v5.16b, v1.16b, v2.16b, #2 + ext v6.16b, v0.16b, v1.16b, #4 + ext v7.16b, v1.16b, v2.16b, #4 + ext v16.16b, v0.16b, v1.16b, #6 + ext v17.16b, v1.16b, v2.16b, #6 + + add v18.8h, v4.8h, v6.8h // in[i+1] + in[i+2] + add v19.8h, v5.8h, v7.8h + add v20.8h, v0.8h, v16.8h + add v21.8h, v1.8h, v17.8h + umull v22.4s, v18.4h, v31.4h // 9*(in[i+1] + in[i+2]) + umull2 v23.4s, v18.8h, v31.8h + umull v24.4s, v19.4h, v31.4h + umull2 v25.4s, v19.8h, v31.8h + usubw v22.4s, v22.4s, v20.4h + usubw2 v23.4s, v23.4s, v20.8h + usubw v24.4s, v24.4s, v21.4h + usubw2 v25.4s, v25.4s, v21.8h + + sqrshrun v16.4h, v22.4s, #4 + sqrshrun2 v16.8h, v23.4s, #4 + sqrshrun v17.4h, v24.4s, #4 + sqrshrun2 v17.8h, v25.4s, #4 + + smin v16.8h, v16.8h, v30.8h + smin v17.8h, v17.8h, v30.8h + + zip1 v0.8h, v4.8h, v16.8h + zip2 v1.8h, v4.8h, v16.8h + zip1 v2.8h, v5.8h, v17.8h + zip2 v3.8h, v5.8h, v17.8h + + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] + + ret +endfunc + +// void ipred_z2_upsample_edge_16bpc_neon(pixel *out, const int sz, +// const pixel *const in, +// const int bitdepth_max); +function ipred_z2_upsample_edge_16bpc_neon, export=1 + dup v30.8h, w3 // bitdepth_max + // Here, sz is 4 or 8, and we produce 2*sz+1 output elements. 
+ movrel x4, padding_mask + ld1 {v0.8h, v1.8h}, [x2] // in[] + add x5, x2, w1, uxtw #1 // in[sz] + sub x4, x4, w1, uxtw #1 + + ld1r {v3.8h}, [x2] // in[0] for padding + ld1r {v2.8h}, [x5] // padding + ld1 {v4.8h, v5.8h}, [x4] // padding_mask + + movi v31.8h, #9 + + bit v0.16b, v2.16b, v4.16b // padded in[] + bit v1.16b, v2.16b, v5.16b + + ext v4.16b, v3.16b, v0.16b, #14 + ext v5.16b, v0.16b, v1.16b, #2 + ext v6.16b, v0.16b, v1.16b, #4 + + add v16.8h, v0.8h, v5.8h // in[i+0] + in[i+1] + add v17.8h, v4.8h, v6.8h // in[i-1] + in[i+2] + umull v18.4s, v16.4h, v31.4h // 9*(in[i+1] + in[i+2]) + umull2 v19.4s, v16.8h, v31.8h + usubw v18.4s, v18.4s, v17.4h + usubw2 v19.4s, v19.4s, v17.8h + + sqrshrun v16.4h, v18.4s, #4 + sqrshrun2 v16.8h, v19.4s, #4 + + add x5, x0, #2*16 + + smin v16.8h, v16.8h, v30.8h + + zip1 v4.8h, v0.8h, v16.8h + zip2 v5.8h, v0.8h, v16.8h + + st1 {v2.h}[0], [x5] + // In case sz=8, output one single pixel in out[16]. + st1 {v4.8h, v5.8h}, [x0] + + ret +endfunc + +const edge_filter + .short 0, 4, 8, 0 + .short 0, 5, 6, 0 +// Leaving out the coeffs for strength=3 +// .byte 2, 4, 4, 0 +endconst + +// void ipred_z1_filter_edge_16bpc_neon(pixel *out, const int sz, +// const pixel *const in, const int end, +// const int strength); +function ipred_z1_filter_edge_16bpc_neon, export=1 + cmp w4, #3 + b.eq L(fivetap) // if (strength == 3) goto fivetap + + movrel x5, edge_filter, -6 + add x5, x5, w4, uxtw #3 // edge_filter + 2*((strength - 1)*4 + 1) + + ld1 {v31.s}[0], [x5] // kernel[1-2] + + ld1 {v0.8h}, [x2], #16 + + dup v30.8h, v31.h[0] + dup v31.8h, v31.h[1] +1: + // in[end], is the last valid pixel. We produce 16 pixels out by + // using 18 pixels in - the last pixel used is [17] of the ones + // read/buffered. 
+ cmp w3, #17 + ld1 {v1.8h, v2.8h}, [x2], #32 + b.lt 2f + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v1.16b, v2.16b, #2 + ext v5.16b, v0.16b, v1.16b, #4 + ext v6.16b, v1.16b, v2.16b, #4 + mul v16.8h, v0.8h, v30.8h + mla v16.8h, v3.8h, v31.8h + mla v16.8h, v5.8h, v30.8h + mul v17.8h, v1.8h, v30.8h + mla v17.8h, v4.8h, v31.8h + mla v17.8h, v6.8h, v30.8h + subs w1, w1, #16 + mov v0.16b, v2.16b + urshr v16.8h, v16.8h, #4 + urshr v17.8h, v17.8h, #4 + sub w3, w3, #16 + st1 {v16.8h, v17.8h}, [x0], #32 + b.gt 1b + ret +2: + // Right padding + + // x2[w3-24] is the padding pixel (x2 points 24 pixels ahead) + movrel x5, padding_mask + sub w6, w3, #24 + sub x5, x5, w3, uxtw #1 + add x6, x2, w6, sxtw #1 + + ld1 {v3.8h, v4.8h}, [x5] // padding_mask + + ld1r {v2.8h}, [x6] + bit v0.16b, v2.16b, v3.16b // Pad v0-v1 + bit v1.16b, v2.16b, v4.16b + + // Filter one block + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v1.16b, v2.16b, #2 + ext v5.16b, v0.16b, v1.16b, #4 + ext v6.16b, v1.16b, v2.16b, #4 + mul v16.8h, v0.8h, v30.8h + mla v16.8h, v3.8h, v31.8h + mla v16.8h, v5.8h, v30.8h + mul v17.8h, v1.8h, v30.8h + mla v17.8h, v4.8h, v31.8h + mla v17.8h, v6.8h, v30.8h + subs w1, w1, #16 + urshr v16.8h, v16.8h, #4 + urshr v17.8h, v17.8h, #4 + st1 {v16.8h, v17.8h}, [x0], #32 + b.le 9f +5: + // After one block, any remaining output would only be filtering + // padding - thus just store the padding. + subs w1, w1, #16 + st1 {v2.16b}, [x0], #16 + b.gt 5b +9: + ret + +L(fivetap): + sub x2, x2, #2 // topleft -= 1 pixel + movi v29.8h, #2 + ld1 {v0.8h}, [x2], #16 + movi v30.8h, #4 + movi v31.8h, #4 + ins v0.h[0], v0.h[1] +1: + // in[end+1], is the last valid pixel. We produce 16 pixels out by + // using 20 pixels in - the last pixel used is [19] of the ones + // read/buffered. 
+ cmp w3, #18 + ld1 {v1.8h, v2.8h}, [x2], #32 + b.lt 2f // if (end + 1 < 19) + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v1.16b, v2.16b, #2 + ext v5.16b, v0.16b, v1.16b, #4 + ext v6.16b, v1.16b, v2.16b, #4 + ext v16.16b, v0.16b, v1.16b, #6 + ext v17.16b, v1.16b, v2.16b, #6 + ext v18.16b, v0.16b, v1.16b, #8 + ext v19.16b, v1.16b, v2.16b, #8 + mul v20.8h, v0.8h, v29.8h + mla v20.8h, v3.8h, v30.8h + mla v20.8h, v5.8h, v31.8h + mla v20.8h, v16.8h, v30.8h + mla v20.8h, v18.8h, v29.8h + mul v21.8h, v1.8h, v29.8h + mla v21.8h, v4.8h, v30.8h + mla v21.8h, v6.8h, v31.8h + mla v21.8h, v17.8h, v30.8h + mla v21.8h, v19.8h, v29.8h + subs w1, w1, #16 + mov v0.16b, v2.16b + urshr v20.8h, v20.8h, #4 + urshr v21.8h, v21.8h, #4 + sub w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + b.gt 1b + ret +2: + // Right padding + + // x2[w3+1-24] is the padding pixel (x2 points 24 pixels ahead) + movrel x5, padding_mask, -2 + sub w6, w3, #23 + sub x5, x5, w3, uxtw #1 + add x6, x2, w6, sxtw #1 + + ld1 {v3.8h, v4.8h, v5.8h}, [x5] // padding_mask + + ld1r {v28.8h}, [x6] + bit v0.16b, v28.16b, v3.16b // Pad v0-v2 + bit v1.16b, v28.16b, v4.16b + bit v2.16b, v28.16b, v5.16b +4: + // Filter one block + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v1.16b, v2.16b, #2 + ext v5.16b, v0.16b, v1.16b, #4 + ext v6.16b, v1.16b, v2.16b, #4 + ext v16.16b, v0.16b, v1.16b, #6 + ext v17.16b, v1.16b, v2.16b, #6 + ext v18.16b, v0.16b, v1.16b, #8 + ext v19.16b, v1.16b, v2.16b, #8 + mul v20.8h, v0.8h, v29.8h + mla v20.8h, v3.8h, v30.8h + mla v20.8h, v5.8h, v31.8h + mla v20.8h, v16.8h, v30.8h + mla v20.8h, v18.8h, v29.8h + mul v21.8h, v1.8h, v29.8h + mla v21.8h, v4.8h, v30.8h + mla v21.8h, v6.8h, v31.8h + mla v21.8h, v17.8h, v30.8h + mla v21.8h, v19.8h, v29.8h + subs w1, w1, #16 + mov v0.16b, v2.16b + mov v1.16b, v28.16b + mov v2.16b, v28.16b + urshr v20.8h, v20.8h, #4 + urshr v21.8h, v21.8h, #4 + sub w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + b.le 9f + // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we 
need to + // filter properly once more - aka (w3 >= 0). + cmp w3, #0 + b.ge 4b +5: + // When w3 <= 0, all remaining pixels in v0-v1 are equal to the + // last valid pixel - thus just output that without filtering. + subs w1, w1, #8 + st1 {v28.8h}, [x0], #16 + b.gt 5b +9: + ret +endfunc + +// void ipred_pixel_set_16bpc_neon(pixel *out, const pixel px, +// const int n); +function ipred_pixel_set_16bpc_neon, export=1 + dup v0.8h, w1 +1: + subs w2, w2, #8 + st1 {v0.8h}, [x0], #16 + b.gt 1b + ret +endfunc + +// void ipred_z1_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const top, +// const int width, const int height, +// const int dx, const int max_base_x); +function ipred_z1_fill1_16bpc_neon, export=1 + clz w9, w3 + adr x8, L(ipred_z1_fill1_tbl) + sub w9, w9, #25 + ldrh w9, [x8, w9, uxtw #1] + add x10, x2, w6, uxtw #1 // top[max_base_x] + sub x8, x8, w9, uxtw + ld1r {v31.8h}, [x10] // padding + mov w7, w5 + mov w15, #64 + br x8 +40: + AARCH64_VALID_JUMP_TARGET +4: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 49f + lsl w8, w8, #1 + lsl w10, w10, #1 + ldr q0, [x2, w8, uxtw] // top[base] + ldr q2, [x2, w10, uxtw] + dup v4.4h, w9 // frac + dup v5.4h, w11 + ext v1.16b, v0.16b, v0.16b, #2 // top[base+1] + ext v3.16b, v2.16b, v2.16b, #2 + sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] + sub v7.4h, v3.4h, v2.4h + ushll v16.4s, v0.4h, #6 // top[base]*64 + ushll v17.4s, v2.4h, #6 + smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac + smlal v17.4s, v7.4h, v5.4h + rshrn v16.4h, v16.4s, #6 + rshrn v17.4h, v17.4s, #6 + st1 {v16.4h}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.4h}, [x0], x1 + b.gt 4b + ret + +49: + st1 {v31.4h}, [x0], x1 + subs w4, w4, #2 + st1 {v31.4h}, [x0], x1 + b.gt 49b + ret + +80: + AARCH64_VALID_JUMP_TARGET +8: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, 
w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 89f + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v4.8h, w9 // frac + dup v5.8h, w11 + ld1 {v0.8h}, [x8] // top[base] + ld1 {v2.8h}, [x10] + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + ldr h1, [x8, #16] + ldr h3, [x10, #16] + dup v6.8h, w9 // 64 - frac + dup v7.8h, w11 + ext v1.16b, v0.16b, v1.16b, #2 // top[base+1] + ext v3.16b, v2.16b, v3.16b, #2 + umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) + umlal v16.4s, v1.4h, v4.4h // + top[base+1]*frac + umull2 v17.4s, v0.8h, v6.8h + umlal2 v17.4s, v1.8h, v4.8h + umull v18.4s, v2.4h, v7.4h + umlal v18.4s, v3.4h, v5.4h + umull2 v19.4s, v2.8h, v7.8h + umlal2 v19.4s, v3.8h, v5.8h + rshrn v16.4h, v16.4s, #6 + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + st1 {v16.8h}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.8h}, [x0], x1 + b.gt 8b + ret + +89: + st1 {v31.8h}, [x0], x1 + subs w4, w4, #2 + st1 {v31.8h}, [x0], x1 + b.gt 89b + ret + +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + + mov w12, w3 + + add x13, x0, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw #1 +1: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 169f + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v6.8h, w9 // frac + dup v7.8h, w11 + ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // top[base] + ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48 + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v16.8h, w9 // 64 - frac + dup v17.8h, w11 + add w7, w7, w5 // xpos += dx +2: + ext v18.16b, v0.16b, v1.16b, #2 // top[base+1] + ext v19.16b, v1.16b, v2.16b, #2 + ext v20.16b, v3.16b, v4.16b, #2 + ext v21.16b, v4.16b, v5.16b, #2 + subs w3, w3, #16 + umull v22.4s, v0.4h, v16.4h // top[base]*(64-frac) + umlal v22.4s, v18.4h, v6.4h // + top[base+1]*frac + 
umull2 v23.4s, v0.8h, v16.8h + umlal2 v23.4s, v18.8h, v6.8h + umull v24.4s, v1.4h, v16.4h + umlal v24.4s, v19.4h, v6.4h + umull2 v25.4s, v1.8h, v16.8h + umlal2 v25.4s, v19.8h, v6.8h + umull v26.4s, v3.4h, v17.4h + umlal v26.4s, v20.4h, v7.4h + umull2 v27.4s, v3.8h, v17.8h + umlal2 v27.4s, v20.8h, v7.8h + umull v28.4s, v4.4h, v17.4h + umlal v28.4s, v21.4h, v7.4h + umull2 v29.4s, v4.8h, v17.8h + umlal2 v29.4s, v21.8h, v7.8h + rshrn v22.4h, v22.4s, #6 + rshrn2 v22.8h, v23.4s, #6 + rshrn v23.4h, v24.4s, #6 + rshrn2 v23.8h, v25.4s, #6 + rshrn v24.4h, v26.4s, #6 + rshrn2 v24.8h, v27.4s, #6 + rshrn v25.4h, v28.4s, #6 + rshrn2 v25.8h, v29.4s, #6 + st1 {v22.8h, v23.8h}, [x0], #32 + st1 {v24.8h, v25.8h}, [x13], #32 + b.le 3f + mov v0.16b, v2.16b + ld1 {v1.8h, v2.8h}, [x8], #32 // top[base] + mov v3.16b, v5.16b + ld1 {v4.8h, v5.8h}, [x10], #32 + b 2b + +3: + subs w4, w4, #2 + b.le 9f + add x0, x0, x1 + add x13, x13, x1 + mov w3, w12 + b 1b +9: + ret + +169: + st1 {v31.8h}, [x0], #16 + subs w3, w3, #8 + st1 {v31.8h}, [x13], #16 + b.gt 169b + subs w4, w4, #2 + b.le 9b + add x0, x0, x1 + add x13, x13, x1 + mov w3, w12 + b 169b + +L(ipred_z1_fill1_tbl): + .hword L(ipred_z1_fill1_tbl) - 640b + .hword L(ipred_z1_fill1_tbl) - 320b + .hword L(ipred_z1_fill1_tbl) - 160b + .hword L(ipred_z1_fill1_tbl) - 80b + .hword L(ipred_z1_fill1_tbl) - 40b +endfunc + +function ipred_z1_fill2_16bpc_neon, export=1 + cmp w3, #8 + add x10, x2, w6, uxtw #1 // top[max_base_x] (16 bit pixels) + ld1r {v31.8h}, [x10] // padding + mov w7, w5 + mov w15, #64 + b.eq 8f + +4: // w == 4 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 49f + lsl w8, w8, #1 + lsl w10, w10, #1 + ldr q0, [x2, w8, uxtw] // top[base] + ldr q2, [x2, w10, uxtw] + dup v4.4h, w9 // frac + dup v5.4h, w11 + uzp2 v1.8h, v0.8h, v0.8h // top[base+1] + uzp1 v0.8h, v0.8h, v0.8h // top[base] + uzp2 v3.8h, v2.8h, v2.8h + uzp1 v2.8h,
v2.8h, v2.8h + sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] + sub v7.4h, v3.4h, v2.4h + ushll v16.4s, v0.4h, #6 // top[base]*64 + ushll v17.4s, v2.4h, #6 + smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac + smlal v17.4s, v7.4h, v5.4h + rshrn v16.4h, v16.4s, #6 + rshrn v17.4h, v17.4s, #6 + st1 {v16.4h}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.4h}, [x0], x1 + b.gt 4b + ret + +49: + st1 {v31.4h}, [x0], x1 + subs w4, w4, #2 + st1 {v31.4h}, [x0], x1 + b.gt 49b + ret + +8: // w == 8 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 89f + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v4.8h, w9 // frac + dup v5.8h, w11 + ld1 {v0.8h, v1.8h}, [x8] // top[base] + ld1 {v2.8h, v3.8h}, [x10] + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v6.8h, w9 // 64 - frac + dup v7.8h, w11 + uzp2 v20.8h, v0.8h, v1.8h // top[base+1] + uzp1 v0.8h, v0.8h, v1.8h // top[base] + uzp2 v21.8h, v2.8h, v3.8h + uzp1 v2.8h, v2.8h, v3.8h + umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) + umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac + umull2 v17.4s, v0.8h, v6.8h + umlal2 v17.4s, v20.8h, v4.8h + umull v18.4s, v2.4h, v7.4h + umlal v18.4s, v21.4h, v5.4h + umull2 v19.4s, v2.8h, v7.8h + umlal2 v19.4s, v21.8h, v5.8h + rshrn v16.4h, v16.4s, #6 + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + st1 {v16.8h}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.8h}, [x0], x1 + b.gt 8b + ret + +89: + st1 {v31.8h}, [x0], x1 + subs w4, w4, #2 + st1 {v31.8h}, [x0], x1 + b.gt 89b + ret +endfunc + +// void ipred_reverse_16bpc_neon(pixel *dst, const pixel *const src, +// const int n); +function ipred_reverse_16bpc_neon, export=1 + sub x1, x1, #16 + add x3, x0, #8 + mov x4, #16 +1: + ld1 {v0.8h}, [x1] + subs w2, w2, #8 + rev64 v0.8h, v0.8h + sub x1, x1, #16 + st1 {v0.d}[1], [x0], x4
+ st1 {v0.d}[0], [x3], x4 + b.gt 1b + ret +endfunc + +const increments + .short 0, 1, 2, 3, 4, 5, 6, 7 +endconst + +// void ipred_z2_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const top, +// const pixel *const left, +// const int width, const int height, +// const int dx, const int dy); +function ipred_z2_fill1_16bpc_neon, export=1 + clz w10, w4 + adr x9, L(ipred_z2_fill1_tbl) + sub w10, w10, #25 + ldrh w10, [x9, w10, uxtw #1] + mov w8, #(1 << 6) // xpos = 1 << 6 + sub x9, x9, w10, uxtw + sub w8, w8, w6 // xpos -= dx + + movrel x11, increments + ld1 {v31.8h}, [x11] // increments + neg w7, w7 // -dy + + br x9 +40: + AARCH64_VALID_JUMP_TARGET + + dup v30.4h, w7 // -dy + movi v17.8b, #1 + + mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy + movi v25.8h, #0x3e + add v30.4h, v16.4h, v30.4h // -= dy + + // Worst case height for w=4 is 16, but we need at least h+1 elements + ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[] + + movi v26.8h, #64 + movi v19.16b, #4 + + shrn v29.8b, v30.8h, #6 // ypos >> 6 + and v27.8b, v30.8b, v25.8b // frac_y + + add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 + + movi v23.4h, #1, lsl #8 + shl v29.8b, v29.8b, #1 // 2*base_y + zip1 v29.8b, v29.8b, v29.8b // duplicate elements + movi v17.8b, #2 + add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ... 
+ + add v30.8b, v29.8b, v17.8b // base_y + 1 (*2) + add v28.8b, v29.8b, v19.8b // base_y + 2 (*2) + + tbl v18.8b, {v0.16b}, v29.8b // left[base_y] + + trn1 v30.2d, v30.2d, v28.2d // base_y + 1, base_y + 2 + + sub v28.4h, v26.4h, v27.4h // 64 - frac_y + + trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3} + + trn1 v27.2d, v27.2d, v27.2d // frac_y + trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y + + movi v29.16b, #4 +4: + asr w9, w8, #6 // base_x + dup v16.4h, w8 // xpos + sub w8, w8, w6 // xpos -= dx + cmp w9, #-4 // base_x <= -4 + asr w11, w8, #6 // base_x + b.le 49f + + lsl w9, w9, #1 + lsl w11, w11, #1 + + dup v17.4h, w8 // xpos + + ldr q4, [x2, w9, sxtw] // top[base_x] + ldr q6, [x2, w11, sxtw] + + trn1 v16.2d, v16.2d, v17.2d // xpos + + // Cut corners here; only doing tbl over v0-v1 here; we only + // seem to need the last pixel, from v2, after skipping to the + // left-only codepath below. + tbl v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2] + + sshr v20.8h, v16.8h, #6 // first base_x for each row + + ext v5.16b, v4.16b, v4.16b, #2 // top[base_x+1] + ext v7.16b, v6.16b, v6.16b, #2 + + and v16.16b, v16.16b, v25.16b // frac_x + + trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1] + + trn1 v4.2d, v4.2d, v6.2d // top[base_x] + trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] + + sub v17.8h, v26.8h, v16.8h // 64 - frac_x + + add v20.8h, v20.8h, v31.8h // actual base_x + + umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) + umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y + umull2 v22.4s, v18.8h, v28.8h + umlal2 v22.4s, v19.8h, v27.8h + + umull v23.4s, v4.4h, v17.4h // top[base_x]-*(64-frac_x) + umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x + umull2 v24.4s, v4.8h, v17.8h + umlal2 v24.4s, v5.8h, v16.8h + + cmge v20.8h, v20.8h, #0 + + rshrn v21.4h, v21.4s, #6 + rshrn2 v21.8h, v22.4s, #6 + rshrn v22.4h, v23.4s, #6 + rshrn2 v22.8h, v24.4s, #6 + + bit v21.16b, v22.16b, v20.16b + + st1 {v21.d}[0], [x0], x1 + sub w8, w8, w6 // xpos 
-= dx + subs w5, w5, #2 + st1 {v21.d}[1], [x0], x1 + b.le 9f + + ext v18.16b, v19.16b, v19.16b, #8 + add v30.16b, v30.16b, v29.16b // base_y += 2 (*2) + b 4b + +49: + tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+2] + + trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1] + + umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) + umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y + umull2 v21.4s, v18.8h, v28.8h + umlal2 v21.4s, v19.8h, v27.8h + + rshrn v20.4h, v20.4s, #6 + rshrn2 v20.8h, v21.4s, #6 + + st1 {v20.d}[0], [x0], x1 + subs w5, w5, #2 + st1 {v20.d}[1], [x0], x1 + b.le 9f + + ext v18.16b, v19.16b, v19.16b, #8 + add v30.16b, v30.16b, v29.16b // base_y += 2 (*2) + b 49b + +9: + ret + +80: + AARCH64_VALID_JUMP_TARGET + + stp d8, d9, [sp, #-0x40]! + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + + dup v18.8h, w7 // -dy + add x3, x3, #2 // Skip past left[0] + + mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy + movi v25.8h, #0x3e + add v16.8h, v16.8h, v18.8h // -= dy + + // Worst case height for w=8 is 32. + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[] + ld1r {v15.8h}, [x2] // left[0] == top[0] + + movi v26.8h, #64 + movi v19.16b, #4 + + shrn v29.8b, v16.8h, #6 // ypos >> 6 + and v27.16b, v16.16b, v25.16b // frac_y + + movi v23.8h, #1, lsl #8 + shl v29.8b, v29.8b, #1 // 2*base_y + mov v18.16b, v15.16b // left[0] + zip1 v29.16b, v29.16b, v29.16b // duplicate elements + movi v17.16b, #2 + add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ... + + // Cut corners here; for the first row we don't expect to need to + // read outside of v0. 
+ tbx v18.16b, {v0.16b}, v29.16b // left[base_y] + + add v30.16b, v29.16b, v19.16b // base_y + 2 (*2) + add v29.16b, v29.16b, v17.16b // base_y + 1 (*2) + + sub v28.8h, v26.8h, v27.8h // 64 - frac_y + + movi v24.16b, #4 +8: + asr w9, w8, #6 // base_x + dup v16.8h, w8 // xpos + sub w8, w8, w6 // xpos -= dx + cmp w9, #-16 // base_x <= -16 + asr w11, w8, #6 // base_x + b.le 89f + + dup v17.8h, w8 // xpos + + add x9, x2, w9, sxtw #1 + add x11, x2, w11, sxtw #1 + + ld1 {v4.8h, v5.8h}, [x9] // top[base_x] + mov v19.16b, v15.16b // left[0] + ld1 {v6.8h, v7.8h}, [x11] + + tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] + + mov v20.16b, v15.16b // left[0] + + sshr v21.8h, v16.8h, #6 // first base_x + sshr v22.8h, v17.8h, #6 + + tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] + + ext v5.16b, v4.16b, v5.16b, #2 // top[base_x+1] + ext v7.16b, v6.16b, v7.16b, #2 + + and v16.16b, v16.16b, v25.16b // frac_x + and v17.16b, v17.16b, v25.16b + + umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) + umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y + + sub v8.8h, v26.8h, v16.8h // 64 - frac_x + sub v9.8h, v26.8h, v17.8h + + umull2 v11.4s, v18.8h, v28.8h + umlal2 v11.4s, v19.8h, v27.8h + + add v21.8h, v21.8h, v31.8h // actual base_x + add v22.8h, v22.8h, v31.8h + + umull v12.4s, v19.4h, v28.4h + umlal v12.4s, v20.4h, v27.4h + umull2 v13.4s, v19.8h, v28.8h + umlal2 v13.4s, v20.8h, v27.8h + + rshrn v10.4h, v10.4s, #6 + rshrn2 v10.8h, v11.4s, #6 + rshrn v11.4h, v12.4s, #6 + rshrn2 v11.8h, v13.4s, #6 + + umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) + umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x + umull2 v13.4s, v4.8h, v8.8h + umlal2 v13.4s, v5.8h, v16.8h + umull v14.4s, v6.4h, v9.4h + umlal v14.4s, v7.4h, v17.4h + umull2 v18.4s, v6.8h, v9.8h + umlal2 v18.4s, v7.8h, v17.8h + + cmge v21.8h, v21.8h, #0 + cmge v22.8h, v22.8h, #0 + + rshrn v12.4h, v12.4s, #6 + rshrn2 v12.8h, v13.4s, #6 + rshrn v13.4h, v14.4s, #6 + 
rshrn2 v13.8h, v18.4s, #6 + + bit v10.16b, v12.16b, v21.16b + bit v11.16b, v13.16b, v22.16b + + st1 {v10.8h}, [x0], x1 + subs w5, w5, #2 + sub w8, w8, w6 // xpos -= dx + st1 {v11.8h}, [x0], x1 + b.le 9f + + mov v18.16b, v20.16b + add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) + add v30.16b, v30.16b, v24.16b // base_y += 2 (*2) + b 8b + +89: + mov v19.16b, v15.16b + mov v20.16b, v15.16b + tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] + tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] + + umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) + umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y + umull2 v5.4s, v18.8h, v28.8h + umlal2 v5.4s, v19.8h, v27.8h + umull v6.4s, v19.4h, v28.4h + umlal v6.4s, v20.4h, v27.4h + umull2 v7.4s, v19.8h, v28.8h + umlal2 v7.4s, v20.8h, v27.8h + + rshrn v4.4h, v4.4s, #6 + rshrn2 v4.8h, v5.4s, #6 + rshrn v5.4h, v6.4s, #6 + rshrn2 v5.8h, v7.4s, #6 + + st1 {v4.8h}, [x0], x1 + subs w5, w5, #2 + st1 {v5.8h}, [x0], x1 + b.le 9f + + mov v18.16b, v20.16b + add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) + add v30.16b, v30.16b, v24.16b // base_y += 2 (*2) + b 89b + +9: + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + ret + +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + + stp d8, d9, [sp, #-0x40]! + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + + dup v25.8h, w7 // -dy + add x3, x3, #2 // Skip past left[0] + + add x13, x0, x1 // alternating row + lsl x1, x1, #1 // stride *= 2 + sub x1, x1, w4, uxtw #1 // stride -= width + + movi v11.8h, #8 + mul v26.8h, v31.8h, v25.8h // {0,1,2,3,4,5,6,7}* -dy + add v26.8h, v26.8h, v25.8h // -= dy + mul v25.8h, v25.8h, v11.8h // -8*dy + + // Worst case height is 64, but we can only fit 32 pixels into + // v0-v3 usable within one tbx instruction. As long as base_y is + // up to 32, we use tbx. 
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[] + ld1r {v15.8h}, [x2] // left[0] == top[0] + + mov w12, w4 // orig w + neg w14, w4 // -w + +1: + mov v23.16b, v26.16b // reset ypos + + asr w9, w8, #6 // base_x + dup v16.8h, w8 // xpos + sub w8, w8, w6 // xpos -= dx + cmp w9, w14 // base_x <= -2*w + asr w11, w8, #6 // base_x + b.le 169f + + dup v17.8h, w8 // xpos + sub w8, w8, w6 // xpos -= dx + + add x9, x2, w9, sxtw #1 + add x11, x2, w11, sxtw #1 + + sshr v21.8h, v16.8h, #6 // first base_x + sshr v22.8h, v17.8h, #6 + + ld1 {v4.8h}, [x9], #16 // top[base_x] + ld1 {v6.8h}, [x11], #16 + + movi v10.8h, #0x3e + movi v11.8h, #64 + + and v16.16b, v16.16b, v10.16b // frac_x + and v17.16b, v17.16b, v10.16b + + sub v8.8h, v11.8h, v16.8h // 64 - frac_x + sub v9.8h, v11.8h, v17.8h + + add v21.8h, v21.8h, v31.8h // actual base_x + add v22.8h, v22.8h, v31.8h + +2: + smov w10, v22.h[0] + + shrn v29.8b, v23.8h, #6 // ypos >> 6 + movi v12.8h, #64 + cmp w10, #0 // base_x (bottom left) >= 0 + smov w10, v29.b[0] // base_y[0] + movi v10.8h, #0x3e + + b.ge 4f + and v27.16b, v23.16b, v10.16b // frac_y + cmp w10, #(32-3) + + mov v18.16b, v15.16b // left[0] + sub v28.8h, v12.8h, v27.8h // 64 - frac_y + b.gt 22f + +21: + // base_y < 32, using tbx + shl v29.8b, v29.8b, #1 // 2*base_y + movi v11.8h, #1, lsl #8 + zip1 v29.16b, v29.16b, v29.16b // duplicate elements + add v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ... + + movi v13.16b, #2 + + tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] + + add v29.16b, v29.16b, v13.16b // base_y + 1 (*2) + mov v19.16b, v15.16b // left[0] + + tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] + + add v29.16b, v29.16b, v13.16b // base_y + 2 (*2) + mov v20.16b, v15.16b // left[0] + + tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] + + b 23f + +22: + // base_y >= 32, using separate loads. 
+ smov w15, v29.b[1] + smov w16, v29.b[2] + add x10, x3, w10, sxtw #1 + smov w17, v29.b[3] + add x15, x3, w15, sxtw #1 + ld3 {v18.h, v19.h, v20.h}[0], [x10] + smov w10, v29.b[4] + add x16, x3, w16, sxtw #1 + ld3 {v18.h, v19.h, v20.h}[1], [x15] + smov w15, v29.b[5] + add x17, x3, w17, sxtw #1 + ld3 {v18.h, v19.h, v20.h}[2], [x16] + smov w16, v29.b[6] + add x10, x3, w10, sxtw #1 + ld3 {v18.h, v19.h, v20.h}[3], [x17] + smov w17, v29.b[7] + add x15, x3, w15, sxtw #1 + add x16, x3, w16, sxtw #1 + ld3 {v18.h, v19.h, v20.h}[4], [x10] + add x17, x3, w17, sxtw #1 + ld3 {v18.h, v19.h, v20.h}[5], [x15] + ld3 {v18.h, v19.h, v20.h}[6], [x16] + ld3 {v18.h, v19.h, v20.h}[7], [x17] + +23: + + ld1 {v5.8h}, [x9], #16 // top[base_x] + ld1 {v7.8h}, [x11], #16 + + add v23.8h, v23.8h, v25.8h // ypos -= 8*dy + + umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) + umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y + umull2 v11.4s, v18.8h, v28.8h + umlal2 v11.4s, v19.8h, v27.8h + umull v12.4s, v19.4h, v28.4h + umlal v12.4s, v20.4h, v27.4h + umull2 v13.4s, v19.8h, v28.8h + umlal2 v13.4s, v20.8h, v27.8h + + ext v18.16b, v4.16b, v5.16b, #2 // top[base_x+1] + ext v19.16b, v6.16b, v7.16b, #2 + + rshrn v10.4h, v10.4s, #6 + rshrn2 v10.8h, v11.4s, #6 + rshrn v11.4h, v12.4s, #6 + rshrn2 v11.8h, v13.4s, #6 + + umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) + umlal v12.4s, v18.4h, v16.4h // + top[base_x+1]*frac_x + umull2 v13.4s, v4.8h, v8.8h + umlal2 v13.4s, v18.8h, v16.8h + umull v14.4s, v6.4h, v9.4h + umlal v14.4s, v19.4h, v17.4h + umull2 v20.4s, v6.8h, v9.8h + umlal2 v20.4s, v19.8h, v17.8h + + cmge v18.8h, v21.8h, #0 + cmge v19.8h, v22.8h, #0 + + rshrn v12.4h, v12.4s, #6 + rshrn2 v12.8h, v13.4s, #6 + rshrn v13.4h, v14.4s, #6 + rshrn2 v13.8h, v20.4s, #6 + + bit v10.16b, v12.16b, v18.16b + bit v11.16b, v13.16b, v19.16b + + st1 {v10.8h}, [x0], #16 + subs w4, w4, #8 + st1 {v11.8h}, [x13], #16 + b.le 3f + + movi v10.8h, #8 + mov v4.16b, v5.16b + mov v6.16b, v7.16b + add v21.8h, 
v21.8h, v10.8h // base_x += 8 + add v22.8h, v22.8h, v10.8h + b 2b + +3: + subs w5, w5, #2 + b.le 9f + movi v10.8h, #128 + add x0, x0, x1 + add x13, x13, x1 + mov w4, w12 // reset w + add v26.8h, v26.8h, v10.8h // ypos += 2*(1<<6) + b 1b + +4: // The rest of the row only predicted from top[] + ld1 {v5.8h}, [x9], #16 // top[base_x] + ld1 {v7.8h}, [x11], #16 + + ext v18.16b, v4.16b, v5.16b, #2 // top[base_x+1] + ext v19.16b, v6.16b, v7.16b, #2 + + umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) + umlal v12.4s, v18.4h, v16.4h // + top[base_x+1]*frac_x + umull2 v13.4s, v4.8h, v8.8h + umlal2 v13.4s, v18.8h, v16.8h + umull v14.4s, v6.4h, v9.4h + umlal v14.4s, v19.4h, v17.4h + umull2 v20.4s, v6.8h, v9.8h + umlal2 v20.4s, v19.8h, v17.8h + + rshrn v12.4h, v12.4s, #6 + rshrn2 v12.8h, v13.4s, #6 + rshrn v13.4h, v14.4s, #6 + rshrn2 v13.8h, v20.4s, #6 + + st1 {v12.8h}, [x0], #16 + subs w4, w4, #8 + st1 {v13.8h}, [x13], #16 + b.le 3b + + mov v4.16b, v5.16b + mov v6.16b, v7.16b + b 4b + +169: // The rest of the block only predicted from left[] + add x1, x1, w4, uxtw #1 // restore stride + mov w12, w5 // orig remaining h +1: + movi v12.8h, #64 + movi v10.8h, #0x3e + + shrn v29.8b, v23.8h, #6 // ypos >> 6 + and v27.16b, v23.16b, v10.16b // frac_y + + smov w10, v29.b[0] // base_y[0] + + shl v29.8b, v29.8b, #1 // 2*base_y + movi v11.8h, #1, lsl #8 + zip1 v29.16b, v29.16b, v29.16b // duplicate elements + add v23.8h, v23.8h, v25.8h // ypos -= 8*dy + add v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ... + + cmp w10, #(32-1) + + mov v18.16b, v15.16b // left[0] + movi v21.16b, #2 + + sub v28.8h, v12.8h, v27.8h // 64 - frac_y + + b.gt 31f + + tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] + add v29.16b, v29.16b, v21.16b // base_y + 1 (*2) + +2: + // base_y < 32, using tbx. 
+ smov w10, v29.b[0] // base_y[0] + mov v19.16b, v15.16b // left[0] + cmp w10, #(64-4) + b.gt 32f + tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] + add v29.16b, v29.16b, v21.16b // base_y + 2 (*2) + mov v20.16b, v15.16b // left[0] + tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] + add v29.16b, v29.16b, v21.16b // next base_y + + umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) + umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y + umull2 v11.4s, v18.8h, v28.8h + umlal2 v11.4s, v19.8h, v27.8h + umull v12.4s, v19.4h, v28.4h + umlal v12.4s, v20.4h, v27.4h + umull2 v13.4s, v19.8h, v28.8h + umlal2 v13.4s, v20.8h, v27.8h + + rshrn v10.4h, v10.4s, #6 + rshrn2 v10.8h, v11.4s, #6 + rshrn v11.4h, v12.4s, #6 + rshrn2 v11.8h, v13.4s, #6 + + st1 {v10.8h}, [x0], x1 + subs w5, w5, #2 + st1 {v11.8h}, [x13], x1 + b.le 4f + mov v18.16b, v20.16b + b 2b + +31: // base_y >= 32, using separate loads, loading v18 if we had to bail + // in the prologue. + smov w10, v29.b[0] + smov w15, v29.b[2] + movi v21.16b, #2 + smov w16, v29.b[4] + add x10, x3, w10, sxtw + smov w17, v29.b[6] + add x15, x3, w15, sxtw + ld1 {v18.h}[0], [x10] + smov w10, v29.b[8] + add x16, x3, w16, sxtw + ld1 {v18.h}[1], [x15] + smov w15, v29.b[10] + add x17, x3, w17, sxtw + ld1 {v18.h}[2], [x16] + smov w16, v29.b[12] + add x10, x3, w10, sxtw + ld1 {v18.h}[3], [x17] + smov w17, v29.b[14] + add x15, x3, w15, sxtw + add x16, x3, w16, sxtw + ld1 {v18.h}[4], [x10] + add x17, x3, w17, sxtw + ld1 {v18.h}[5], [x15] + add v29.16b, v29.16b, v21.16b // next base_y + ld1 {v18.h}[6], [x16] + ld1 {v18.h}[7], [x17] + +32: // base_y >= 32, using separate loads. + cmp w5, #4 + b.lt 34f +33: // h >= 4, preserving v18 from the previous round, loading v19-v22. 
+ smov w10, v29.b[0] + subs w5, w5, #4 + smov w15, v29.b[2] + movi v10.16b, #8 + smov w16, v29.b[4] + add x10, x3, w10, sxtw + smov w17, v29.b[6] + add x15, x3, w15, sxtw + ld4 {v19.h, v20.h, v21.h, v22.h}[0], [x10] + smov w10, v29.b[8] + add x16, x3, w16, sxtw + ld4 {v19.h, v20.h, v21.h, v22.h}[1], [x15] + smov w15, v29.b[10] + add x17, x3, w17, sxtw + ld4 {v19.h, v20.h, v21.h, v22.h}[2], [x16] + smov w16, v29.b[12] + add x10, x3, w10, sxtw + ld4 {v19.h, v20.h, v21.h, v22.h}[3], [x17] + smov w17, v29.b[14] + add x15, x3, w15, sxtw + add x16, x3, w16, sxtw + ld4 {v19.h, v20.h, v21.h, v22.h}[4], [x10] + add x17, x3, w17, sxtw + ld4 {v19.h, v20.h, v21.h, v22.h}[5], [x15] + ld4 {v19.h, v20.h, v21.h, v22.h}[6], [x16] + add v29.16b, v29.16b, v10.16b // next base_y + ld4 {v19.h, v20.h, v21.h, v22.h}[7], [x17] + + umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) + umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y + umull2 v11.4s, v18.8h, v28.8h + umlal2 v11.4s, v19.8h, v27.8h + umull v12.4s, v19.4h, v28.4h + umlal v12.4s, v20.4h, v27.4h + umull2 v13.4s, v19.8h, v28.8h + umlal2 v13.4s, v20.8h, v27.8h + + rshrn v10.4h, v10.4s, #6 + rshrn2 v10.8h, v11.4s, #6 + rshrn v11.4h, v12.4s, #6 + rshrn2 v11.8h, v13.4s, #6 + + umull v12.4s, v20.4h, v28.4h // left[base_y]*(64-frac_y) + umlal v12.4s, v21.4h, v27.4h // + left[base_y+1]*frac_y + umull2 v13.4s, v20.8h, v28.8h + umlal2 v13.4s, v21.8h, v27.8h + umull v14.4s, v21.4h, v28.4h + umlal v14.4s, v22.4h, v27.4h + umull2 v18.4s, v21.8h, v28.8h + umlal2 v18.4s, v22.8h, v27.8h + + rshrn v12.4h, v12.4s, #6 + rshrn2 v12.8h, v13.4s, #6 + rshrn v13.4h, v14.4s, #6 + rshrn2 v13.8h, v18.4s, #6 + + st1 {v10.8h}, [x0], x1 + cmp w5, #2 + st1 {v11.8h}, [x13], x1 + st1 {v12.8h}, [x0], x1 + st1 {v13.8h}, [x13], x1 + b.lt 4f + mov v18.16b, v22.16b + b.gt 33b + +34: // h == 2, preserving v18 from the previous round, loading v19-v20. 
+ smov w10, v29.b[0] + smov w15, v29.b[2] + movi v21.16b, #4 + smov w16, v29.b[4] + add x10, x3, w10, sxtw + smov w17, v29.b[6] + add x15, x3, w15, sxtw + ld2 {v19.h, v20.h}[0], [x10] + smov w10, v29.b[8] + add x16, x3, w16, sxtw + ld2 {v19.h, v20.h}[1], [x15] + smov w15, v29.b[10] + add x17, x3, w17, sxtw + ld2 {v19.h, v20.h}[2], [x16] + smov w16, v29.b[12] + add x10, x3, w10, sxtw + ld2 {v19.h, v20.h}[3], [x17] + smov w17, v29.b[14] + add x15, x3, w15, sxtw + add x16, x3, w16, sxtw + ld2 {v19.h, v20.h}[4], [x10] + add x17, x3, w17, sxtw + ld2 {v19.h, v20.h}[5], [x15] + ld2 {v19.h, v20.h}[6], [x16] + add v29.16b, v29.16b, v21.16b // next base_y + ld2 {v19.h, v20.h}[7], [x17] + + umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) + umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y + umull2 v11.4s, v18.8h, v28.8h + umlal2 v11.4s, v19.8h, v27.8h + umull v12.4s, v19.4h, v28.4h + umlal v12.4s, v20.4h, v27.4h + umull2 v13.4s, v19.8h, v28.8h + umlal2 v13.4s, v20.8h, v27.8h + + rshrn v10.4h, v10.4s, #6 + rshrn2 v10.8h, v11.4s, #6 + rshrn v11.4h, v12.4s, #6 + rshrn2 v11.8h, v13.4s, #6 + + st1 {v10.8h}, [x0], x1 + st1 {v11.8h}, [x13], x1 + // The h==2 case only happens once at the end, if at all. 
+ +4: + subs w4, w4, #8 + b.le 9f + + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + lsl x1, x1, #1 + add x0, x0, #16 + add x13, x13, #16 + mov w5, w12 // reset h + b 1b + +9: + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + ret + +L(ipred_z2_fill1_tbl): + .hword L(ipred_z2_fill1_tbl) - 640b + .hword L(ipred_z2_fill1_tbl) - 320b + .hword L(ipred_z2_fill1_tbl) - 160b + .hword L(ipred_z2_fill1_tbl) - 80b + .hword L(ipred_z2_fill1_tbl) - 40b +endfunc + +function ipred_z2_fill2_16bpc_neon, export=1 + cmp w4, #8 + mov w8, #(2 << 6) // xpos = 2 << 6 + sub w8, w8, w6 // xpos -= dx + + movrel x11, increments + ld1 {v31.8h}, [x11] // increments + neg w7, w7 // -dy + b.eq 80f + +40: + dup v30.4h, w7 // -dy + movi v17.8b, #1 + + mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy + movi v25.8h, #0x3e + add v30.4h, v16.4h, v30.4h // -= dy + + // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements + // from left. + ld1 {v0.8h, v1.8h}, [x3] // left[] + + movi v26.8h, #64 + movi v19.16b, #4 + + shrn v29.8b, v30.8h, #6 // ypos >> 6 + and v27.8b, v30.8b, v25.8b // frac_y + + add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 + + movi v23.4h, #1, lsl #8 + shl v29.8b, v29.8b, #1 // 2*base_y + zip1 v29.8b, v29.8b, v29.8b // duplicate elements + movi v17.8b, #2 + add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ... 
+ + add v30.8b, v29.8b, v17.8b // base_y + 1 (*2) + add v28.8b, v29.8b, v19.8b // base_y + 2 (*2) + + tbl v18.8b, {v0.16b}, v29.8b // left[base_y] + + trn1 v30.2d, v30.2d, v28.2d // base_y + 1, base_y + 2 + + sub v28.4h, v26.4h, v27.4h // 64 - frac_y + + trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3} + + trn1 v27.2d, v27.2d, v27.2d // frac_y + trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y + + movi v29.16b, #4 + add v31.8h, v31.8h, v31.8h // {0,2,4,6,0,2,4,6} +4: + asr w9, w8, #6 // base_x + dup v16.4h, w8 // xpos + sub w8, w8, w6 // xpos -= dx + cmp w9, #-8 // base_x <= -8 + asr w11, w8, #6 // base_x + b.le 49f + + lsl w9, w9, #1 + lsl w11, w11, #1 + + dup v17.4h, w8 // xpos + + ldr q4, [x2, w9, sxtw] // top[base_x] + ldr q6, [x2, w11, sxtw] + + trn1 v16.2d, v16.2d, v17.2d // xpos + + tbl v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2] + + sshr v20.8h, v16.8h, #6 // first base_x for each row + + uzp2 v5.8h, v4.8h, v6.8h // top[base_x+1] + uzp1 v4.8h, v4.8h, v6.8h // top[base_x] + + and v16.16b, v16.16b, v25.16b // frac_x + + trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1] + + sub v17.8h, v26.8h, v16.8h // 64 - frac_x + + add v20.8h, v20.8h, v31.8h // actual base_x + + umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) + umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y + umull2 v22.4s, v18.8h, v28.8h + umlal2 v22.4s, v19.8h, v27.8h + + umull v23.4s, v4.4h, v17.4h // top[base_x]-*(64-frac_x) + umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x + umull2 v24.4s, v4.8h, v17.8h + umlal2 v24.4s, v5.8h, v16.8h + + cmge v20.8h, v20.8h, #0 + + rshrn v21.4h, v21.4s, #6 + rshrn2 v21.8h, v22.4s, #6 + rshrn v22.4h, v23.4s, #6 + rshrn2 v22.8h, v24.4s, #6 + + bit v21.16b, v22.16b, v20.16b + + st1 {v21.d}[0], [x0], x1 + sub w8, w8, w6 // xpos -= dx + subs w5, w5, #2 + st1 {v21.d}[1], [x0], x1 + b.le 9f + + ext v18.16b, v19.16b, v19.16b, #8 + add v30.16b, v30.16b, v29.16b // base_y += 2 (*2) + b 4b + +49: + tbl v19.16b, {v0.16b, 
v1.16b}, v30.16b // left[base_y+1], left[base_y+2] + + trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1] + + umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) + umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y + umull2 v21.4s, v18.8h, v28.8h + umlal2 v21.4s, v19.8h, v27.8h + + rshrn v20.4h, v20.4s, #6 + rshrn2 v20.8h, v21.4s, #6 + + st1 {v20.d}[0], [x0], x1 + subs w5, w5, #2 + st1 {v20.d}[1], [x0], x1 + b.le 9f + + ext v18.16b, v19.16b, v19.16b, #8 + add v30.16b, v30.16b, v29.16b // base_y += 2 (*2) + b 49b + +9: + ret + +80: + stp d8, d9, [sp, #-0x40]! + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + + dup v18.8h, w7 // -dy + movi v17.8b, #1 + + mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy + movi v25.8h, #0x3e + add v16.8h, v16.8h, v18.8h // -= dy + + // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements + // from left. + ld1 {v0.8h, v1.8h}, [x3] // left[] + + movi v26.8h, #64 + movi v19.16b, #4 + + shrn v29.8b, v16.8h, #6 // ypos >> 6 + and v27.16b, v16.16b, v25.16b // frac_y + + add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 + + movi v23.8h, #1, lsl #8 + shl v29.8b, v29.8b, #1 // 2*base_y + zip1 v29.16b, v29.16b, v29.16b // duplicate elements + movi v17.16b, #2 + add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ... + + // Cut corners here; for the first row we don't expect to need to + // read outside of v0. 
+ tbl v18.16b, {v0.16b}, v29.16b // left[base_y] + + add v30.16b, v29.16b, v19.16b // base_y + 2 (*2) + add v29.16b, v29.16b, v17.16b // base_y + 1 (*2) + + sub v28.8h, v26.8h, v27.8h // 64 - frac_y + + movi v24.16b, #4 + add v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14} +8: + asr w9, w8, #6 // base_x + dup v16.8h, w8 // xpos + sub w8, w8, w6 // xpos -= dx + cmp w9, #-16 // base_x <= -16 + asr w11, w8, #6 // base_x + b.le 89f + + dup v17.8h, w8 // xpos + + add x9, x2, w9, sxtw #1 + add x11, x2, w11, sxtw #1 + + ld1 {v4.8h, v5.8h}, [x9] // top[base_x] + ld1 {v6.8h, v7.8h}, [x11] + + tbl v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1] + + sshr v21.8h, v16.8h, #6 // first base_x + sshr v22.8h, v17.8h, #6 + + tbl v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2] + + uzp2 v2.8h, v4.8h, v5.8h // top[base_x+1] + uzp1 v4.8h, v4.8h, v5.8h // top[base_x] + uzp2 v3.8h, v6.8h, v7.8h + uzp1 v6.8h, v6.8h, v7.8h + mov v5.16b, v2.16b + mov v7.16b, v3.16b + + and v16.16b, v16.16b, v25.16b // frac_x + and v17.16b, v17.16b, v25.16b + + umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) + umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y + + sub v8.8h, v26.8h, v16.8h // 64 - frac_x + sub v9.8h, v26.8h, v17.8h + + umull2 v11.4s, v18.8h, v28.8h + umlal2 v11.4s, v19.8h, v27.8h + + add v21.8h, v21.8h, v31.8h // actual base_x + add v22.8h, v22.8h, v31.8h + + umull v12.4s, v19.4h, v28.4h + umlal v12.4s, v20.4h, v27.4h + umull2 v13.4s, v19.8h, v28.8h + umlal2 v13.4s, v20.8h, v27.8h + + rshrn v10.4h, v10.4s, #6 + rshrn2 v10.8h, v11.4s, #6 + rshrn v11.4h, v12.4s, #6 + rshrn2 v11.8h, v13.4s, #6 + + umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) + umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x + umull2 v13.4s, v4.8h, v8.8h + umlal2 v13.4s, v5.8h, v16.8h + umull v14.4s, v6.4h, v9.4h + umlal v14.4s, v7.4h, v17.4h + umull2 v18.4s, v6.8h, v9.8h + umlal2 v18.4s, v7.8h, v17.8h + + cmge v21.8h, v21.8h, #0 + cmge v22.8h, v22.8h, #0 + + rshrn 
v12.4h, v12.4s, #6 + rshrn2 v12.8h, v13.4s, #6 + rshrn v13.4h, v14.4s, #6 + rshrn2 v13.8h, v18.4s, #6 + + bit v10.16b, v12.16b, v21.16b + bit v11.16b, v13.16b, v22.16b + + st1 {v10.8h}, [x0], x1 + subs w5, w5, #2 + sub w8, w8, w6 // xpos -= dx + st1 {v11.8h}, [x0], x1 + b.le 9f + + mov v18.16b, v20.16b + add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) + add v30.16b, v30.16b, v24.16b // base_y += 2 (*2) + b 8b + +89: + tbl v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1] + tbl v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2] + + umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) + umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y + umull2 v5.4s, v18.8h, v28.8h + umlal2 v5.4s, v19.8h, v27.8h + umull v6.4s, v19.4h, v28.4h + umlal v6.4s, v20.4h, v27.4h + umull2 v7.4s, v19.8h, v28.8h + umlal2 v7.4s, v20.8h, v27.8h + + rshrn v4.4h, v4.4s, #6 + rshrn2 v4.8h, v5.4s, #6 + rshrn v5.4h, v6.4s, #6 + rshrn2 v5.8h, v7.4s, #6 + + st1 {v4.8h}, [x0], x1 + subs w5, w5, #2 + st1 {v5.8h}, [x0], x1 + b.le 9f + + mov v18.16b, v20.16b + add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) + add v30.16b, v30.16b, v24.16b // base_y += 2 (*2) + b 89b + +9: + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + ret +endfunc + +function ipred_z2_fill3_16bpc_neon, export=1 + cmp w4, #8 + mov w8, #(1 << 6) // xpos = 1 << 6 + sub w8, w8, w6 // xpos -= dx + + movrel x11, increments + ld1 {v31.8h}, [x11] // increments + neg w7, w7 // -dy + b.eq 80f + +40: + dup v30.4h, w7 // -dy + movi v17.8b, #1 + + mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy + movi v25.8h, #0x3e + add v30.4h, v16.4h, v30.4h // -= dy + + // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements. 
+ ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[] + + movi v26.8h, #64 + movi v19.16b, #2 + + shrn v29.8b, v30.8h, #6 // ypos >> 6 + and v27.8b, v30.8b, v25.8b // frac_y + + add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2 + + movi v23.4h, #1, lsl #8 + shl v29.8b, v29.8b, #1 // 2*base_y + movi v19.16b, #4 + zip1 v29.8b, v29.8b, v29.8b // duplicate elements + movi v17.8b, #2 + add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ... + + add v30.8b, v29.8b, v17.8b // base_y + 1 (*2) + add v28.8b, v29.8b, v19.8b // base_y + 2 (*2) + + trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3} + + add v24.8b, v30.8b, v19.8b // base_y + 3 (*2) + + trn1 v29.2d, v29.2d, v28.2d // base_y + 0, base_y + 2 + trn1 v30.2d, v30.2d, v24.2d // base_y + 1, base_y + 3 + + sub v28.4h, v26.4h, v27.4h // 64 - frac_y + + trn1 v27.2d, v27.2d, v27.2d // frac_y + trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y + + movi v24.16b, #8 +4: + asr w9, w8, #6 // base_x + dup v16.4h, w8 // xpos + sub w8, w8, w6 // xpos -= dx + cmp w9, #-4 // base_x <= -4 + asr w11, w8, #6 // base_x + b.le 49f + + lsl w9, w9, #1 + lsl w11, w11, #1 + + dup v17.4h, w8 // xpos + + ldr q4, [x2, w9, sxtw] // top[base_x] + ldr q6, [x2, w11, sxtw] + + trn1 v16.2d, v16.2d, v17.2d // xpos + + tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2] + tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3] + + sshr v20.8h, v16.8h, #6 // first base_x for each row + + ext v5.16b, v4.16b, v4.16b, #2 // top[base_x+1] + ext v7.16b, v6.16b, v6.16b, #2 + + and v16.16b, v16.16b, v25.16b // frac_x + + trn1 v4.2d, v4.2d, v6.2d // top[base_x] + trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] + + sub v17.8h, v26.8h, v16.8h // 64 - frac_x + + add v20.8h, v20.8h, v31.8h // actual base_x + + umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) + umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y + umull2 v22.4s, v18.8h, v28.8h + umlal2 v22.4s, v19.8h, v27.8h + + umull v23.4s, v4.4h, v17.4h // 
top[base_x]-*(64-frac_x) + umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x + umull2 v24.4s, v4.8h, v17.8h + umlal2 v24.4s, v5.8h, v16.8h + + cmge v20.8h, v20.8h, #0 + + rshrn v21.4h, v21.4s, #6 + rshrn2 v21.8h, v22.4s, #6 + rshrn v22.4h, v23.4s, #6 + rshrn2 v22.8h, v24.4s, #6 + + movi v24.16b, #8 + + bit v21.16b, v22.16b, v20.16b + + st1 {v21.d}[0], [x0], x1 + sub w8, w8, w6 // xpos -= dx + subs w5, w5, #2 + st1 {v21.d}[1], [x0], x1 + b.le 9f + + add v29.16b, v29.16b, v24.16b // base_y += 4 (*2) + add v30.16b, v30.16b, v24.16b // base_y += 4 (*2) + b 4b + +49: + tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2] + tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3] + + umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) + umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y + umull2 v21.4s, v18.8h, v28.8h + umlal2 v21.4s, v19.8h, v27.8h + + rshrn v20.4h, v20.4s, #6 + rshrn2 v20.8h, v21.4s, #6 + + st1 {v20.d}[0], [x0], x1 + subs w5, w5, #2 + st1 {v20.d}[1], [x0], x1 + b.le 9f + + add v29.16b, v29.16b, v24.16b // base_y += 4 (*2) + add v30.16b, v30.16b, v24.16b // base_y += 4 (*2) + b 49b + +9: + ret + +80: + stp d8, d9, [sp, #-0x40]! + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + + dup v18.8h, w7 // -dy + movi v17.16b, #2 + + mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy + movi v25.8h, #0x3e + add v16.8h, v16.8h, v18.8h // -= dy + + // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements. + ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[] + + movi v26.8h, #64 + movi v19.16b, #4 + + shrn v29.8b, v16.8h, #6 // ypos >> 6 + and v27.16b, v16.16b, v25.16b // frac_y + + add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 2 + + movi v23.8h, #1, lsl #8 + shl v29.8b, v29.8b, #1 // 2*base_y + mov v18.16b, v15.16b // left[0] + zip1 v29.16b, v29.16b, v29.16b // duplicate elements + add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ... 
+ + add v30.16b, v29.16b, v17.16b // base_y + 1 (*2) + + sub v28.8h, v26.8h, v27.8h // 64 - frac_y + + movi v24.16b, #4 +8: + asr w9, w8, #6 // base_x + dup v16.8h, w8 // xpos + sub w8, w8, w6 // xpos -= dx + cmp w9, #-16 // base_x <= -16 + asr w11, w8, #6 // base_x + b.le 89f + + dup v17.8h, w8 // xpos + + add x9, x2, w9, sxtw #1 + add x11, x2, w11, sxtw #1 + + ld1 {v4.8h, v5.8h}, [x9] // top[base_x] + ld1 {v6.8h, v7.8h}, [x11] + + tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0] + add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) + tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1] + add v30.16b, v30.16b, v24.16b + + sshr v22.8h, v16.8h, #6 // first base_x + tbl v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2] + sshr v23.8h, v17.8h, #6 + tbl v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3] + + ext v5.16b, v4.16b, v5.16b, #2 // top[base_x+1] + ext v7.16b, v6.16b, v7.16b, #2 + + and v16.16b, v16.16b, v25.16b // frac_x + and v17.16b, v17.16b, v25.16b + + umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) + umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y + + sub v8.8h, v26.8h, v16.8h // 64 - frac_x + sub v9.8h, v26.8h, v17.8h + + umull2 v11.4s, v18.8h, v28.8h + umlal2 v11.4s, v19.8h, v27.8h + + add v22.8h, v22.8h, v31.8h // actual base_x + add v23.8h, v23.8h, v31.8h + + umull v12.4s, v20.4h, v28.4h + umlal v12.4s, v21.4h, v27.4h + umull2 v13.4s, v20.8h, v28.8h + umlal2 v13.4s, v21.8h, v27.8h + + rshrn v10.4h, v10.4s, #6 + rshrn2 v10.8h, v11.4s, #6 + rshrn v11.4h, v12.4s, #6 + rshrn2 v11.8h, v13.4s, #6 + + umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) + umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x + umull2 v13.4s, v4.8h, v8.8h + umlal2 v13.4s, v5.8h, v16.8h + umull v14.4s, v6.4h, v9.4h + umlal v14.4s, v7.4h, v17.4h + umull2 v18.4s, v6.8h, v9.8h + umlal2 v18.4s, v7.8h, v17.8h + + cmge v22.8h, v22.8h, #0 + cmge v23.8h, v23.8h, #0 + + rshrn v12.4h, v12.4s, #6 + rshrn2 v12.8h, v13.4s, #6 
+ rshrn v13.4h, v14.4s, #6 + rshrn2 v13.8h, v18.4s, #6 + + bit v10.16b, v12.16b, v22.16b + bit v11.16b, v13.16b, v23.16b + + st1 {v10.8h}, [x0], x1 + subs w5, w5, #2 + sub w8, w8, w6 // xpos -= dx + st1 {v11.8h}, [x0], x1 + b.le 9f + + add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) + add v30.16b, v30.16b, v24.16b + b 8b + +89: + tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0] + add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) + tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1] + add v30.16b, v30.16b, v24.16b + tbl v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2] + tbl v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3] + + umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) + umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y + umull2 v5.4s, v18.8h, v28.8h + umlal2 v5.4s, v19.8h, v27.8h + umull v6.4s, v20.4h, v28.4h + umlal v6.4s, v21.4h, v27.4h + umull2 v7.4s, v20.8h, v28.8h + umlal2 v7.4s, v21.8h, v27.8h + + rshrn v4.4h, v4.4s, #6 + rshrn2 v4.8h, v5.4s, #6 + rshrn v5.4h, v6.4s, #6 + rshrn2 v5.8h, v7.4s, #6 + + st1 {v4.8h}, [x0], x1 + subs w5, w5, #2 + st1 {v5.8h}, [x0], x1 + b.le 9f + + add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) + add v30.16b, v30.16b, v24.16b + b 89b + +9: + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + ret +endfunc + +// void ipred_z3_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const left, +// const int width, const int height, +// const int dy, const int max_base_y); +function ipred_z3_fill1_16bpc_neon, export=1 + clz w9, w4 + adr x8, L(ipred_z3_fill1_tbl) + sub w9, w9, #25 + ldrh w9, [x8, w9, uxtw #1] + add x10, x2, w6, uxtw #1 // left[max_base_y] + sub x8, x8, w9, uxtw + ld1r {v31.8h}, [x10] // padding + mov w7, w5 + mov w15, #64 + add x13, x0, x1 + lsl x1, x1, #1 + br x8 + +40: + AARCH64_VALID_JUMP_TARGET +4: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // 
xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + lsl w8, w8, #1 + lsl w10, w10, #1 + ldr q0, [x2, w8, uxtw] // left[base] + ldr q2, [x2, w10, uxtw] + dup v4.8h, w9 // frac + dup v5.8h, w11 + ext v1.16b, v0.16b, v0.16b, #2 // left[base+1] + ext v3.16b, v2.16b, v2.16b, #2 + sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] + sub v7.4h, v3.4h, v2.4h + ushll v16.4s, v0.4h, #6 // top[base]*64 + ushll v17.4s, v2.4h, #6 + smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac + smlal v17.4s, v7.4h, v5.4h + rshrn v16.4h, v16.4s, #6 + rshrn v17.4h, v17.4s, #6 + subs w3, w3, #2 + zip1 v18.8h, v16.8h, v17.8h + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + add w7, w7, w5 // xpos += dx + st1 {v18.s}[2], [x0] + st1 {v18.s}[3], [x13] + b.le 9f + sub x0, x0, x1 // ptr -= 4 * (2*stride) + sub x13, x13, x1 + add x0, x0, #4 + add x13, x13, #4 + b 4b +9: + ret + +80: + AARCH64_VALID_JUMP_TARGET +8: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v4.8h, w9 // frac + dup v5.8h, w11 + ld1 {v0.8h}, [x8] // left[base] + ld1 {v2.8h}, [x10] + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + ldr h1, [x8, #16] + ldr h3, [x10, #16] + dup v6.8h, w9 // 64 - frac + dup v7.8h, w11 + ext v1.16b, v0.16b, v1.16b, #2 // left[base+1] + ext v3.16b, v2.16b, v3.16b, #2 + umull v16.4s, v0.4h, v6.4h // left[base]*(64-frac) + umlal v16.4s, v1.4h, v4.4h // + left[base+1]*frac + umull2 v17.4s, v0.8h, v6.8h + umlal2 v17.4s, v1.8h, v4.8h + umull v18.4s, v2.4h, v7.4h + umlal v18.4s, v3.4h, v5.4h + umull2 v19.4s, v2.8h, v7.8h + umlal2 v19.4s, v3.8h, v5.8h + rshrn v16.4h, v16.4s, #6 + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + subs w3, w3, #2 + zip1 v18.8h, v16.8h, 
v17.8h + zip2 v19.8h, v16.8h, v17.8h + add w7, w7, w5 // xpos += dx + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + st1 {v18.s}[2], [x0], x1 + st1 {v18.s}[3], [x13], x1 + st1 {v19.s}[0], [x0], x1 + st1 {v19.s}[1], [x13], x1 + st1 {v19.s}[2], [x0], x1 + st1 {v19.s}[3], [x13], x1 + b.le 9f + sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride) + sub x13, x13, x1, lsl #2 + add x0, x0, #4 + add x13, x13, #4 + b 8b +9: + ret + +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + mov w12, w4 +1: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // ypos += dy + cmp w8, w6 // base >= max_base_y + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v6.8h, w9 // frac + dup v7.8h, w11 + ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // left[base] + ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48 + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v16.8h, w9 // 64 - frac + dup v17.8h, w11 + add w7, w7, w5 // ypos += dy +2: + ext v18.16b, v0.16b, v1.16b, #2 // left[base+1] + ext v19.16b, v1.16b, v2.16b, #2 + ext v20.16b, v3.16b, v4.16b, #2 + ext v21.16b, v4.16b, v5.16b, #2 + subs w4, w4, #16 + umull v22.4s, v0.4h, v16.4h // left[base]*(64-frac) + umlal v22.4s, v18.4h, v6.4h // + left[base+1]*frac + umull2 v23.4s, v0.8h, v16.8h + umlal2 v23.4s, v18.8h, v6.8h + umull v24.4s, v1.4h, v16.4h + umlal v24.4s, v19.4h, v6.4h + umull2 v25.4s, v1.8h, v16.8h + umlal2 v25.4s, v19.8h, v6.8h + umull v26.4s, v3.4h, v17.4h + umlal v26.4s, v20.4h, v7.4h + umull2 v27.4s, v3.8h, v17.8h + umlal2 v27.4s, v20.8h, v7.8h + umull v28.4s, v4.4h, v17.4h + umlal v28.4s, v21.4h, v7.4h + umull2 v29.4s, v4.8h, v17.8h + umlal2 v29.4s, v21.8h, v7.8h + rshrn v22.4h, v22.4s, #6 + rshrn2 v22.8h, v23.4s, #6 + rshrn v23.4h, v24.4s, #6 + rshrn2 v23.8h, v25.4s, #6 + rshrn v24.4h, v26.4s, #6 + rshrn2 v24.8h, v27.4s, #6 + rshrn v25.4h, v28.4s, #6 + rshrn2 v25.8h, v29.4s, #6 + zip1 v18.8h, v22.8h, v24.8h + zip2 v19.8h, 
v22.8h, v24.8h + zip1 v20.8h, v23.8h, v25.8h + zip2 v21.8h, v23.8h, v25.8h + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + st1 {v18.s}[2], [x0], x1 + st1 {v18.s}[3], [x13], x1 + st1 {v19.s}[0], [x0], x1 + st1 {v19.s}[1], [x13], x1 + st1 {v19.s}[2], [x0], x1 + st1 {v19.s}[3], [x13], x1 + st1 {v20.s}[0], [x0], x1 + st1 {v20.s}[1], [x13], x1 + st1 {v20.s}[2], [x0], x1 + st1 {v20.s}[3], [x13], x1 + st1 {v21.s}[0], [x0], x1 + st1 {v21.s}[1], [x13], x1 + st1 {v21.s}[2], [x0], x1 + st1 {v21.s}[3], [x13], x1 + b.le 3f + mov v0.16b, v2.16b + ld1 {v1.8h, v2.8h}, [x8], #32 // left[base] + mov v3.16b, v5.16b + ld1 {v4.8h, v5.8h}, [x10], #32 + b 2b + +3: + subs w3, w3, #2 + b.le 9f + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + lsl x1, x1, #1 + add x0, x0, #4 + add x13, x13, #4 + mov w4, w12 + b 1b +9: + ret + +L(ipred_z3_fill1_tbl): + .hword L(ipred_z3_fill1_tbl) - 640b + .hword L(ipred_z3_fill1_tbl) - 320b + .hword L(ipred_z3_fill1_tbl) - 160b + .hword L(ipred_z3_fill1_tbl) - 80b + .hword L(ipred_z3_fill1_tbl) - 40b +endfunc + +function ipred_z3_fill_padding_neon, export=0 + cmp w3, #8 + adr x8, L(ipred_z3_fill_padding_tbl) + b.gt L(ipred_z3_fill_padding_wide) + // w3 = remaining width, w4 = constant height + mov w12, w4 + +1: + // Fill a WxH rectangle with padding. W can be any number; + // this fills the exact width by filling in the largest + // power of two in the remaining width, and repeating. 
+ clz w9, w3 + sub w9, w9, #25 + ldrh w9, [x8, w9, uxtw #1] + sub x9, x8, w9, uxtw + br x9 + +2: + AARCH64_VALID_JUMP_TARGET + st1 {v31.s}[0], [x0], x1 + subs w4, w4, #4 + st1 {v31.s}[0], [x13], x1 + st1 {v31.s}[0], [x0], x1 + st1 {v31.s}[0], [x13], x1 + b.gt 2b + subs w3, w3, #2 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f + lsl x1, x1, #1 + add x0, x0, #4 + add x13, x13, #4 + mov w4, w12 + b 1b + +4: + AARCH64_VALID_JUMP_TARGET + st1 {v31.4h}, [x0], x1 + subs w4, w4, #4 + st1 {v31.4h}, [x13], x1 + st1 {v31.4h}, [x0], x1 + st1 {v31.4h}, [x13], x1 + b.gt 4b + subs w3, w3, #4 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f + lsl x1, x1, #1 + add x0, x0, #8 + add x13, x13, #8 + mov w4, w12 + b 1b + +8: +16: +32: +64: + AARCH64_VALID_JUMP_TARGET + st1 {v31.8h}, [x0], x1 + subs w4, w4, #4 + st1 {v31.8h}, [x13], x1 + st1 {v31.8h}, [x0], x1 + st1 {v31.8h}, [x13], x1 + b.gt 4b + subs w3, w3, #8 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f + lsl x1, x1, #1 + add x0, x0, #16 + add x13, x13, #16 + mov w4, w12 + b 1b + +9: + ret + +L(ipred_z3_fill_padding_tbl): + .hword L(ipred_z3_fill_padding_tbl) - 64b + .hword L(ipred_z3_fill_padding_tbl) - 32b + .hword L(ipred_z3_fill_padding_tbl) - 16b + .hword L(ipred_z3_fill_padding_tbl) - 8b + .hword L(ipred_z3_fill_padding_tbl) - 4b + .hword L(ipred_z3_fill_padding_tbl) - 2b + +L(ipred_z3_fill_padding_wide): + // Fill a WxH rectangle with padding, with W > 8. + lsr x1, x1, #1 + mov w12, w3 + sub x1, x1, w3, uxtw #1 +1: + ands w5, w3, #7 + b.eq 2f + // If the width isn't aligned to 8, first do one 8 pixel write + // and align the start pointer. + sub w3, w3, w5 + st1 {v31.8h}, [x0] + add x0, x0, w5, uxtw #1 +2: + // Fill the rest of the line with aligned 8 pixel writes. 
+ subs w3, w3, #8 + st1 {v31.8h}, [x0], #16 + b.gt 2b + subs w4, w4, #1 + add x0, x0, x1 + b.le 9f + mov w3, w12 + b 1b +9: + ret +endfunc + +function ipred_z3_fill2_16bpc_neon, export=1 + cmp w4, #8 + add x10, x2, w6, uxtw // left[max_base_y] + ld1r {v31.16b}, [x10] // padding + mov w7, w5 + mov w15, #64 + add x13, x0, x1 + lsl x1, x1, #1 + b.eq 8f + +4: // h == 4 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + lsl w8, w8, #1 + lsl w10, w10, #1 + ldr q0, [x2, w8, uxtw] // top[base] + ldr q2, [x2, w10, uxtw] + dup v4.4h, w9 // frac + dup v5.4h, w11 + uzp2 v1.8h, v0.8h, v0.8h // top[base+1] + uzp1 v0.8h, v0.8h, v0.8h // top[base] + uzp2 v3.8h, v2.8h, v2.8h + uzp1 v2.8h, v2.8h, v2.8h + sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] + sub v7.4h, v3.4h, v2.4h + ushll v16.4s, v0.4h, #6 // top[base]*64 + ushll v17.4s, v2.4h, #6 + smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac + smlal v17.4s, v7.4h, v5.4h + rshrn v16.4h, v16.4s, #6 + rshrn v17.4h, v17.4s, #6 + subs w3, w3, #2 + zip1 v18.8h, v16.8h, v17.8h + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + add w7, w7, w5 // xpos += dx + st1 {v18.s}[2], [x0] + st1 {v18.s}[3], [x13] + b.le 9f + sub x0, x0, x1 // ptr -= 4 * (2*stride) + sub x13, x13, x1 + add x0, x0, #4 + add x13, x13, #4 + b 4b +9: + ret + +8: // h == 8 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v4.8h, w9 // frac + dup v5.8h, w11 + ld1 {v0.8h, v1.8h}, [x8] // top[base] + ld1 {v2.8h, v3.8h}, [x10] + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v6.8h, w9 // 64 - frac + dup v7.8h, w11 + uzp2 v20.8h, v0.8h, v1.8h // top[base+1] + uzp1 v0.8h, v0.8h, v1.8h // 
top[base] + uzp2 v21.8h, v2.8h, v3.8h + uzp1 v2.8h, v2.8h, v3.8h + umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) + umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac + umull2 v17.4s, v0.8h, v6.8h + umlal2 v17.4s, v20.8h, v4.8h + umull v18.4s, v2.4h, v7.4h + umlal v18.4s, v21.4h, v5.4h + umull2 v19.4s, v2.8h, v7.8h + umlal2 v19.4s, v21.8h, v5.8h + rshrn v16.4h, v16.4s, #6 + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + subs w3, w3, #2 + zip1 v18.8h, v16.8h, v17.8h + zip2 v19.8h, v16.8h, v17.8h + add w7, w7, w5 // xpos += dx + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + st1 {v18.s}[2], [x0], x1 + st1 {v18.s}[3], [x13], x1 + st1 {v19.s}[0], [x0], x1 + st1 {v19.s}[1], [x13], x1 + st1 {v19.s}[2], [x0], x1 + st1 {v19.s}[3], [x13], x1 + b.le 9f + sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride) + sub x13, x13, x1, lsl #2 + add x0, x0, #4 + add x13, x13, #4 + b 8b +9: + ret +endfunc + + // void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int filt_idx, @@ -1717,13 +4182,14 @@ function ipred_filter_16bpc_neon, export=1 endfunc // void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride, -// const uint16_t *const pal, const uint8_t *idx, +// const pixel *const pal, const uint8_t *idx, // const int w, const int h); function pal_pred_16bpc_neon, export=1 ld1 {v30.8h}, [x2] clz w9, w4 adr x6, L(pal_pred_tbl) sub w9, w9, #25 + movi v29.16b, #7 ldrh w9, [x6, w9, uxtw #1] movi v31.8h, #1, lsl #8 sub x6, x6, w9, uxtw @@ -1733,8 +4199,11 @@ function pal_pred_16bpc_neon, export=1 add x2, x0, x1 lsl x1, x1, #1 4: - ld1 {v1.16b}, [x3], #16 + ld1 {v1.8b}, [x3], #8 subs w5, w5, #4 + ushr v3.8b, v1.8b, #4 + and v2.8b, v1.8b, v29.8b + zip1 v1.16b, v2.16b, v3.16b // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ... 
add v1.16b, v1.16b, v1.16b zip1 v0.16b, v1.16b, v1.16b @@ -1754,8 +4223,12 @@ function pal_pred_16bpc_neon, export=1 add x2, x0, x1 lsl x1, x1, #1 8: - ld1 {v2.16b, v3.16b}, [x3], #32 + ld1 {v2.16b}, [x3], #16 subs w5, w5, #4 + ushr v4.16b, v2.16b, #4 + and v3.16b, v2.16b, v29.16b + zip1 v2.16b, v3.16b, v4.16b + zip2 v3.16b, v3.16b, v4.16b add v2.16b, v2.16b, v2.16b add v3.16b, v3.16b, v3.16b zip1 v0.16b, v2.16b, v2.16b @@ -1781,8 +4254,16 @@ function pal_pred_16bpc_neon, export=1 add x2, x0, x1 lsl x1, x1, #1 16: - ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 + ld1 {v4.16b, v5.16b}, [x3], #32 subs w5, w5, #4 + ushr v7.16b, v4.16b, #4 + and v6.16b, v4.16b, v29.16b + ushr v3.16b, v5.16b, #4 + and v2.16b, v5.16b, v29.16b + zip1 v4.16b, v6.16b, v7.16b + zip2 v5.16b, v6.16b, v7.16b + zip1 v6.16b, v2.16b, v3.16b + zip2 v7.16b, v2.16b, v3.16b add v4.16b, v4.16b, v4.16b add v5.16b, v5.16b, v5.16b add v6.16b, v6.16b, v6.16b @@ -1822,8 +4303,16 @@ function pal_pred_16bpc_neon, export=1 add x2, x0, x1 lsl x1, x1, #1 32: - ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 + ld1 {v4.16b, v5.16b}, [x3], #32 subs w5, w5, #2 + ushr v7.16b, v4.16b, #4 + and v6.16b, v4.16b, v29.16b + ushr v3.16b, v5.16b, #4 + and v2.16b, v5.16b, v29.16b + zip1 v4.16b, v6.16b, v7.16b + zip2 v5.16b, v6.16b, v7.16b + zip1 v6.16b, v2.16b, v3.16b + zip2 v7.16b, v2.16b, v3.16b add v4.16b, v4.16b, v4.16b add v5.16b, v5.16b, v5.16b add v6.16b, v6.16b, v6.16b @@ -1860,8 +4349,16 @@ function pal_pred_16bpc_neon, export=1 AARCH64_VALID_JUMP_TARGET add x2, x0, #64 64: - ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 + ld1 {v4.16b, v5.16b}, [x3], #32 subs w5, w5, #1 + ushr v7.16b, v4.16b, #4 + and v6.16b, v4.16b, v29.16b + ushr v3.16b, v5.16b, #4 + and v2.16b, v5.16b, v29.16b + zip1 v4.16b, v6.16b, v7.16b + zip2 v5.16b, v6.16b, v7.16b + zip1 v6.16b, v2.16b, v3.16b + zip2 v7.16b, v2.16b, v3.16b add v4.16b, v4.16b, v4.16b add v5.16b, v5.16b, v5.16b add v6.16b, v6.16b, v6.16b diff --git 
a/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/looprestoration.S b/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/looprestoration.S index a598b72b0..f8dc0df4d 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/looprestoration.S +++ b/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/looprestoration.S @@ -965,371 +965,338 @@ function wiener_filter5_hv_8bpc_neon ret endfunc -#define SUM_STRIDE (384+16) - #include "looprestoration_tmpl.S" -// void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum, -// const pixel (*left)[4], -// const pixel *src, const ptrdiff_t stride, -// const int w, const int h, -// const enum LrEdgeFlags edges); -function sgr_box3_h_8bpc_neon, export=1 - add w5, w5, #2 // w += 2 - - // Set up pointers for reading/writing alternate rows - add x10, x0, #(4*SUM_STRIDE) // sumsq - add x11, x1, #(2*SUM_STRIDE) // sum - add x12, x3, x4 // src - lsl x4, x4, #1 - mov x9, #(2*2*SUM_STRIDE) // double sum stride - - // Subtract the aligned width from the output stride. - add w13, w5, #7 - bic w13, w13, #7 - sub x9, x9, w13, uxtw #1 - - // Store the width for the vertical loop - mov w8, w5 - - // Subtract the number of pixels read from the input from the stride - add w13, w13, #8 - sub x4, x4, w13, uxtw +// void dav1d_sgr_box3_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const int w, +// const enum LrEdgeFlags edges); +function sgr_box3_row_h_8bpc_neon, export=1 + add w4, w4, #2 // w += 2 - // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w7, #1 // LR_HAVE_LEFT - b.eq 2f - // LR_HAVE_LEFT + tst w5, #1 // LR_HAVE_LEFT + b.eq 1f cbnz x2, 0f - // left == NULL - sub x3, x3, #2 - sub x12, x12, #2 - b 1f -0: // LR_HAVE_LEFT, left != NULL -2: // !LR_HAVE_LEFT, increase the stride. - // For this case we don't read the left 2 pixels from the src pointer, - // but shift it as if we had done that. 
- add x4, x4, #2 + // LR_HAVE_LEFT && left == NULL + sub x3, x3, #2 + ld1 {v0.16b}, [x3], #16 + b 2f -1: // Loop vertically - ld1 {v0.16b}, [x3], #16 - ld1 {v4.16b}, [x12], #16 - - tst w7, #1 // LR_HAVE_LEFT - b.eq 0f - cbz x2, 2f +0: // LR_HAVE_LEFT, left != NULL - ld1 {v1.s}[3], [x2], #4 - // Move x3/x12 back to account for the last 2 bytes we loaded earlier, + ld1 {v0.16b}, [x3], #16 + ld1 {v1.s}[3], [x2] + // Move x3 back to account for the last 2 bytes we loaded earlier, // which we'll shift out. sub x3, x3, #2 - sub x12, x12, #2 - ld1 {v5.s}[3], [x2], #4 ext v0.16b, v1.16b, v0.16b, #14 - ext v4.16b, v5.16b, v4.16b, #14 b 2f -0: + +1: + ld1 {v0.16b}, [x3], #16 // !LR_HAVE_LEFT, fill v1 with the leftmost byte // and shift v0 to have 2x the first byte at the front. dup v1.16b, v0.b[0] - dup v5.16b, v4.b[0] // Move x3 back to account for the last 2 bytes we loaded before, // which we shifted out. sub x3, x3, #2 - sub x12, x12, #2 ext v0.16b, v1.16b, v0.16b, #14 - ext v4.16b, v5.16b, v4.16b, #14 2: umull v1.8h, v0.8b, v0.8b umull2 v2.8h, v0.16b, v0.16b - umull v5.8h, v4.8b, v4.8b - umull2 v6.8h, v4.16b, v4.16b - tst w7, #2 // LR_HAVE_RIGHT + tst w5, #2 // LR_HAVE_RIGHT b.ne 4f // If we'll need to pad the right edge, load that byte to pad with // here since we can find it pretty easily from here. - sub w13, w5, #(2 + 16 - 2 + 1) + sub w13, w4, #(2 + 16 - 2 + 1) ldr b30, [x3, w13, sxtw] - ldr b31, [x12, w13, sxtw] - // Fill v30/v31 with the right padding pixel + // Fill v30 with the right padding pixel dup v30.16b, v30.b[0] - dup v31.16b, v31.b[0] 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #10 + cmp w4, #10 b.ge 4f // If w >= 10, all used input pixels are valid // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called // again; it's not strictly needed in those cases (we pad enough here), // but keeping the code as simple as possible. 
- // Insert padding in v0/4.b[w] onwards + // Insert padding in v0.b[w] onwards movrel x13, right_ext_mask - sub x13, x13, w5, uxtw + sub x13, x13, w4, uxtw ld1 {v29.16b}, [x13] bit v0.16b, v30.16b, v29.16b - bit v4.16b, v31.16b, v29.16b // Update the precalculated squares umull v1.8h, v0.8b, v0.8b umull2 v2.8h, v0.16b, v0.16b - umull v5.8h, v4.8b, v4.8b - umull2 v6.8h, v4.16b, v4.16b 4: // Loop horizontally ext v16.16b, v0.16b, v0.16b, #1 ext v17.16b, v0.16b, v0.16b, #2 - ext v18.16b, v4.16b, v4.16b, #1 - ext v19.16b, v4.16b, v4.16b, #2 uaddl v3.8h, v0.8b, v16.8b + ext v20.16b, v1.16b, v2.16b, #2 uaddw v3.8h, v3.8h, v17.8b - uaddl v7.8h, v4.8b, v18.8b - uaddw v7.8h, v7.8h, v19.8b - ext v20.16b, v1.16b, v2.16b, #2 ext v21.16b, v1.16b, v2.16b, #4 - ext v22.16b, v5.16b, v6.16b, #2 - ext v23.16b, v5.16b, v6.16b, #4 uaddl v26.4s, v1.4h, v20.4h uaddl2 v27.4s, v1.8h, v20.8h uaddw v26.4s, v26.4s, v21.4h uaddw2 v27.4s, v27.4s, v21.8h - uaddl v28.4s, v5.4h, v22.4h - uaddl2 v29.4s, v5.8h, v22.8h - uaddw v28.4s, v28.4s, v23.4h - uaddw2 v29.4s, v29.4s, v23.8h - - subs w5, w5, #8 + subs w4, w4, #8 st1 {v3.8h}, [x1], #16 - st1 {v7.8h}, [x11], #16 st1 {v26.4s,v27.4s}, [x0], #32 - st1 {v28.4s,v29.4s}, [x10], #32 b.le 9f - tst w7, #2 // LR_HAVE_RIGHT + tst w5, #2 // LR_HAVE_RIGHT ld1 {v3.8b}, [x3], #8 - ld1 {v7.8b}, [x12], #8 mov v1.16b, v2.16b - mov v5.16b, v6.16b ext v0.16b, v0.16b, v3.16b, #8 - ext v4.16b, v4.16b, v7.16b, #8 umull v2.8h, v3.8b, v3.8b - umull v6.8h, v7.8b, v7.8b b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. 
9: - subs w6, w6, #2 - b.le 0f - // Jump to the next row and loop horizontally - add x0, x0, x9, lsl #1 - add x10, x10, x9, lsl #1 - add x1, x1, x9 - add x11, x11, x9 - add x3, x3, x4 - add x12, x12, x4 - mov w5, w8 - b 1b -0: ret endfunc -// void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum, -// const pixel (*left)[4], -// const pixel *src, const ptrdiff_t stride, -// const int w, const int h, -// const enum LrEdgeFlags edges); -function sgr_box5_h_8bpc_neon, export=1 - add w5, w5, #2 // w += 2 - - // Set up pointers for reading/writing alternate rows - add x10, x0, #(4*SUM_STRIDE) // sumsq - add x11, x1, #(2*SUM_STRIDE) // sum - add x12, x3, x4 // src - lsl x4, x4, #1 - mov x9, #(2*2*SUM_STRIDE) // double sum stride - - // Subtract the aligned width from the output stride. - add w13, w5, #7 - bic w13, w13, #7 - sub x9, x9, w13, uxtw #1 - add w13, w13, #8 - sub x4, x4, w13, uxtw - - // Store the width for the vertical loop - mov w8, w5 +// void dav1d_sgr_box5_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const int w, +// const enum LrEdgeFlags edges); +function sgr_box5_row_h_8bpc_neon, export=1 + add w4, w4, #2 // w += 2 - // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w7, #1 // LR_HAVE_LEFT - b.eq 2f - // LR_HAVE_LEFT + tst w5, #1 // LR_HAVE_LEFT + b.eq 1f cbnz x2, 0f - // left == NULL + + // LR_HAVE_LEFT && left == NULL sub x3, x3, #3 - sub x12, x12, #3 - b 1f -0: // LR_HAVE_LEFT, left != NULL -2: // !LR_HAVE_LEFT, increase the stride. - // For this case we don't read the left 3 pixels from the src pointer, - // but shift it as if we had done that. 
- add x4, x4, #3 - -1: // Loop vertically - ld1 {v0.16b}, [x3], #16 - ld1 {v4.16b}, [x12], #16 + ld1 {v0.16b}, [x3], #16 + b 2f - tst w7, #1 // LR_HAVE_LEFT - b.eq 0f - cbz x2, 2f +0: // LR_HAVE_LEFT, left != NULL - ld1 {v1.s}[3], [x2], #4 - // Move x3/x12 back to account for the last 3 bytes we loaded earlier, + ld1 {v0.16b}, [x3], #16 + ld1 {v1.s}[3], [x2], #4 + // Move x3 back to account for the last 3 bytes we loaded earlier, // which we'll shift out. sub x3, x3, #3 - sub x12, x12, #3 - ld1 {v5.s}[3], [x2], #4 ext v0.16b, v1.16b, v0.16b, #13 - ext v4.16b, v5.16b, v4.16b, #13 b 2f -0: + +1: + ld1 {v0.16b}, [x3], #16 // !LR_HAVE_LEFT, fill v1 with the leftmost byte // and shift v0 to have 3x the first byte at the front. dup v1.16b, v0.b[0] - dup v5.16b, v4.b[0] // Move x3 back to account for the last 3 bytes we loaded before, // which we shifted out. sub x3, x3, #3 - sub x12, x12, #3 ext v0.16b, v1.16b, v0.16b, #13 - ext v4.16b, v5.16b, v4.16b, #13 2: umull v1.8h, v0.8b, v0.8b umull2 v2.8h, v0.16b, v0.16b - umull v5.8h, v4.8b, v4.8b - umull2 v6.8h, v4.16b, v4.16b - tst w7, #2 // LR_HAVE_RIGHT + tst w5, #2 // LR_HAVE_RIGHT b.ne 4f // If we'll need to pad the right edge, load that byte to pad with // here since we can find it pretty easily from here. - sub w13, w5, #(2 + 16 - 3 + 1) + sub w13, w4, #(2 + 16 - 3 + 1) ldr b30, [x3, w13, sxtw] - ldr b31, [x12, w13, sxtw] - // Fill v30/v31 with the right padding pixel + // Fill v30 with the right padding pixel dup v30.16b, v30.b[0] - dup v31.16b, v31.b[0] 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #11 + cmp w4, #11 b.ge 4f // If w >= 11, all used input pixels are valid // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. 
- // Insert padding in v0/4.b[w+1] onwards; fuse the +1 into the + // Insert padding in v0.b[w+1] onwards; fuse the +1 into the // buffer pointer. movrel x13, right_ext_mask, -1 - sub x13, x13, w5, uxtw + sub x13, x13, w4, uxtw ld1 {v29.16b}, [x13] bit v0.16b, v30.16b, v29.16b - bit v4.16b, v31.16b, v29.16b // Update the precalculated squares umull v1.8h, v0.8b, v0.8b umull2 v2.8h, v0.16b, v0.16b - umull v5.8h, v4.8b, v4.8b - umull2 v6.8h, v4.16b, v4.16b 4: // Loop horizontally ext v16.16b, v0.16b, v0.16b, #1 ext v17.16b, v0.16b, v0.16b, #2 ext v18.16b, v0.16b, v0.16b, #3 ext v19.16b, v0.16b, v0.16b, #4 - ext v20.16b, v4.16b, v4.16b, #1 - ext v21.16b, v4.16b, v4.16b, #2 - ext v22.16b, v4.16b, v4.16b, #3 - ext v23.16b, v4.16b, v4.16b, #4 uaddl v3.8h, v0.8b, v16.8b uaddl v24.8h, v17.8b, v18.8b - uaddl v7.8h, v4.8b, v20.8b uaddw v3.8h, v3.8h, v19.8b - uaddl v25.8h, v21.8b, v22.8b - uaddw v7.8h, v7.8h, v23.8b add v3.8h, v3.8h, v24.8h - add v7.8h, v7.8h, v25.8h ext v16.16b, v1.16b, v2.16b, #2 ext v17.16b, v1.16b, v2.16b, #4 ext v18.16b, v1.16b, v2.16b, #6 ext v19.16b, v1.16b, v2.16b, #8 - ext v20.16b, v5.16b, v6.16b, #2 - ext v21.16b, v5.16b, v6.16b, #4 - ext v22.16b, v5.16b, v6.16b, #6 - ext v23.16b, v5.16b, v6.16b, #8 uaddl v26.4s, v1.4h, v16.4h uaddl2 v27.4s, v1.8h, v16.8h uaddl v16.4s, v17.4h, v18.4h uaddl2 v17.4s, v17.8h, v18.8h - uaddl v28.4s, v5.4h, v20.4h - uaddl2 v29.4s, v5.8h, v20.8h uaddw v26.4s, v26.4s, v19.4h uaddw2 v27.4s, v27.4s, v19.8h - uaddl v20.4s, v21.4h, v22.4h - uaddl2 v21.4s, v21.8h, v22.8h - uaddw v28.4s, v28.4s, v23.4h - uaddw2 v29.4s, v29.4s, v23.8h add v26.4s, v26.4s, v16.4s add v27.4s, v27.4s, v17.4s - add v28.4s, v28.4s, v20.4s - add v29.4s, v29.4s, v21.4s - subs w5, w5, #8 + subs w4, w4, #8 st1 {v3.8h}, [x1], #16 - st1 {v7.8h}, [x11], #16 st1 {v26.4s,v27.4s}, [x0], #32 - st1 {v28.4s,v29.4s}, [x10], #32 b.le 9f - tst w7, #2 // LR_HAVE_RIGHT + tst w5, #2 // LR_HAVE_RIGHT ld1 {v3.8b}, [x3], #8 - ld1 {v7.8b}, [x12], #8 mov v1.16b, v2.16b - mov 
v5.16b, v6.16b ext v0.16b, v0.16b, v3.16b, #8 - ext v4.16b, v4.16b, v7.16b, #8 umull v2.8h, v3.8b, v3.8b - umull v6.8h, v7.8b, v7.8b + b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. 9: - subs w6, w6, #2 - b.le 0f - // Jump to the next row and loop horizontally - add x0, x0, x9, lsl #1 - add x10, x10, x9, lsl #1 - add x1, x1, x9 - add x11, x11, x9 - add x3, x3, x4 - add x12, x12, x4 - mov w5, w8 - b 1b + ret +endfunc + +// void dav1d_sgr_box35_row_h_8bpc_neon(int32_t *sumsq3, int16_t *sum3, +// int32_t *sumsq5, int16_t *sum5, +// const pixel (*left)[4], +// const pixel *src, const int w, +// const enum LrEdgeFlags edges); +function sgr_box35_row_h_8bpc_neon, export=1 + add w6, w6, #2 // w += 2 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 1f + cbnz x4, 0f + + // LR_HAVE_LEFT && left == NULL + sub x5, x5, #3 + ld1 {v0.16b}, [x5], #16 + b 2f + 0: + // LR_HAVE_LEFT, left != NULL + ld1 {v0.16b}, [x5], #16 + ld1 {v1.s}[3], [x4], #4 + // Move x3 back to account for the last 3 bytes we loaded earlier, + // which we'll shift out. + sub x5, x5, #3 + ext v0.16b, v1.16b, v0.16b, #13 + b 2f + +1: + ld1 {v0.16b}, [x5], #16 + // !LR_HAVE_LEFT, fill v1 with the leftmost byte + // and shift v0 to have 3x the first byte at the front. + dup v1.16b, v0.b[0] + // Move x3 back to account for the last 3 bytes we loaded before, + // which we shifted out. + sub x5, x5, #3 + ext v0.16b, v1.16b, v0.16b, #13 + +2: + umull v1.8h, v0.8b, v0.8b + umull2 v2.8h, v0.16b, v0.16b + + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that byte to pad with + // here since we can find it pretty easily from here. 
+ sub w13, w6, #(2 + 16 - 3 + 1) + ldr b30, [x5, w13, sxtw] + // Fill v30 with the right padding pixel + dup v30.16b, v30.b[0] +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w6, #11 + b.ge 4f // If w >= 11, all used input pixels are valid + + // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in v0.b[w+1] onwards; fuse the +1 into the + // buffer pointer. + movrel x13, right_ext_mask, -1 + sub x13, x13, w6, uxtw + ld1 {v29.16b}, [x13] + + bit v0.16b, v30.16b, v29.16b + + // Update the precalculated squares + umull v1.8h, v0.8b, v0.8b + umull2 v2.8h, v0.16b, v0.16b + +4: // Loop horizontally + ext v16.16b, v0.16b, v0.16b, #1 + ext v17.16b, v0.16b, v0.16b, #2 + ext v19.16b, v0.16b, v0.16b, #4 + ext v18.16b, v0.16b, v0.16b, #3 + uaddl v3.8h, v16.8b, v17.8b + uaddl v24.8h, v0.8b, v19.8b + uaddw v3.8h, v3.8h, v18.8b + + ext v16.16b, v1.16b, v2.16b, #2 + ext v17.16b, v1.16b, v2.16b, #4 + ext v19.16b, v1.16b, v2.16b, #8 + ext v18.16b, v1.16b, v2.16b, #6 + + st1 {v3.8h}, [x1], #16 + add v3.8h, v3.8h, v24.8h + + uaddl v26.4s, v16.4h, v17.4h + uaddl2 v27.4s, v16.8h, v17.8h + uaddl v16.4s, v1.4h, v19.4h + uaddl2 v17.4s, v1.8h, v19.8h + uaddw v26.4s, v26.4s, v18.4h + uaddw2 v27.4s, v27.4s, v18.8h + + st1 {v26.4s,v27.4s}, [x0], #32 + add v26.4s, v26.4s, v16.4s + add v27.4s, v27.4s, v17.4s + + subs w6, w6, #8 + + st1 {v3.8h}, [x3], #16 + st1 {v26.4s,v27.4s}, [x2], #32 + + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + ld1 {v3.8b}, [x5], #8 + mov v1.16b, v2.16b + ext v0.16b, v0.16b, v3.16b, #8 + umull v2.8h, v3.8b, v3.8b + + b.ne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. 
+ +9: ret endfunc diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/looprestoration16.S b/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/looprestoration16.S index 8954e604c..3b76b1ee2 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/looprestoration16.S +++ b/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/looprestoration16.S @@ -1070,349 +1070,318 @@ function wiener_filter5_hv_16bpc_neon ret endfunc -#define SUM_STRIDE (384+16) - #include "looprestoration_tmpl.S" -// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum, -// const pixel (*left)[4], -// const pixel *src, const ptrdiff_t stride, -// const int w, const int h, -// const enum LrEdgeFlags edges); -function sgr_box3_h_16bpc_neon, export=1 - add w5, w5, #2 // w += 2 - - // Set up pointers for reading/writing alternate rows - add x10, x0, #(4*SUM_STRIDE) // sumsq - add x11, x1, #(2*SUM_STRIDE) // sum - add x12, x3, x4 // src - lsl x4, x4, #1 - mov x9, #(2*2*SUM_STRIDE) // double sum stride - - // Subtract the aligned width from the output stride. - add w13, w5, #7 - bic w13, w13, #7 - sub x9, x9, w13, uxtw #1 - - // Store the width for the vertical loop - mov w8, w5 - - // Subtract the number of pixels read from the input from the stride - add w13, w13, #8 - sub x4, x4, w13, uxtw #1 +// void dav1d_sgr_box3_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const int w, +// const enum LrEdgeFlags edges); +function sgr_box3_row_h_16bpc_neon, export=1 + add w4, w4, #2 // w += 2 - // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w7, #1 // LR_HAVE_LEFT - b.eq 2f - // LR_HAVE_LEFT + tst w5, #1 // LR_HAVE_LEFT + b.eq 1f cbnz x2, 0f - // left == NULL - sub x3, x3, #4 - sub x12, x12, #4 - b 1f -0: // LR_HAVE_LEFT, left != NULL -2: // !LR_HAVE_LEFT, increase the stride. - // For this case we don't read the left 2 pixels from the src pointer, - // but shift it as if we had done that. 
- add x4, x4, #4 + // LR_HAVE_LEFT && left == NULL + sub x3, x3, #4 + ld1 {v0.8h, v1.8h}, [x3], #32 + b 2f -1: // Loop vertically - ld1 {v0.8h, v1.8h}, [x3], #32 - ld1 {v16.8h, v17.8h}, [x12], #32 - - tst w7, #1 // LR_HAVE_LEFT - b.eq 0f - cbz x2, 2f +0: // LR_HAVE_LEFT, left != NULL - ld1 {v2.d}[1], [x2], #8 - // Move x3/x12 back to account for the last 2 pixels we loaded earlier, + ld1 {v0.8h, v1.8h}, [x3], #32 + ld1 {v2.d}[1], [x2] + // Move x3 back to account for the last 2 pixels we loaded earlier, // which we'll shift out. sub x3, x3, #4 - sub x12, x12, #4 - ld1 {v18.d}[1], [x2], #8 - ext v1.16b, v0.16b, v1.16b, #12 - ext v0.16b, v2.16b, v0.16b, #12 - ext v17.16b, v16.16b, v17.16b, #12 - ext v16.16b, v18.16b, v16.16b, #12 + ext v1.16b, v0.16b, v1.16b, #12 + ext v0.16b, v2.16b, v0.16b, #12 b 2f -0: + +1: + ld1 {v0.8h, v1.8h}, [x3], #32 // !LR_HAVE_LEFT, fill v2 with the leftmost pixel // and shift v0/v1 to have 2x the first pixel at the front. - dup v2.8h, v0.h[0] - dup v18.8h, v16.h[0] + dup v2.8h, v0.h[0] // Move x3 back to account for the last 2 pixels we loaded before, // which we shifted out. sub x3, x3, #4 - sub x12, x12, #4 - ext v1.16b, v0.16b, v1.16b, #12 - ext v0.16b, v2.16b, v0.16b, #12 - ext v17.16b, v16.16b, v17.16b, #12 - ext v16.16b, v18.16b, v16.16b, #12 + ext v1.16b, v0.16b, v1.16b, #12 + ext v0.16b, v2.16b, v0.16b, #12 2: - tst w7, #2 // LR_HAVE_RIGHT + tst w5, #2 // LR_HAVE_RIGHT b.ne 4f // If we'll need to pad the right edge, load that pixel to pad with // here since we can find it pretty easily from here. 
- sub w13, w5, #(2 + 16 - 2 + 1) + sub w13, w4, #(2 + 16 - 2 + 1) ldr h30, [x3, w13, sxtw #1] - ldr h31, [x12, w13, sxtw #1] - // Fill v30/v31 with the right padding pixel + // Fill v30 with the right padding pixel dup v30.8h, v30.h[0] - dup v31.8h, v31.h[0] 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #10 + cmp w4, #10 b.ge 4f // If w >= 10, all used input pixels are valid - // 1 <= w < 10, w pixels valid in v0-v1. For w=9, this ends up called + // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called // again; it's not strictly needed in those cases (we pad enough here), // but keeping the code as simple as possible. - // Insert padding in v0/1.h[w] onwards + // Insert padding in v0.b[w] onwards movrel x13, right_ext_mask - sub x13, x13, w5, uxtw #1 + sub x13, x13, w4, uxtw #1 ld1 {v28.16b, v29.16b}, [x13] bit v0.16b, v30.16b, v28.16b bit v1.16b, v30.16b, v29.16b - bit v16.16b, v31.16b, v28.16b - bit v17.16b, v31.16b, v29.16b 4: // Loop horizontally ext v26.16b, v0.16b, v1.16b, #2 - ext v28.16b, v16.16b, v17.16b, #2 ext v27.16b, v0.16b, v1.16b, #4 - ext v29.16b, v16.16b, v17.16b, #4 add v6.8h, v0.8h, v26.8h umull v22.4s, v0.4h, v0.4h umlal v22.4s, v26.4h, v26.4h umlal v22.4s, v27.4h, v27.4h - add v7.8h, v16.8h, v28.8h - umull v24.4s, v16.4h, v16.4h - umlal v24.4s, v28.4h, v28.4h - umlal v24.4s, v29.4h, v29.4h add v6.8h, v6.8h, v27.8h umull2 v23.4s, v0.8h, v0.8h umlal2 v23.4s, v26.8h, v26.8h umlal2 v23.4s, v27.8h, v27.8h - add v7.8h, v7.8h, v29.8h - umull2 v25.4s, v16.8h, v16.8h - umlal2 v25.4s, v28.8h, v28.8h - umlal2 v25.4s, v29.8h, v29.8h - subs w5, w5, #8 + subs w4, w4, #8 st1 {v6.8h}, [x1], #16 - st1 {v7.8h}, [x11], #16 st1 {v22.4s,v23.4s}, [x0], #32 - st1 {v24.4s,v25.4s}, [x10], #32 b.le 9f - tst w7, #2 // LR_HAVE_RIGHT + tst w5, #2 // LR_HAVE_RIGHT mov v0.16b, v1.16b - mov v16.16b, v17.16b ld1 {v1.8h}, [x3], #16 - ld1 {v17.8h}, [x12], #16 b.ne 4b // If we don't need to pad, just keep summing. 
b 3b // If we need to pad, check how many pixels we have left. 9: - subs w6, w6, #2 - b.le 0f - // Jump to the next row and loop horizontally - add x0, x0, x9, lsl #1 - add x10, x10, x9, lsl #1 - add x1, x1, x9 - add x11, x11, x9 - add x3, x3, x4 - add x12, x12, x4 - mov w5, w8 - b 1b -0: ret endfunc -// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum, -// const pixel (*left)[4], -// const pixel *src, const ptrdiff_t stride, -// const int w, const int h, -// const enum LrEdgeFlags edges); -function sgr_box5_h_16bpc_neon, export=1 - add w5, w5, #2 // w += 2 - - // Set up pointers for reading/writing alternate rows - add x10, x0, #(4*SUM_STRIDE) // sumsq - add x11, x1, #(2*SUM_STRIDE) // sum - add x12, x3, x4 // src - lsl x4, x4, #1 - mov x9, #(2*2*SUM_STRIDE) // double sum stride - - // Subtract the aligned width from the output stride. - add w13, w5, #7 - bic w13, w13, #7 - sub x9, x9, w13, uxtw #1 - add w13, w13, #8 - sub x4, x4, w13, uxtw #1 - - // Store the width for the vertical loop - mov w8, w5 +// void dav1d_sgr_box5_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum, +// const pixel (*left)[4], +// const pixel *src, const int w, +// const enum LrEdgeFlags edges); +function sgr_box5_row_h_16bpc_neon, export=1 + add w4, w4, #2 // w += 2 - // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL - tst w7, #1 // LR_HAVE_LEFT - b.eq 2f - // LR_HAVE_LEFT + tst w5, #1 // LR_HAVE_LEFT + b.eq 1f cbnz x2, 0f - // left == NULL + + // LR_HAVE_LEFT && left == NULL sub x3, x3, #6 - sub x12, x12, #6 - b 1f -0: // LR_HAVE_LEFT, left != NULL -2: // !LR_HAVE_LEFT, increase the stride. - // For this case we don't read the left 3 pixels from the src pointer, - // but shift it as if we had done that. 
- add x4, x4, #6 - -1: // Loop vertically - ld1 {v0.8h, v1.8h}, [x3], #32 - ld1 {v16.8h, v17.8h}, [x12], #32 + ld1 {v0.8h, v1.8h}, [x3], #32 + b 2f - tst w7, #1 // LR_HAVE_LEFT - b.eq 0f - cbz x2, 2f +0: // LR_HAVE_LEFT, left != NULL - ld1 {v2.d}[1], [x2], #8 - // Move x3/x12 back to account for the last 3 pixels we loaded earlier, + ld1 {v0.8h, v1.8h}, [x3], #32 + ld1 {v2.d}[1], [x2], #8 + // Move x3 back to account for the last 3 pixels we loaded earlier, // which we'll shift out. sub x3, x3, #6 - sub x12, x12, #6 - ld1 {v18.d}[1], [x2], #8 ext v1.16b, v0.16b, v1.16b, #10 ext v0.16b, v2.16b, v0.16b, #10 - ext v17.16b, v16.16b, v17.16b, #10 - ext v16.16b, v18.16b, v16.16b, #10 b 2f -0: + +1: + ld1 {v0.8h, v1.8h}, [x3], #32 // !LR_HAVE_LEFT, fill v2 with the leftmost pixel // and shift v0/v1 to have 3x the first pixel at the front. dup v2.8h, v0.h[0] - dup v18.8h, v16.h[0] // Move x3 back to account for the last 3 pixels we loaded before, // which we shifted out. sub x3, x3, #6 - sub x12, x12, #6 ext v1.16b, v0.16b, v1.16b, #10 ext v0.16b, v2.16b, v0.16b, #10 - ext v17.16b, v16.16b, v17.16b, #10 - ext v16.16b, v18.16b, v16.16b, #10 2: - tst w7, #2 // LR_HAVE_RIGHT + tst w5, #2 // LR_HAVE_RIGHT b.ne 4f // If we'll need to pad the right edge, load that pixel to pad with // here since we can find it pretty easily from here. - sub w13, w5, #(2 + 16 - 3 + 1) + sub w13, w4, #(2 + 16 - 3 + 1) ldr h30, [x3, w13, sxtw #1] - ldr h31, [x12, w13, sxtw #1] - // Fill v30/v31 with the right padding pixel + // Fill v30 with the right padding pixel dup v30.8h, v30.h[0] - dup v31.8h, v31.h[0] 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge - cmp w5, #11 + cmp w4, #11 b.ge 4f // If w >= 11, all used input pixels are valid - // 1 <= w < 11, w+1 pixels valid in v0-v1. For w=9 or w=10, + // 1 <= w < 11, w+1 pixels valid in v0. 
For w=9 or w=10, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. - // Insert padding in v0/1.h[w+1] onwards; fuse the +1 into the + // Insert padding in v0.b[w+1] onwards; fuse the +1 into the // buffer pointer. - movrel x13, right_ext_mask, -2 - sub x13, x13, w5, uxtw #1 + movrel x13, right_ext_mask, -1 + sub x13, x13, w4, uxtw #1 ld1 {v28.16b, v29.16b}, [x13] bit v0.16b, v30.16b, v28.16b bit v1.16b, v30.16b, v29.16b - bit v16.16b, v31.16b, v28.16b - bit v17.16b, v31.16b, v29.16b 4: // Loop horizontally ext v26.16b, v0.16b, v1.16b, #2 - ext v28.16b, v16.16b, v17.16b, #2 ext v27.16b, v0.16b, v1.16b, #4 - ext v29.16b, v16.16b, v17.16b, #4 add v6.8h, v0.8h, v26.8h umull v22.4s, v0.4h, v0.4h umlal v22.4s, v26.4h, v26.4h umlal v22.4s, v27.4h, v27.4h - add v7.8h, v16.8h, v28.8h - umull v24.4s, v16.4h, v16.4h - umlal v24.4s, v28.4h, v28.4h - umlal v24.4s, v29.4h, v29.4h add v6.8h, v6.8h, v27.8h umull2 v23.4s, v0.8h, v0.8h umlal2 v23.4s, v26.8h, v26.8h umlal2 v23.4s, v27.8h, v27.8h - add v7.8h, v7.8h, v29.8h - umull2 v25.4s, v16.8h, v16.8h - umlal2 v25.4s, v28.8h, v28.8h - umlal2 v25.4s, v29.8h, v29.8h ext v26.16b, v0.16b, v1.16b, #6 - ext v28.16b, v16.16b, v17.16b, #6 ext v27.16b, v0.16b, v1.16b, #8 - ext v29.16b, v16.16b, v17.16b, #8 add v6.8h, v6.8h, v26.8h umlal v22.4s, v26.4h, v26.4h umlal v22.4s, v27.4h, v27.4h - add v7.8h, v7.8h, v28.8h - umlal v24.4s, v28.4h, v28.4h - umlal v24.4s, v29.4h, v29.4h add v6.8h, v6.8h, v27.8h umlal2 v23.4s, v26.8h, v26.8h umlal2 v23.4s, v27.8h, v27.8h - add v7.8h, v7.8h, v29.8h - umlal2 v25.4s, v28.8h, v28.8h - umlal2 v25.4s, v29.8h, v29.8h - subs w5, w5, #8 + subs w4, w4, #8 st1 {v6.8h}, [x1], #16 - st1 {v7.8h}, [x11], #16 st1 {v22.4s,v23.4s}, [x0], #32 - st1 {v24.4s,v25.4s}, [x10], #32 b.le 9f - tst w7, #2 // LR_HAVE_RIGHT + tst w5, #2 // LR_HAVE_RIGHT mov v0.16b, v1.16b - mov v16.16b, v17.16b - ld1 {v1.8h}, [x3], #16 - ld1 {v17.8h}, [x12], #16 
+ ld1 {v1.8h}, [x3], #16 b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. 9: - subs w6, w6, #2 - b.le 0f - // Jump to the next row and loop horizontally - add x0, x0, x9, lsl #1 - add x10, x10, x9, lsl #1 - add x1, x1, x9 - add x11, x11, x9 - add x3, x3, x4 - add x12, x12, x4 - mov w5, w8 - b 1b + ret +endfunc + +// void dav1d_sgr_box35_row_h_16bpc_neon(int32_t *sumsq3, int16_t *sum3, +// int32_t *sumsq5, int16_t *sum5, +// const pixel (*left)[4], +// const pixel *src, const int w, +// const enum LrEdgeFlags edges); +function sgr_box35_row_h_16bpc_neon, export=1 + add w6, w6, #2 // w += 2 + + tst w7, #1 // LR_HAVE_LEFT + b.eq 1f + cbnz x4, 0f + + // LR_HAVE_LEFT && left == NULL + sub x5, x5, #6 + ld1 {v0.8h, v1.8h}, [x5], #32 + b 2f + 0: + // LR_HAVE_LEFT, left != NULL + ld1 {v0.8h, v1.8h}, [x5], #32 + ld1 {v2.d}[1], [x4], #8 + // Move x3 back to account for the last 3 pixels we loaded earlier, + // which we'll shift out. + sub x5, x5, #6 + ext v1.16b, v0.16b, v1.16b, #10 + ext v0.16b, v2.16b, v0.16b, #10 + b 2f + +1: + ld1 {v0.8h, v1.8h}, [x5], #32 + // !LR_HAVE_LEFT, fill v2 with the leftmost pixel + // and shift v0/v1 to have 3x the first pixel at the front. + dup v2.8h, v0.h[0] + // Move x5 back to account for the last 3 pixels we loaded before, + // which we shifted out. + sub x5, x5, #6 + ext v1.16b, v0.16b, v1.16b, #10 + ext v0.16b, v2.16b, v0.16b, #10 + +2: + tst w7, #2 // LR_HAVE_RIGHT + b.ne 4f + // If we'll need to pad the right edge, load that pixel to pad with + // here since we can find it pretty easily from here. + sub w13, w6, #(2 + 16 - 3 + 1) + ldr h30, [x5, w13, sxtw #1] + // Fill v30 with the right padding pixel + dup v30.8h, v30.h[0] +3: // !LR_HAVE_RIGHT + + // Check whether we need to pad the right edge + cmp w6, #11 + b.ge 4f // If w >= 11, all used input pixels are valid + + // 1 <= w < 11, w+1 pixels valid in v0. 
For w=9 or w=10, + // this ends up called again; it's not strictly needed in those + // cases (we pad enough here), but keeping the code as simple as possible. + + // Insert padding in v0.b[w+1] onwards; fuse the +1 into the + // buffer pointer. + movrel x13, right_ext_mask, -1 + sub x13, x13, w6, uxtw #1 + ld1 {v28.16b, v29.16b}, [x13] + + bit v0.16b, v30.16b, v28.16b + bit v1.16b, v30.16b, v29.16b + +4: // Loop horizontally + ext v16.16b, v0.16b, v1.16b, #2 + ext v17.16b, v0.16b, v1.16b, #4 + ext v19.16b, v0.16b, v1.16b, #8 + ext v18.16b, v0.16b, v1.16b, #6 + + add v20.8h, v16.8h, v17.8h + add v21.8h, v0.8h, v19.8h + add v20.8h, v20.8h, v18.8h + + umull v22.4s, v16.4h, v16.4h + umlal v22.4s, v17.4h, v17.4h + umlal v22.4s, v18.4h, v18.4h + + umull2 v23.4s, v16.8h, v16.8h + umlal2 v23.4s, v17.8h, v17.8h + umlal2 v23.4s, v18.8h, v18.8h + + add v21.8h, v21.8h, v20.8h + st1 {v20.8h}, [x1], #16 + st1 {v22.4s,v23.4s}, [x0], #32 + + umlal v22.4s, v0.4h, v0.4h + umlal v22.4s, v19.4h, v19.4h + + umlal2 v23.4s, v0.8h, v0.8h + umlal2 v23.4s, v19.8h, v19.8h + + subs w6, w6, #8 + + st1 {v21.8h}, [x3], #16 + st1 {v22.4s,v23.4s}, [x2], #32 + + b.le 9f + tst w7, #2 // LR_HAVE_RIGHT + mov v0.16b, v1.16b + ld1 {v1.8h}, [x5], #16 + + b.ne 4b // If we don't need to pad, just keep summing. + b 3b // If we need to pad, check how many pixels we have left. 
+ +9: ret endfunc diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/looprestoration_common.S b/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/looprestoration_common.S index 200eb6318..745f6c20f 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/looprestoration_common.S +++ b/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/looprestoration_common.S @@ -28,332 +28,29 @@ #include "src/arm/asm.S" #include "util.S" -#define SUM_STRIDE (384+16) - -// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, -// const int w, const int h, -// const enum LrEdgeFlags edges); -function sgr_box3_v_neon, export=1 - add w10, w3, #2 // Number of output rows to move back - mov w11, w3 // Number of input rows to move back - add w2, w2, #2 // Actual summed width - mov x7, #(4*SUM_STRIDE) // sumsq stride - mov x8, #(2*SUM_STRIDE) // sum stride - sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride - sub x1, x1, #(2*SUM_STRIDE) // sum -= stride - - tst w4, #4 // LR_HAVE_TOP - b.eq 0f - // If have top, read from row -2. - sub x5, x0, #(4*SUM_STRIDE) - sub x6, x1, #(2*SUM_STRIDE) - add w11, w11, #2 - b 1f -0: - // !LR_HAVE_TOP - // If we don't have top, read from row 0 even if - // we start writing to row -1. - add x5, x0, #(4*SUM_STRIDE) - add x6, x1, #(2*SUM_STRIDE) -1: - - tst w4, #8 // LR_HAVE_BOTTOM - b.eq 1f - // LR_HAVE_BOTTOM - add w3, w3, #2 // Sum all h+2 lines with the main loop - add w11, w11, #2 -1: - mov w9, w3 // Backup of h for next loops - -1: - // Start of horizontal loop; start one vertical filter slice. - // Start loading rows into v16-v21 and v24-v26 taking top - // padding into consideration. 
- tst w4, #4 // LR_HAVE_TOP - ld1 {v16.4s, v17.4s}, [x5], x7 - ld1 {v24.8h}, [x6], x8 - b.eq 2f - // LR_HAVE_TOP - ld1 {v18.4s, v19.4s}, [x5], x7 - ld1 {v25.8h}, [x6], x8 - ld1 {v20.4s, v21.4s}, [x5], x7 - ld1 {v26.8h}, [x6], x8 - b 3f -2: // !LR_HAVE_TOP - mov v18.16b, v16.16b - mov v19.16b, v17.16b - mov v25.16b, v24.16b - mov v20.16b, v16.16b - mov v21.16b, v17.16b - mov v26.16b, v24.16b - -3: - subs w3, w3, #1 -.macro add3 - add v16.4s, v16.4s, v18.4s - add v17.4s, v17.4s, v19.4s - add v24.8h, v24.8h, v25.8h - add v16.4s, v16.4s, v20.4s - add v17.4s, v17.4s, v21.4s - add v24.8h, v24.8h, v26.8h - st1 {v16.4s, v17.4s}, [x0], x7 - st1 {v24.8h}, [x1], x8 -.endm - add3 - mov v16.16b, v18.16b - mov v17.16b, v19.16b - mov v24.16b, v25.16b - mov v18.16b, v20.16b - mov v19.16b, v21.16b - mov v25.16b, v26.16b - b.le 4f - ld1 {v20.4s, v21.4s}, [x5], x7 - ld1 {v26.8h}, [x6], x8 - b 3b - -4: - tst w4, #8 // LR_HAVE_BOTTOM - b.ne 5f - // !LR_HAVE_BOTTOM - // Produce two more rows, extending the already loaded rows. - add3 - mov v16.16b, v18.16b - mov v17.16b, v19.16b - mov v24.16b, v25.16b - add3 - -5: // End of one vertical slice. - subs w2, w2, #8 - b.le 0f - // Move pointers back up to the top and loop horizontally. - // Input pointers - msub x5, x7, x11, x5 - msub x6, x8, x11, x6 - // Output pointers - msub x0, x7, x10, x0 - msub x1, x8, x10, x1 - add x0, x0, #32 - add x1, x1, #16 - add x5, x5, #32 - add x6, x6, #16 - mov w3, w9 - b 1b - -0: - ret -.purgem add3 -endfunc +// void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum, +// int32_t *AA, int16_t *BB, +// const int w, const int s, +// const int bitdepth_max); +function sgr_box3_vert_neon, export=1 + stp d8, d9, [sp, #-0x30]! 
+ stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] -// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, -// const int w, const int h, -// const enum LrEdgeFlags edges); -function sgr_box5_v_neon, export=1 - add w10, w3, #2 // Number of output rows to move back - mov w11, w3 // Number of input rows to move back - add w2, w2, #8 // Actual summed width - mov x7, #(4*SUM_STRIDE) // sumsq stride - mov x8, #(2*SUM_STRIDE) // sum stride - sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride - sub x1, x1, #(2*SUM_STRIDE) // sum -= stride - - tst w4, #4 // LR_HAVE_TOP - b.eq 0f - // If have top, read from row -2. - sub x5, x0, #(4*SUM_STRIDE) - sub x6, x1, #(2*SUM_STRIDE) - add w11, w11, #2 - b 1f -0: - // !LR_HAVE_TOP - // If we don't have top, read from row 0 even if - // we start writing to row -1. - add x5, x0, #(4*SUM_STRIDE) - add x6, x1, #(2*SUM_STRIDE) -1: - - tst w4, #8 // LR_HAVE_BOTTOM - b.eq 0f - // LR_HAVE_BOTTOM - add w3, w3, #2 // Handle h+2 lines with the main loop - add w11, w11, #2 - b 1f -0: - // !LR_HAVE_BOTTOM - sub w3, w3, #1 // Handle h-1 lines with the main loop -1: - mov w9, w3 // Backup of h for next loops - -1: - // Start of horizontal loop; start one vertical filter slice. - // Start loading rows into v16-v25 and v26-v30 taking top - // padding into consideration. 
- tst w4, #4 // LR_HAVE_TOP - ld1 {v16.4s, v17.4s}, [x5], x7 - ld1 {v26.8h}, [x6], x8 - b.eq 2f - // LR_HAVE_TOP - ld1 {v20.4s, v21.4s}, [x5], x7 - ld1 {v28.8h}, [x6], x8 - mov v18.16b, v16.16b - mov v19.16b, v17.16b - mov v27.16b, v26.16b - ld1 {v22.4s, v23.4s}, [x5], x7 - ld1 {v29.8h}, [x6], x8 - b 3f -2: // !LR_HAVE_TOP - mov v18.16b, v16.16b - mov v19.16b, v17.16b - mov v27.16b, v26.16b - mov v20.16b, v16.16b - mov v21.16b, v17.16b - mov v28.16b, v26.16b - mov v22.16b, v16.16b - mov v23.16b, v17.16b - mov v29.16b, v26.16b - -3: - cbz w3, 4f - ld1 {v24.4s, v25.4s}, [x5], x7 - ld1 {v30.8h}, [x6], x8 - -3: - // Start of vertical loop - subs w3, w3, #2 -.macro add5 - add v16.4s, v16.4s, v18.4s - add v17.4s, v17.4s, v19.4s - add v26.8h, v26.8h, v27.8h - add v0.4s, v20.4s, v22.4s - add v1.4s, v21.4s, v23.4s - add v2.8h, v28.8h, v29.8h - add v16.4s, v16.4s, v24.4s - add v17.4s, v17.4s, v25.4s - add v26.8h, v26.8h, v30.8h - add v16.4s, v16.4s, v0.4s - add v17.4s, v17.4s, v1.4s - add v26.8h, v26.8h, v2.8h - st1 {v16.4s, v17.4s}, [x0], x7 - st1 {v26.8h}, [x1], x8 -.endm - add5 -.macro shift2 - mov v16.16b, v20.16b - mov v17.16b, v21.16b - mov v26.16b, v28.16b - mov v18.16b, v22.16b - mov v19.16b, v23.16b - mov v27.16b, v29.16b - mov v20.16b, v24.16b - mov v21.16b, v25.16b - mov v28.16b, v30.16b -.endm - shift2 - add x0, x0, x7 - add x1, x1, x8 - b.le 5f - ld1 {v22.4s, v23.4s}, [x5], x7 - ld1 {v29.8h}, [x6], x8 - ld1 {v24.4s, v25.4s}, [x5], x7 - ld1 {v30.8h}, [x6], x8 - b 3b - -4: - // h == 1, !LR_HAVE_BOTTOM. - // Pad the last row with the only content row, and add. - mov v24.16b, v22.16b - mov v25.16b, v23.16b - mov v30.16b, v29.16b - add5 - shift2 - add x0, x0, x7 - add x1, x1, x8 - add5 - b 6f - -5: - tst w4, #8 // LR_HAVE_BOTTOM - b.ne 6f - // !LR_HAVE_BOTTOM - cbnz w3, 5f - // The intended three edge rows left; output the one at h-2 and - // the past edge one at h. 
- ld1 {v22.4s, v23.4s}, [x5], x7 - ld1 {v29.8h}, [x6], x8 - // Pad the past-edge row from the last content row. - mov v24.16b, v22.16b - mov v25.16b, v23.16b - mov v30.16b, v29.16b - add5 - shift2 - add x0, x0, x7 - add x1, x1, x8 - // The last two rows are already padded properly here. - add5 - b 6f - -5: - // w3 == -1, two rows left, output one. - // Pad the last two rows from the mid one. - mov v22.16b, v20.16b - mov v23.16b, v21.16b - mov v29.16b, v28.16b - mov v24.16b, v20.16b - mov v25.16b, v21.16b - mov v30.16b, v28.16b - add5 - add x0, x0, x7 - add x1, x1, x8 - b 6f - -6: // End of one vertical slice. - subs w2, w2, #8 - b.le 0f - // Move pointers back up to the top and loop horizontally. - // Input pointers - msub x5, x7, x11, x5 - msub x6, x8, x11, x6 - // Output pointers - msub x0, x7, x10, x0 - msub x1, x8, x10, x1 - add x0, x0, #32 - add x1, x1, #16 - add x5, x5, #32 - add x6, x6, #16 - mov w3, w9 - b 1b - -0: - ret -.purgem add5 -endfunc + add w4, w4, #2 + clz w9, w6 // bitdepth_max + dup v28.4s, w5 // strength -// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, -// const int w, const int h, const int strength, -// const int bitdepth_max); -// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, -// const int w, const int h, const int strength, -// const int bitdepth_max); -function sgr_calc_ab1_neon, export=1 - clz w9, w5 - add x3, x3, #2 // h += 2 - movi v31.4s, #9 // n - mov x5, #455 - mov x8, #SUM_STRIDE - b sgr_calc_ab_neon -endfunc + ldp x5, x6, [x0] + ldr x0, [x0, #16] + ldp x7, x8, [x1] + ldr x1, [x1, #16] -function sgr_calc_ab2_neon, export=1 - clz w9, w5 - add x3, x3, #3 // h += 3 - asr x3, x3, #1 // h /= 2 - movi v31.4s, #25 // n - mov x5, #164 - mov x8, #(2*SUM_STRIDE) -endfunc + movi v31.4s, #9 // n -function sgr_calc_ab_neon sub w9, w9, #24 // -bitdepth_min_8 movrel x12, X(sgr_x_by_x) + mov w13, #455 // one_by_x ld1 {v16.16b, v17.16b, v18.16b}, [x12] dup v6.8h, w9 // -bitdepth_min_8 movi v19.16b, #5 @@ -363,70 +60,213 @@ function 
sgr_calc_ab_neon movi v23.8b, #169 // idx of last 2 movi v24.8b, #254 // idx of last 1 saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8 - add x2, x2, #2 // w += 2 - add x7, x2, #7 - bic x7, x7, #7 // aligned w - sub x7, x8, x7 // increment between rows movi v29.8h, #1, lsl #8 - dup v28.4s, w4 - dup v30.4s, w5 // one_by_x - sub x0, x0, #(4*(SUM_STRIDE)) - sub x1, x1, #(2*(SUM_STRIDE)) - mov x6, x2 // backup of w + dup v30.4s, w13 // one_by_x + + sub v16.16b, v16.16b, v19.16b + sub v17.16b, v17.16b, v19.16b + sub v18.16b, v18.16b, v19.16b + + ld1 {v8.4s, v9.4s}, [x5], #32 + ld1 {v10.4s, v11.4s}, [x6], #32 + ld1 {v12.8h}, [x7], #16 + ld1 {v13.8h}, [x8], #16 + ld1 {v0.4s, v1.4s}, [x0], #32 + ld1 {v2.8h}, [x1], #16 +1: + + add v8.4s, v8.4s, v10.4s + add v9.4s, v9.4s, v11.4s + + add v12.8h, v12.8h, v13.8h + + subs w4, w4, #8 + add v0.4s, v0.4s, v8.4s + add v1.4s, v1.4s, v9.4s + add v2.8h, v2.8h, v12.8h + + srshl v0.4s, v0.4s, v7.4s + srshl v1.4s, v1.4s, v7.4s + srshl v4.8h, v2.8h, v6.8h + mul v0.4s, v0.4s, v31.4s // a * n + mul v1.4s, v1.4s, v31.4s // a * n + umull v3.4s, v4.4h, v4.4h // b * b + umull2 v4.4s, v4.8h, v4.8h // b * b + uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0) + uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0) + mul v0.4s, v0.4s, v28.4s // p * s + mul v1.4s, v1.4s, v28.4s // p * s + ld1 {v8.4s, v9.4s}, [x5], #32 + uqshrn v0.4h, v0.4s, #16 + uqshrn2 v0.8h, v1.4s, #16 + ld1 {v10.4s, v11.4s}, [x6], #32 + uqrshrn v0.8b, v0.8h, #4 // imin(z, 255) + + ld1 {v12.8h}, [x7], #16 + + cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5 + cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4 + tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b + cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3 + cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2 + add v25.8b, v25.8b, v26.8b + cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1 + add v27.8b, v27.8b, v4.8b + add v5.8b, v5.8b, v19.8b + add v25.8b, v25.8b, v27.8b + add v5.8b, v1.8b, v5.8b + 
ld1 {v13.8h}, [x8], #16 + add v5.8b, v5.8b, v25.8b + ld1 {v0.4s, v1.4s}, [x0], #32 + uxtl v5.8h, v5.8b // x + + umull v3.4s, v5.4h, v2.4h // x * BB[i] + umull2 v4.4s, v5.8h, v2.8h // x * BB[i] + mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x + mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x + srshr v3.4s, v3.4s, #12 // AA[i] + srshr v4.4s, v4.4s, #12 // AA[i] + sub v5.8h, v29.8h, v5.8h // 256 - x + ld1 {v2.8h}, [x1], #16 + + st1 {v3.4s, v4.4s}, [x2], #32 + st1 {v5.8h}, [x3], #16 + b.gt 1b + + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x30 + ret +endfunc + +// void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum, +// int32_t *AA, int16_t *BB, +// const int w, const int s, +// const int bitdepth_max); +function sgr_box5_vert_neon, export=1 + stp d8, d9, [sp, #-0x40]! + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + + add w4, w4, #2 + clz w15, w6 // bitdepth_max + dup v28.4s, w5 // strength + + ldp x5, x6, [x0] + ldp x7, x8, [x0, #16] + ldr x0, [x0, #32] + ldp x9, x10, [x1] + ldp x11, x12, [x1, #16] + ldr x1, [x1, #32] + + movi v31.4s, #25 // n + + sub w15, w15, #24 // -bitdepth_min_8 + movrel x13, X(sgr_x_by_x) + mov w14, #164 // one_by_x + ld1 {v16.16b, v17.16b, v18.16b}, [x13] + dup v6.8h, w15 // -bitdepth_min_8 + movi v19.16b, #5 + movi v24.8b, #254 // idx of last 1 + saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8 + movi v29.8h, #1, lsl #8 + dup v30.4s, w14 // one_by_x + sub v16.16b, v16.16b, v19.16b sub v17.16b, v17.16b, v19.16b sub v18.16b, v18.16b, v19.16b + + ld1 {v8.4s, v9.4s}, [x5], #32 + ld1 {v10.4s, v11.4s}, [x6], #32 + ld1 {v12.4s, v13.4s}, [x7], #32 + ld1 {v14.4s, v15.4s}, [x8], #32 + ld1 {v20.8h}, [x9], #16 + ld1 {v21.8h}, [x10], #16 + ld1 {v22.8h}, [x11], #16 + ld1 {v23.8h}, [x12], #16 + ld1 {v0.4s, v1.4s}, [x0], #32 + ld1 {v2.8h}, [x1], #16 + 1: - subs x2, x2, #8 - ld1 {v0.4s, v1.4s}, [x0] // a - ld1 {v2.8h}, [x1] // b - srshl v0.4s, v0.4s, v7.4s - srshl v1.4s, 
v1.4s, v7.4s - srshl v4.8h, v2.8h, v6.8h - mul v0.4s, v0.4s, v31.4s // a * n - mul v1.4s, v1.4s, v31.4s // a * n - umull v3.4s, v4.4h, v4.4h // b * b - umull2 v4.4s, v4.8h, v4.8h // b * b - uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0) - uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0) - mul v0.4s, v0.4s, v28.4s // p * s - mul v1.4s, v1.4s, v28.4s // p * s - uqshrn v0.4h, v0.4s, #16 - uqshrn2 v0.8h, v1.4s, #16 - uqrshrn v0.8b, v0.8h, #4 // imin(z, 255) - - cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5 - cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4 - tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b - cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3 - cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2 - add v25.8b, v25.8b, v26.8b - cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1 - add v27.8b, v27.8b, v4.8b - add v5.8b, v5.8b, v19.8b - add v25.8b, v25.8b, v27.8b - add v1.8b, v1.8b, v5.8b - add v1.8b, v1.8b, v25.8b - uxtl v1.8h, v1.8b // x - - umull v3.4s, v1.4h, v2.4h // x * BB[i] - umull2 v4.4s, v1.8h, v2.8h // x * BB[i] - mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x - mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x - srshr v3.4s, v3.4s, #12 // AA[i] - srshr v4.4s, v4.4s, #12 // AA[i] - sub v2.8h, v29.8h, v1.8h // 256 - x - - st1 {v3.4s, v4.4s}, [x0], #32 - st1 {v2.8h}, [x1], #16 + add v8.4s, v8.4s, v10.4s + add v9.4s, v9.4s, v11.4s + add v12.4s, v12.4s, v14.4s + add v13.4s, v13.4s, v15.4s + + add v20.8h, v20.8h, v21.8h + add v22.8h, v22.8h, v23.8h + + add v0.4s, v0.4s, v8.4s + add v1.4s, v1.4s, v9.4s + add v2.8h, v2.8h, v20.8h + + add v0.4s, v0.4s, v12.4s + add v1.4s, v1.4s, v13.4s + add v2.8h, v2.8h, v22.8h + + subs w4, w4, #8 + + movi v20.8b, #55 // idx of last 5 + movi v21.8b, #72 // idx of last 4 + movi v22.8b, #101 // idx of last 3 + movi v23.8b, #169 // idx of last 2 + + srshl v0.4s, v0.4s, v7.4s + srshl v1.4s, v1.4s, v7.4s + srshl v4.8h, v2.8h, v6.8h + mul v0.4s, v0.4s, v31.4s // a * n + mul 
v1.4s, v1.4s, v31.4s // a * n + umull v3.4s, v4.4h, v4.4h // b * b + umull2 v4.4s, v4.8h, v4.8h // b * b + uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0) + uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0) + mul v0.4s, v0.4s, v28.4s // p * s + mul v1.4s, v1.4s, v28.4s // p * s + ld1 {v8.4s, v9.4s}, [x5], #32 + uqshrn v0.4h, v0.4s, #16 + uqshrn2 v0.8h, v1.4s, #16 + ld1 {v10.4s, v11.4s}, [x6], #32 + uqrshrn v0.8b, v0.8h, #4 // imin(z, 255) + + ld1 {v12.4s, v13.4s}, [x7], #32 + + cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5 + cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4 + tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b + cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3 + cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2 + ld1 {v14.4s, v15.4s}, [x8], #32 + add v25.8b, v25.8b, v26.8b + cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1 + add v27.8b, v27.8b, v4.8b + ld1 {v20.8h}, [x9], #16 + add v5.8b, v5.8b, v19.8b + add v25.8b, v25.8b, v27.8b + ld1 {v21.8h}, [x10], #16 + add v5.8b, v1.8b, v5.8b + ld1 {v22.8h}, [x11], #16 + add v5.8b, v5.8b, v25.8b + ld1 {v23.8h}, [x12], #16 + uxtl v5.8h, v5.8b // x + + ld1 {v0.4s, v1.4s}, [x0], #32 + umull v3.4s, v5.4h, v2.4h // x * BB[i] + umull2 v4.4s, v5.8h, v2.8h // x * BB[i] + mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x + mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x + srshr v3.4s, v3.4s, #12 // AA[i] + srshr v4.4s, v4.4s, #12 // AA[i] + sub v5.8h, v29.8h, v5.8h // 256 - x + ld1 {v2.8h}, [x1], #16 + + st1 {v3.4s, v4.4s}, [x2], #32 + st1 {v5.8h}, [x3], #16 b.gt 1b - subs x3, x3, #1 - b.le 0f - add x0, x0, x7, lsl #2 - add x1, x1, x7, lsl #1 - mov x2, x6 - b 1b -0: + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 ret endfunc diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/looprestoration_tmpl.S b/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/looprestoration_tmpl.S index 7cdfd6f3f..1373f9ace 100644 --- 
a/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/looprestoration_tmpl.S +++ b/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/looprestoration_tmpl.S @@ -30,52 +30,224 @@ #define FILTER_OUT_STRIDE 384 .macro sgr_funcs bpc -// void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp, -// const pixel *src, const ptrdiff_t stride, -// const int32_t *a, const int16_t *b, -// const int w, const int h); -function sgr_finish_filter1_\bpc\()bpc_neon, export=1 - sub x7, x3, #(4*SUM_STRIDE) - add x8, x3, #(4*SUM_STRIDE) - sub x9, x4, #(2*SUM_STRIDE) - add x10, x4, #(2*SUM_STRIDE) - mov x11, #SUM_STRIDE - mov x12, #FILTER_OUT_STRIDE - add x13, x5, #7 - bic x13, x13, #7 // Aligned width +// void dav1d_sgr_finish_filter1_2rows_Xbpc_neon(int16_t *tmp, +// const pixel *src, +// const ptrdiff_t src_stride, +// const int32_t **a, +// const int16_t **b, +// const int w, const int h); +function sgr_finish_filter1_2rows_\bpc\()bpc_neon, export=1 + stp d8, d9, [sp, #-0x40]! + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + + ldp x7, x8, [x3] + ldp x9, x3, [x3, #16] + ldp x10, x11, [x4] + ldp x12, x4, [x4, #16] + + mov x13, #FILTER_OUT_STRIDE + cmp w6, #1 + add x2, x1, x2 // src + stride + csel x2, x1, x2, le // if (h <= 1) x2 = x1 + add x13, x0, x13, lsl #1 + + movi v30.8h, #3 + movi v31.4s, #3 +1: + ld1 {v0.8h, v1.8h}, [x10], #32 + ld1 {v2.8h, v3.8h}, [x11], #32 + ld1 {v4.8h, v5.8h}, [x12], #32 + ld1 {v6.8h, v7.8h}, [x4], #32 + ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48 + ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48 + ld1 {v22.4s, v23.4s, v24.4s}, [x9], #48 + ld1 {v25.4s, v26.4s, v27.4s}, [x3], #48 + +2: + ext v8.16b, v0.16b, v1.16b, #2 // [0][1] + ext v9.16b, v2.16b, v3.16b, #2 // [1][1] + ext v10.16b, v4.16b, v5.16b, #2 // [2][1] + ext v11.16b, v0.16b, v1.16b, #4 // [0][2] + ext v12.16b, v2.16b, v3.16b, #4 // [1][2] + ext v13.16b, v4.16b, v5.16b, #4 // [2][2] + + add v14.8h, v2.8h, v8.8h // [1][0] + [0][1] + add v15.8h, v9.8h, v10.8h // [1][1] + [2][1] + + add v28.8h, 
v0.8h, v11.8h // [0][0] + [0][2] + add v14.8h, v14.8h, v12.8h // () + [1][2] + add v29.8h, v4.8h, v13.8h // [2][0] + [2][2] + + ext v8.16b, v6.16b, v7.16b, #2 // [3][1] + ext v11.16b, v6.16b, v7.16b, #4 // [3][2] + + add v14.8h, v14.8h, v15.8h // mid + add v15.8h, v28.8h, v29.8h // corners + + add v28.8h, v4.8h, v9.8h // [2][0] + [1][1] + add v29.8h, v10.8h, v8.8h // [2][1] + [3][1] + + add v2.8h, v2.8h, v12.8h // [1][0] + [1][2] + add v28.8h, v28.8h, v13.8h // () + [2][2] + add v4.8h, v6.8h, v11.8h // [3][0] + [3][2] + + add v0.8h, v28.8h, v29.8h // mid + add v2.8h, v2.8h, v4.8h // corners + + shl v4.8h, v14.8h, #2 + mla v4.8h, v15.8h, v30.8h // * 3 -> a + + shl v0.8h, v0.8h, #2 + mla v0.8h, v2.8h, v30.8h // * 3 -> a + + ext v8.16b, v16.16b, v17.16b, #4 // [0][1] + ext v9.16b, v17.16b, v18.16b, #4 + ext v10.16b, v16.16b, v17.16b, #8 // [0][2] + ext v11.16b, v17.16b, v18.16b, #8 + ext v12.16b, v19.16b, v20.16b, #4 // [1][1] + ext v13.16b, v20.16b, v21.16b, #4 + add v8.4s, v8.4s, v19.4s // [0][1] + [1][0] + add v9.4s, v9.4s, v20.4s + add v16.4s, v16.4s, v10.4s // [0][0] + [0][2] + add v17.4s, v17.4s, v11.4s + ext v14.16b, v19.16b, v20.16b, #8 // [1][2] + ext v15.16b, v20.16b, v21.16b, #8 + add v16.4s, v16.4s, v22.4s // () + [2][0] + add v17.4s, v17.4s, v23.4s + add v28.4s, v12.4s, v14.4s // [1][1] + [1][2] + add v29.4s, v13.4s, v15.4s + ext v10.16b, v22.16b, v23.16b, #4 // [2][1] + ext v11.16b, v23.16b, v24.16b, #4 + add v8.4s, v8.4s, v28.4s // mid (incomplete) + add v9.4s, v9.4s, v29.4s + + add v19.4s, v19.4s, v14.4s // [1][0] + [1][2] + add v20.4s, v20.4s, v15.4s + add v14.4s, v22.4s, v12.4s // [2][0] + [1][1] + add v15.4s, v23.4s, v13.4s + + ext v12.16b, v22.16b, v23.16b, #8 // [2][2] + ext v13.16b, v23.16b, v24.16b, #8 + ext v28.16b, v25.16b, v26.16b, #4 // [3][1] + ext v29.16b, v26.16b, v27.16b, #4 + add v8.4s, v8.4s, v10.4s // () + [2][1] = mid + add v9.4s, v9.4s, v11.4s + add v14.4s, v14.4s, v10.4s // () + [2][1] + add v15.4s, v15.4s, v11.4s + ext v10.16b, 
v25.16b, v26.16b, #8 // [3][2] + ext v11.16b, v26.16b, v27.16b, #8 + add v16.4s, v16.4s, v12.4s // () + [2][2] = corner + add v17.4s, v17.4s, v13.4s + + add v12.4s, v12.4s, v28.4s // [2][2] + [3][1] + add v13.4s, v13.4s, v29.4s + add v25.4s, v25.4s, v10.4s // [3][0] + [3][2] + add v26.4s, v26.4s, v11.4s + + add v14.4s, v14.4s, v12.4s // mid + add v15.4s, v15.4s, v13.4s + add v19.4s, v19.4s, v25.4s // corner + add v20.4s, v20.4s, v26.4s + .if \bpc == 8 - sub x2, x2, x13 + ld1 {v25.8b}, [x1], #8 // src + ld1 {v26.8b}, [x2], #8 .else - sub x2, x2, x13, lsl #1 + ld1 {v25.8h}, [x1], #16 // src + ld1 {v26.8h}, [x2], #16 .endif - sub x12, x12, x13 - sub x11, x11, x13 - sub x11, x11, #4 // We read 4 extra elements from a - sub x14, x11, #4 // We read 8 extra elements from b - mov x13, x5 + + shl v8.4s, v8.4s, #2 + shl v9.4s, v9.4s, #2 + mla v8.4s, v16.4s, v31.4s // * 3 -> b + mla v9.4s, v17.4s, v31.4s + +.if \bpc == 8 + uxtl v25.8h, v25.8b // src + uxtl v26.8h, v26.8b +.endif + + shl v14.4s, v14.4s, #2 + shl v15.4s, v15.4s, #2 + mla v14.4s, v19.4s, v31.4s // * 3 -> b + mla v15.4s, v20.4s, v31.4s + + umlal v8.4s, v4.4h, v25.4h // b + a * src + umlal2 v9.4s, v4.8h, v25.8h + umlal v14.4s, v0.4h, v26.4h // b + a * src + umlal2 v15.4s, v0.8h, v26.8h + mov v0.16b, v1.16b + rshrn v8.4h, v8.4s, #9 + rshrn2 v8.8h, v9.4s, #9 + mov v2.16b, v3.16b + rshrn v14.4h, v14.4s, #9 + rshrn2 v14.8h, v15.4s, #9 + subs w5, w5, #8 + mov v4.16b, v5.16b + st1 {v8.8h}, [x0], #16 + mov v6.16b, v7.16b + st1 {v14.8h}, [x13], #16 + + b.le 3f + mov v16.16b, v18.16b + mov v19.16b, v21.16b + mov v22.16b, v24.16b + mov v25.16b, v27.16b + ld1 {v1.8h}, [x10], #16 + ld1 {v3.8h}, [x11], #16 + ld1 {v5.8h}, [x12], #16 + ld1 {v7.8h}, [x4], #16 + ld1 {v17.4s, v18.4s}, [x7], #32 + ld1 {v20.4s, v21.4s}, [x8], #32 + ld1 {v23.4s, v24.4s}, [x9], #32 + ld1 {v26.4s, v27.4s}, [x3], #32 + b 2b + +3: + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + ret +endfunc + 
+// void dav1d_sgr_finish_weighted1_Xbpc_neon(pixel *dst, +// const int32_t **a, const int16_t **b, +// const int w, const int w1, +// const int bitdepth_max); +function sgr_finish_weighted1_\bpc\()bpc_neon, export=1 + ldp x7, x8, [x1] + ldr x1, [x1, #16] + ldp x9, x10, [x2] + ldr x2, [x2, #16] + + dup v31.8h, w4 + dup v30.8h, w5 + movi v6.8h, #3 movi v7.4s, #3 1: - ld1 {v0.8h, v1.8h}, [x9], #32 - ld1 {v2.8h, v3.8h}, [x4], #32 - ld1 {v4.8h, v5.8h}, [x10], #32 + ld1 {v0.8h, v1.8h}, [x9], #32 + ld1 {v2.8h, v3.8h}, [x10], #32 + ld1 {v4.8h, v5.8h}, [x2], #32 ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48 - ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48 - ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48 + ld1 {v19.4s, v20.4s, v21.4s}, [x8], #48 + ld1 {v22.4s, v23.4s, v24.4s}, [x1], #48 2: - subs x5, x5, #8 ext v25.16b, v0.16b, v1.16b, #2 // -stride ext v26.16b, v2.16b, v3.16b, #2 // 0 ext v27.16b, v4.16b, v5.16b, #2 // +stride ext v28.16b, v0.16b, v1.16b, #4 // +1-stride ext v29.16b, v2.16b, v3.16b, #4 // +1 - ext v30.16b, v4.16b, v5.16b, #4 // +1+stride add v2.8h, v2.8h, v25.8h // -1, -stride + ext v25.16b, v4.16b, v5.16b, #4 // +1+stride add v26.8h, v26.8h, v27.8h // 0, +stride add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride add v2.8h, v2.8h, v26.8h - add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride + add v4.8h, v4.8h, v25.8h // -1+stride, +1+stride add v2.8h, v2.8h, v29.8h // +1 add v0.8h, v0.8h, v4.8h @@ -85,7 +257,7 @@ function sgr_finish_filter1_\bpc\()bpc_neon, export=1 ext v27.16b, v16.16b, v17.16b, #8 // +1-stride ext v28.16b, v17.16b, v18.16b, #8 ext v29.16b, v19.16b, v20.16b, #4 // 0 - ext v30.16b, v20.16b, v21.16b, #4 + ext v4.16b, v20.16b, v21.16b, #4 mla v2.8h, v0.8h, v6.8h // * 3 -> a add v25.4s, v25.4s, v19.4s // -stride, -1 add v26.4s, v26.4s, v20.4s @@ -96,22 +268,22 @@ function sgr_finish_filter1_\bpc\()bpc_neon, export=1 add v16.4s, v16.4s, v22.4s // -1+stride add v17.4s, v17.4s, v23.4s add v29.4s, v29.4s, v27.4s // 0, +1 - add v30.4s, v30.4s, v28.4s + add v4.4s, v4.4s, 
v28.4s add v25.4s, v25.4s, v29.4s - add v26.4s, v26.4s, v30.4s + add v26.4s, v26.4s, v4.4s ext v27.16b, v22.16b, v23.16b, #4 // +stride ext v28.16b, v23.16b, v24.16b, #4 ext v29.16b, v22.16b, v23.16b, #8 // +1+stride - ext v30.16b, v23.16b, v24.16b, #8 + ext v4.16b, v23.16b, v24.16b, #8 .if \bpc == 8 - ld1 {v19.8b}, [x1], #8 // src + ld1 {v19.8b}, [x0] // src .else - ld1 {v19.8h}, [x1], #16 // src + ld1 {v19.8h}, [x0] // src .endif add v25.4s, v25.4s, v27.4s // +stride add v26.4s, v26.4s, v28.4s add v16.4s, v16.4s, v29.4s // +1+stride - add v17.4s, v17.4s, v30.4s + add v17.4s, v17.4s, v4.4s shl v25.4s, v25.4s, #2 shl v26.4s, v26.4s, #2 mla v25.4s, v16.4s, v7.4s // * 3 -> b @@ -125,61 +297,68 @@ function sgr_finish_filter1_\bpc\()bpc_neon, export=1 mov v2.16b, v3.16b rshrn v25.4h, v25.4s, #9 rshrn2 v25.8h, v26.4s, #9 + + subs w3, w3, #8 + + // weighted1 + shl v19.8h, v19.8h, #4 // u mov v4.16b, v5.16b - st1 {v25.8h}, [x0], #16 - b.le 3f + sub v25.8h, v25.8h, v19.8h // t1 - u + ld1 {v1.8h}, [x9], #16 + ushll v26.4s, v19.4h, #7 // u << 7 + ushll2 v27.4s, v19.8h, #7 // u << 7 + ld1 {v3.8h}, [x10], #16 + smlal v26.4s, v25.4h, v31.4h // v + smlal2 v27.4s, v25.8h, v31.8h // v + ld1 {v5.8h}, [x2], #16 +.if \bpc == 8 + rshrn v26.4h, v26.4s, #11 + rshrn2 v26.8h, v27.4s, #11 + mov v16.16b, v18.16b + sqxtun v26.8b, v26.8h + mov v19.16b, v21.16b + mov v22.16b, v24.16b + st1 {v26.8b}, [x0], #8 +.else + sqrshrun v26.4h, v26.4s, #11 + sqrshrun2 v26.8h, v27.4s, #11 mov v16.16b, v18.16b + umin v26.8h, v26.8h, v30.8h mov v19.16b, v21.16b mov v22.16b, v24.16b - ld1 {v1.8h}, [x9], #16 - ld1 {v3.8h}, [x4], #16 - ld1 {v5.8h}, [x10], #16 + st1 {v26.8h}, [x0], #16 +.endif + + b.le 3f ld1 {v17.4s, v18.4s}, [x7], #32 - ld1 {v20.4s, v21.4s}, [x3], #32 - ld1 {v23.4s, v24.4s}, [x8], #32 + ld1 {v20.4s, v21.4s}, [x8], #32 + ld1 {v23.4s, v24.4s}, [x1], #32 b 2b 3: - subs x6, x6, #1 - b.le 0f - mov x5, x13 - add x0, x0, x12, lsl #1 - add x1, x1, x2 - add x3, x3, x11, lsl #2 - add x7, x7, x11, lsl 
#2 - add x8, x8, x11, lsl #2 - add x4, x4, x14, lsl #1 - add x9, x9, x14, lsl #1 - add x10, x10, x14, lsl #1 - b 1b -0: ret endfunc -// void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp, -// const pixel *src, const ptrdiff_t stride, -// const int32_t *a, const int16_t *b, -// const int w, const int h); -function sgr_finish_filter2_\bpc\()bpc_neon, export=1 - add x7, x3, #(4*(SUM_STRIDE)) - sub x3, x3, #(4*(SUM_STRIDE)) - add x8, x4, #(2*(SUM_STRIDE)) - sub x4, x4, #(2*(SUM_STRIDE)) - mov x9, #(2*SUM_STRIDE) +// void dav1d_sgr_finish_filter2_2rows_Xbpc_neon(int16_t *tmp, +// const pixel *src, +// const ptrdiff_t stride, +// const int32_t **a, +// const int16_t **b, +// const int w, const int h); +function sgr_finish_filter2_2rows_\bpc\()bpc_neon, export=1 + stp d8, d9, [sp, #-0x40]! + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + + ldp x3, x7, [x3] + ldp x4, x8, [x4] mov x10, #FILTER_OUT_STRIDE - add x11, x5, #7 - bic x11, x11, #7 // Aligned width -.if \bpc == 8 - sub x2, x2, x11 -.else - sub x2, x2, x11, lsl #1 -.endif - sub x10, x10, x11 - sub x9, x9, x11 - sub x9, x9, #4 // We read 4 extra elements from a - sub x12, x9, #4 // We read 8 extra elements from b - mov x11, x5 + cmp w6, #1 + add x2, x1, x2 // src + stride + csel x2, x1, x2, le // if (h <= 1) x2 = x1 + add x10, x0, x10, lsl #1 movi v4.8h, #5 movi v5.4s, #5 movi v6.8h, #6 @@ -191,7 +370,6 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1 ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48 2: - subs x5, x5, #8 ext v24.16b, v0.16b, v1.16b, #4 // +1-stride ext v25.16b, v2.16b, v3.16b, #4 // +1+stride ext v22.16b, v0.16b, v1.16b, #2 // -stride @@ -201,6 +379,9 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1 add v2.8h, v22.8h, v23.8h // -stride, +stride add v0.8h, v0.8h, v25.8h + mul v8.8h, v25.8h, v4.8h // * 5 + mla v8.8h, v23.8h, v6.8h // * 6 + ext v22.16b, v16.16b, v17.16b, #4 // -stride ext v23.16b, v17.16b, v18.16b, #4 ext v24.16b, v19.16b, v20.16b, #4 // 
+stride @@ -213,8 +394,10 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1 mla v0.8h, v2.8h, v6.8h // * 6 .if \bpc == 8 ld1 {v31.8b}, [x1], #8 + ld1 {v30.8b}, [x2], #8 .else ld1 {v31.8h}, [x1], #16 + ld1 {v30.8h}, [x2], #16 .endif add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride add v17.4s, v17.4s, v27.4s @@ -223,6 +406,11 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1 add v16.4s, v16.4s, v19.4s add v17.4s, v17.4s, v20.4s + mul v9.4s, v19.4s, v5.4s // * 5 + mla v9.4s, v24.4s, v7.4s // * 6 + mul v10.4s, v20.4s, v5.4s // * 5 + mla v10.4s, v25.4s, v7.4s // * 6 + add v22.4s, v22.4s, v24.4s // -stride, +stride add v23.4s, v23.4s, v25.4s // This is, surprisingly, faster than other variants where the @@ -234,16 +422,23 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1 .if \bpc == 8 uxtl v31.8h, v31.8b + uxtl v30.8h, v30.8b .endif umlal v16.4s, v0.4h, v31.4h // b + a * src umlal2 v17.4s, v0.8h, v31.8h + umlal v9.4s, v8.4h, v30.4h // b + a * src + umlal2 v10.4s, v8.8h, v30.8h mov v0.16b, v1.16b rshrn v16.4h, v16.4s, #9 rshrn2 v16.8h, v17.4s, #9 + rshrn v9.4h, v9.4s, #8 + rshrn2 v9.8h, v10.4s, #8 + subs w5, w5, #8 mov v2.16b, v3.16b - st1 {v16.8h}, [x0], #16 + st1 {v16.8h}, [x0], #16 + st1 {v9.8h}, [x10], #16 - b.le 3f + b.le 9f mov v16.16b, v18.16b mov v19.16b, v21.16b ld1 {v1.8h}, [x4], #16 @@ -252,201 +447,160 @@ function sgr_finish_filter2_\bpc\()bpc_neon, export=1 ld1 {v20.4s, v21.4s}, [x7], #32 b 2b -3: - subs x6, x6, #1 - b.le 0f - mov x5, x11 - add x0, x0, x10, lsl #1 - add x1, x1, x2 - add x3, x3, x9, lsl #2 - add x7, x7, x9, lsl #2 - add x4, x4, x12, lsl #1 - add x8, x8, x12, lsl #1 - mov x13, x3 - mov x14, x4 +9: + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + ret +endfunc - ld1 {v0.8h, v1.8h}, [x4], #32 - ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48 +// void dav1d_sgr_finish_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride, +// const int32_t **a, +// const int16_t **b, +// 
const int w, const int h, +// const int w1, +// const int bitdepth_max); +function sgr_finish_weighted2_\bpc\()bpc_neon, export=1 + stp d8, d9, [sp, #-0x30]! + str d10, [sp, #0x10] + stp d14, d15, [sp, #0x20] + + dup v14.8h, w6 + dup v15.8h, w7 -4: - subs x5, x5, #8 - ext v23.16b, v0.16b, v1.16b, #4 // +1 - ext v22.16b, v0.16b, v1.16b, #2 // 0 - add v0.8h, v0.8h, v23.8h // -1, +1 + ldp x2, x7, [x2] + ldp x3, x8, [x3] + cmp w5, #1 + add x1, x0, x1 // src + stride + // if (h <= 1), set the pointer to the second row to any dummy buffer + // we can clobber (x2 in this case) + csel x1, x2, x1, le + movi v4.8h, #5 + movi v5.4s, #5 + movi v6.8h, #6 + movi v7.4s, #6 +1: + ld1 {v0.8h, v1.8h}, [x3], #32 + ld1 {v2.8h, v3.8h}, [x8], #32 + ld1 {v16.4s, v17.4s, v18.4s}, [x2], #48 + ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48 - ext v24.16b, v16.16b, v17.16b, #4 // 0 - ext v25.16b, v17.16b, v18.16b, #4 - ext v26.16b, v16.16b, v17.16b, #8 // +1 +2: + ext v24.16b, v0.16b, v1.16b, #4 // +1-stride + ext v25.16b, v2.16b, v3.16b, #4 // +1+stride + ext v22.16b, v0.16b, v1.16b, #2 // -stride + ext v23.16b, v2.16b, v3.16b, #2 // +stride + add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride + add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride + add v2.8h, v22.8h, v23.8h // -stride, +stride + add v0.8h, v0.8h, v25.8h + + mul v8.8h, v25.8h, v4.8h // * 5 + mla v8.8h, v23.8h, v6.8h // * 6 + + ext v22.16b, v16.16b, v17.16b, #4 // -stride + ext v23.16b, v17.16b, v18.16b, #4 + ext v24.16b, v19.16b, v20.16b, #4 // +stride + ext v25.16b, v20.16b, v21.16b, #4 + ext v26.16b, v16.16b, v17.16b, #8 // +1-stride ext v27.16b, v17.16b, v18.16b, #8 - mul v2.8h, v22.8h, v6.8h // * 6 - mla v2.8h, v0.8h, v4.8h // * 5 -> a + ext v28.16b, v19.16b, v20.16b, #8 // +1+stride + ext v29.16b, v20.16b, v21.16b, #8 + mul v0.8h, v0.8h, v4.8h // * 5 + mla v0.8h, v2.8h, v6.8h // * 6 .if \bpc == 8 - ld1 {v31.8b}, [x1], #8 + ld1 {v31.8b}, [x0] + ld1 {v30.8b}, [x1] .else - ld1 {v31.8h}, [x1], #16 + ld1 {v31.8h}, [x0] + ld1 {v30.8h}, 
[x1] .endif - add v16.4s, v16.4s, v26.4s // -1, +1 + add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride add v17.4s, v17.4s, v27.4s -.if \bpc == 8 - uxtl v31.8h, v31.8b -.endif + add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride + add v20.4s, v20.4s, v29.4s + add v16.4s, v16.4s, v19.4s + add v17.4s, v17.4s, v20.4s + + mul v9.4s, v19.4s, v5.4s // * 5 + mla v9.4s, v24.4s, v7.4s // * 6 + mul v10.4s, v20.4s, v5.4s // * 5 + mla v10.4s, v25.4s, v7.4s // * 6 + + add v22.4s, v22.4s, v24.4s // -stride, +stride + add v23.4s, v23.4s, v25.4s // This is, surprisingly, faster than other variants where the // mul+mla pairs are further apart, on Cortex A53. - mul v24.4s, v24.4s, v7.4s // * 6 - mla v24.4s, v16.4s, v5.4s // * 5 -> b - mul v25.4s, v25.4s, v7.4s // * 6 - mla v25.4s, v17.4s, v5.4s // * 5 -> b + mul v16.4s, v16.4s, v5.4s // * 5 + mla v16.4s, v22.4s, v7.4s // * 6 + mul v17.4s, v17.4s, v5.4s // * 5 + mla v17.4s, v23.4s, v7.4s // * 6 - umlal v24.4s, v2.4h, v31.4h // b + a * src - umlal2 v25.4s, v2.8h, v31.8h +.if \bpc == 8 + uxtl v31.8h, v31.8b + uxtl v30.8h, v30.8b +.endif + umlal v16.4s, v0.4h, v31.4h // b + a * src + umlal2 v17.4s, v0.8h, v31.8h + umlal v9.4s, v8.4h, v30.4h // b + a * src + umlal2 v10.4s, v8.8h, v30.8h mov v0.16b, v1.16b - rshrn v24.4h, v24.4s, #8 - rshrn2 v24.8h, v25.4s, #8 - mov v16.16b, v18.16b - st1 {v24.8h}, [x0], #16 + rshrn v16.4h, v16.4s, #9 + rshrn2 v16.8h, v17.4s, #9 + rshrn v9.4h, v9.4s, #8 + rshrn2 v9.8h, v10.4s, #8 - b.le 5f - ld1 {v1.8h}, [x4], #16 - ld1 {v17.4s, v18.4s}, [x3], #32 - b 4b - -5: - subs x6, x6, #1 - b.le 0f - mov x5, x11 - add x0, x0, x10, lsl #1 - add x1, x1, x2 - mov x3, x13 // Rewind x3/x4 to where they started - mov x4, x14 - b 1b -0: - ret -endfunc + subs w4, w4, #8 -// void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride, -// const pixel *src, const ptrdiff_t src_stride, -// const int16_t *t1, const int w, const int h, -// const int wt, const int bitdepth_max); -function 
sgr_weighted1_\bpc\()bpc_neon, export=1 -.if \bpc == 16 - ldr w8, [sp] -.endif - dup v31.8h, w7 - cmp x6, #2 -.if \bpc == 16 - dup v30.8h, w8 -.endif - add x9, x0, x1 - add x10, x2, x3 - add x11, x4, #2*FILTER_OUT_STRIDE - mov x7, #(4*FILTER_OUT_STRIDE) - lsl x1, x1, #1 - lsl x3, x3, #1 - add x8, x5, #7 - bic x8, x8, #7 // Aligned width -.if \bpc == 8 - sub x1, x1, x8 - sub x3, x3, x8 -.else - sub x1, x1, x8, lsl #1 - sub x3, x3, x8, lsl #1 -.endif - sub x7, x7, x8, lsl #1 - mov x8, x5 - b.lt 2f -1: -.if \bpc == 8 - ld1 {v0.8b}, [x2], #8 - ld1 {v4.8b}, [x10], #8 -.else - ld1 {v0.8h}, [x2], #16 - ld1 {v4.8h}, [x10], #16 -.endif - ld1 {v1.8h}, [x4], #16 - ld1 {v5.8h}, [x11], #16 - subs x5, x5, #8 -.if \bpc == 8 - ushll v0.8h, v0.8b, #4 // u - ushll v4.8h, v4.8b, #4 // u -.else - shl v0.8h, v0.8h, #4 // u - shl v4.8h, v4.8h, #4 // u -.endif - sub v1.8h, v1.8h, v0.8h // t1 - u - sub v5.8h, v5.8h, v4.8h // t1 - u - ushll v2.4s, v0.4h, #7 // u << 7 - ushll2 v3.4s, v0.8h, #7 // u << 7 - ushll v6.4s, v4.4h, #7 // u << 7 - ushll2 v7.4s, v4.8h, #7 // u << 7 - smlal v2.4s, v1.4h, v31.4h // v - smlal2 v3.4s, v1.8h, v31.8h // v - smlal v6.4s, v5.4h, v31.4h // v - smlal2 v7.4s, v5.8h, v31.8h // v + // weighted1 + shl v31.8h, v31.8h, #4 // u + shl v30.8h, v30.8h, #4 + mov v2.16b, v3.16b + + sub v16.8h, v16.8h, v31.8h // t1 - u + sub v9.8h, v9.8h, v30.8h + ld1 {v1.8h}, [x3], #16 + ushll v22.4s, v31.4h, #7 // u << 7 + ushll2 v23.4s, v31.8h, #7 + ushll v24.4s, v30.4h, #7 + ushll2 v25.4s, v30.8h, #7 + ld1 {v3.8h}, [x8], #16 + smlal v22.4s, v16.4h, v14.4h // v + smlal2 v23.4s, v16.8h, v14.8h + mov v16.16b, v18.16b + smlal v24.4s, v9.4h, v14.4h + smlal2 v25.4s, v9.8h, v14.8h + mov v19.16b, v21.16b .if \bpc == 8 - rshrn v2.4h, v2.4s, #11 - rshrn2 v2.8h, v3.4s, #11 - rshrn v6.4h, v6.4s, #11 - rshrn2 v6.8h, v7.4s, #11 - sqxtun v2.8b, v2.8h - sqxtun v6.8b, v6.8h - st1 {v2.8b}, [x0], #8 - st1 {v6.8b}, [x9], #8 + rshrn v22.4h, v22.4s, #11 + rshrn2 v22.8h, v23.4s, #11 + rshrn v23.4h, v24.4s, 
#11 + rshrn2 v23.8h, v25.4s, #11 + sqxtun v22.8b, v22.8h + sqxtun v23.8b, v23.8h + st1 {v22.8b}, [x0], #8 + st1 {v23.8b}, [x1], #8 .else - sqrshrun v2.4h, v2.4s, #11 - sqrshrun2 v2.8h, v3.4s, #11 - sqrshrun v6.4h, v6.4s, #11 - sqrshrun2 v6.8h, v7.4s, #11 - umin v2.8h, v2.8h, v30.8h - umin v6.8h, v6.8h, v30.8h - st1 {v2.8h}, [x0], #16 - st1 {v6.8h}, [x9], #16 + sqrshrun v22.4h, v22.4s, #11 + sqrshrun2 v22.8h, v23.4s, #11 + sqrshrun v23.4h, v24.4s, #11 + sqrshrun2 v23.8h, v25.4s, #11 + umin v22.8h, v22.8h, v15.8h + umin v23.8h, v23.8h, v15.8h + st1 {v22.8h}, [x0], #16 + st1 {v23.8h}, [x1], #16 .endif - b.gt 1b - sub x6, x6, #2 - cmp x6, #1 - b.lt 0f - mov x5, x8 - add x0, x0, x1 - add x9, x9, x1 - add x2, x2, x3 - add x10, x10, x3 - add x4, x4, x7 - add x11, x11, x7 - b.eq 2f - b 1b + b.le 3f + ld1 {v17.4s, v18.4s}, [x2], #32 + ld1 {v20.4s, v21.4s}, [x7], #32 + b 2b -2: -.if \bpc == 8 - ld1 {v0.8b}, [x2], #8 -.else - ld1 {v0.8h}, [x2], #16 -.endif - ld1 {v1.8h}, [x4], #16 - subs x5, x5, #8 -.if \bpc == 8 - ushll v0.8h, v0.8b, #4 // u -.else - shl v0.8h, v0.8h, #4 // u -.endif - sub v1.8h, v1.8h, v0.8h // t1 - u - ushll v2.4s, v0.4h, #7 // u << 7 - ushll2 v3.4s, v0.8h, #7 // u << 7 - smlal v2.4s, v1.4h, v31.4h // v - smlal2 v3.4s, v1.8h, v31.8h // v -.if \bpc == 8 - rshrn v2.4h, v2.4s, #11 - rshrn2 v2.8h, v3.4s, #11 - sqxtun v2.8b, v2.8h - st1 {v2.8b}, [x0], #8 -.else - sqrshrun v2.4h, v2.4s, #11 - sqrshrun2 v2.8h, v3.4s, #11 - umin v2.8h, v2.8h, v30.8h - st1 {v2.8h}, [x0], #16 -.endif - b.gt 2b -0: +3: + ldp d14, d15, [sp, #0x20] + ldr d10, [sp, #0x10] + ldp d8, d9, [sp], 0x30 ret endfunc @@ -461,7 +615,7 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1 .else ldp x8, x9, [sp] .endif - cmp x7, #2 + cmp w7, #2 add x10, x0, x1 add x11, x2, x3 add x12, x4, #2*FILTER_OUT_STRIDE @@ -483,7 +637,7 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1 sub x3, x3, x9, lsl #1 .endif sub x8, x8, x9, lsl #1 - mov x9, x6 + mov w9, w6 b.lt 2f 1: .if \bpc == 8 @@ -497,7 +651,7 @@ 
function sgr_weighted2_\bpc\()bpc_neon, export=1 ld1 {v17.8h}, [x12], #16 ld1 {v2.8h}, [x5], #16 ld1 {v18.8h}, [x13], #16 - subs x6, x6, #8 + subs w6, w6, #8 .if \bpc == 8 ushll v0.8h, v0.8b, #4 // u ushll v16.8h, v16.8b, #4 // u @@ -542,10 +696,10 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1 .endif b.gt 1b - subs x7, x7, #2 - cmp x7, #1 + subs w7, w7, #2 + cmp w7, #1 b.lt 0f - mov x6, x9 + mov w6, w9 add x0, x0, x1 add x10, x10, x1 add x2, x2, x3 @@ -565,7 +719,7 @@ function sgr_weighted2_\bpc\()bpc_neon, export=1 .endif ld1 {v1.8h}, [x4], #16 ld1 {v2.8h}, [x5], #16 - subs x6, x6, #8 + subs w6, w6, #8 .if \bpc == 8 ushll v0.8h, v0.8b, #4 // u .else diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/refmvs.S b/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/refmvs.S index becd4c08f..e905682f4 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/refmvs.S +++ b/prog/3rdPartyLibs/codecs/dav1d/src/arm/64/refmvs.S @@ -89,3 +89,204 @@ L(splat_tbl): .hword L(splat_tbl) - 20b .hword L(splat_tbl) - 10b endfunc + +const mv_tbls, align=4 + .byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 + .byte 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0 + .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4 + .byte 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4 +endconst + +const mask_mult, align=4 + .byte 1, 2, 1, 2, 0, 0, 0, 0 +endconst + +// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride, +// refmvs_block **rr, const uint8_t *ref_sign, +// int col_end8, int row_end8, +// int col_start8, int row_start8) +function save_tmvs_neon, export=1 + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + + movi v30.8b, #0 + ld1 {v31.8b}, [x3] + adr x8, L(save_tmvs_tbl) + movrel x16, mask_mult + movrel x13, mv_tbls + ld1 {v29.8b}, [x16] + ext v31.8b, v30.8b, v31.8b, #7 // [0, ref_sign] + mov w15, #5 + mov w14, #12*2 + sxtw x4, w4 + sxtw x6, w6 + mul w1, w1, w15 // stride *= 5 + sub w5, w5, w7 // h = row_end8 - row_start8 + lsl w7, w7, #1 // row_start8 <<= 1 +1: + mov w15, #5 + and w9, w7, #30 // (y & 15) * 2 + ldr x9, [x2, w9, uxtw #3] // b = rr[(y & 15) * 2] + add x9, x9, #12 // &b[... + 1] + madd x10, x4, x14, x9 // end_cand_b = &b[col_end8*2 + 1] + madd x9, x6, x14, x9 // cand_b = &b[x*2 + 1] + + madd x3, x6, x15, x0 // &rp[x] + +2: + ldrb w11, [x9, #10] // cand_b->bs + ld1 {v0.16b}, [x9] // cand_b->mv + add x11, x8, w11, uxtw #2 + ldr h1, [x9, #8] // cand_b->ref + ldrh w12, [x11] // bw8 + mov x15, x8 + add x9, x9, w12, uxtw #1 // cand_b += bw8*2 + cmp x9, x10 + mov v2.8b, v0.8b + b.ge 3f + + ldrb w15, [x9, #10] // cand_b->bs + add x16, x9, #8 + ld1 {v4.16b}, [x9] // cand_b->mv + add x15, x8, w15, uxtw #2 + ld1 {v1.h}[1], [x16] // cand_b->ref + ldrh w12, [x15] // bw8 + add x9, x9, w12, uxtw #1 // cand_b += bw8*2 + trn1 v2.2d, v0.2d, v4.2d + +3: + abs v2.8h, v2.8h // abs(mv[].xy) + tbl v1.8b, {v31.16b}, v1.8b // ref_sign[ref] + ushr v2.8h, v2.8h, #12 // abs(mv[].xy) >> 12 + umull v1.8h, v1.8b, v29.8b // ref_sign[ref] * {1, 2} + cmeq v2.4s, v2.4s, #0 // abs(mv[].xy) <= 4096 + xtn v2.4h, v2.4s // abs() condition to 16 bit + and v1.8b, v1.8b, v2.8b // h[0-3] contains conditions for mv[0-1] + addp v1.4h, v1.4h, v1.4h // Combine condition for [1] and [0] + umov w16, v1.h[0] // Extract case for first block + umov w17, v1.h[1] + ldrh w11, [x11, #2] // Fetch jump table entry + ldrh w15, [x15, #2] + ldr q1, [x13, w16, uxtw #4] // Load permutation table base on case + ldr q5, [x13, w17, uxtw #4] + sub x11, x8, w11, uxtw // Find jump table target + sub x15, x8, w15, uxtw + tbl v0.16b, {v0.16b}, v1.16b // Permute cand_b to output refmvs_temporal_block + tbl 
v4.16b, {v4.16b}, v5.16b + + // v1 follows on v0, with another 3 full repetitions of the pattern. + ext v1.16b, v0.16b, v0.16b, #1 + ext v5.16b, v4.16b, v4.16b, #1 + // v2 ends with 3 complete repetitions of the pattern. + ext v2.16b, v0.16b, v1.16b, #4 + ext v6.16b, v4.16b, v5.16b, #4 + + blr x11 + b.ge 4f // if (cand_b >= end) + mov v0.16b, v4.16b + mov v1.16b, v5.16b + mov v2.16b, v6.16b + cmp x9, x10 + blr x15 + b.lt 2b // if (cand_b < end) + +4: + subs w5, w5, #1 // h-- + add w7, w7, #2 // y += 2 + add x0, x0, x1 // rp += stride + b.gt 1b + + ldp x29, x30, [sp], #16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +10: + AARCH64_VALID_CALL_TARGET + add x16, x3, #4 + st1 {v0.s}[0], [x3] + st1 {v0.b}[4], [x16] + add x3, x3, #5 + ret +20: + AARCH64_VALID_CALL_TARGET + add x16, x3, #8 + st1 {v0.d}[0], [x3] + st1 {v0.h}[4], [x16] + add x3, x3, #2*5 + ret +40: + AARCH64_VALID_CALL_TARGET + st1 {v0.16b}, [x3] + str s1, [x3, #16] + add x3, x3, #4*5 + ret +80: + AARCH64_VALID_CALL_TARGET + // This writes 6 full entries plus 2 extra bytes + st1 {v0.16b, v1.16b}, [x3] + // Write the last few, overlapping with the first write. 
+ stur q2, [x3, #(8*5-16)] + add x3, x3, #8*5 + ret +160: + AARCH64_VALID_CALL_TARGET + add x16, x3, #6*5 + add x17, x3, #12*5 + // This writes 6 full entries plus 2 extra bytes + st1 {v0.16b, v1.16b}, [x3] + // Write another 6 full entries, slightly overlapping with the first set + st1 {v0.16b, v1.16b}, [x16] + // Write 8 bytes (one full entry) after the first 12 + st1 {v0.8b}, [x17] + // Write the last 3 entries + str q2, [x3, #(16*5-16)] + add x3, x3, #16*5 + ret + +L(save_tmvs_tbl): + .hword 16 * 12 + .hword L(save_tmvs_tbl) - 160b + .hword 16 * 12 + .hword L(save_tmvs_tbl) - 160b + .hword 8 * 12 + .hword L(save_tmvs_tbl) - 80b + .hword 8 * 12 + .hword L(save_tmvs_tbl) - 80b + .hword 8 * 12 + .hword L(save_tmvs_tbl) - 80b + .hword 8 * 12 + .hword L(save_tmvs_tbl) - 80b + .hword 4 * 12 + .hword L(save_tmvs_tbl) - 40b + .hword 4 * 12 + .hword L(save_tmvs_tbl) - 40b + .hword 4 * 12 + .hword L(save_tmvs_tbl) - 40b + .hword 4 * 12 + .hword L(save_tmvs_tbl) - 40b + .hword 2 * 12 + .hword L(save_tmvs_tbl) - 20b + .hword 2 * 12 + .hword L(save_tmvs_tbl) - 20b + .hword 2 * 12 + .hword L(save_tmvs_tbl) - 20b + .hword 2 * 12 + .hword L(save_tmvs_tbl) - 20b + .hword 2 * 12 + .hword L(save_tmvs_tbl) - 20b + .hword 1 * 12 + .hword L(save_tmvs_tbl) - 10b + .hword 1 * 12 + .hword L(save_tmvs_tbl) - 10b + .hword 1 * 12 + .hword L(save_tmvs_tbl) - 10b + .hword 1 * 12 + .hword L(save_tmvs_tbl) - 10b + .hword 1 * 12 + .hword L(save_tmvs_tbl) - 10b + .hword 1 * 12 + .hword L(save_tmvs_tbl) - 10b + .hword 1 * 12 + .hword L(save_tmvs_tbl) - 10b +endfunc diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/arm/filmgrain.h b/prog/3rdPartyLibs/codecs/dav1d/src/arm/filmgrain.h index 48776ac85..9f51b0310 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/arm/filmgrain.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/arm/filmgrain.h @@ -91,8 +91,8 @@ static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row, int offsets[2 /* col offset */][2 /* row offset */]; - // process this row 
in BLOCK_SIZE^2 blocks - for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) { + // process this row in FG_BLOCK_SIZE^2 blocks + for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE) { if (data->overlap_flag && bx) { // shift previous offsets left @@ -155,8 +155,8 @@ fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \ \ int offsets[2 /* col offset */][2 /* row offset */]; \ \ - /* process this row in BLOCK_SIZE^2 blocks (subsampled) */ \ - for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { \ + /* process this row in FG_BLOCK_SIZE^2 blocks (subsampled) */ \ + for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE >> sx) { \ if (data->overlap_flag && bx) { \ /* shift previous offsets left */ \ for (int i = 0; i < rows; i++) \ diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/arm/ipred.h b/prog/3rdPartyLibs/codecs/dav1d/src/arm/ipred.h index aef4daebb..9c2aae748 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/arm/ipred.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/arm/ipred.h @@ -50,6 +50,247 @@ decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon)); decl_pal_pred_fn(BF(dav1d_pal_pred, neon)); +#if ARCH_AARCH64 +void BF(dav1d_ipred_z1_upsample_edge, neon)(pixel *out, const int hsz, + const pixel *const in, + const int end HIGHBD_DECL_SUFFIX); +void BF(dav1d_ipred_z1_filter_edge, neon)(pixel *out, const int sz, + const pixel *const in, + const int end, const int strength); +void BF(dav1d_ipred_pixel_set, neon)(pixel *out, const pixel px, + const int n); +void BF(dav1d_ipred_z1_fill1, neon)(pixel *dst, ptrdiff_t stride, + const pixel *const top, const int width, + const int height, const int dx, + const int max_base_x); +void BF(dav1d_ipred_z1_fill2, neon)(pixel *dst, ptrdiff_t stride, + const pixel *const top, const int width, + const int height, const int dx, + const int max_base_x); + +static void ipred_z1_neon(pixel *dst, const ptrdiff_t stride, + const pixel *const topleft_in, + const int width, const int height, int angle, + const int max_width, const 
int max_height + HIGHBD_DECL_SUFFIX) +{ + const int is_sm = (angle >> 9) & 0x1; + const int enable_intra_edge_filter = angle >> 10; + angle &= 511; + int dx = dav1d_dr_intra_derivative[angle >> 1]; + pixel top_out[64 + 64 + (64+15)*2 + 16]; + int max_base_x; + const int upsample_above = enable_intra_edge_filter ? + get_upsample(width + height, 90 - angle, is_sm) : 0; + if (upsample_above) { + BF(dav1d_ipred_z1_upsample_edge, neon)(top_out, width + height, + topleft_in, + width + imin(width, height) + HIGHBD_TAIL_SUFFIX); + max_base_x = 2 * (width + height) - 2; + dx <<= 1; + } else { + const int filter_strength = enable_intra_edge_filter ? + get_filter_strength(width + height, 90 - angle, is_sm) : 0; + if (filter_strength) { + BF(dav1d_ipred_z1_filter_edge, neon)(top_out, width + height, + topleft_in, + width + imin(width, height), + filter_strength); + max_base_x = width + height - 1; + } else { + max_base_x = width + imin(width, height) - 1; + memcpy(top_out, &topleft_in[1], (max_base_x + 1) * sizeof(pixel)); + } + } + const int base_inc = 1 + upsample_above; + int pad_pixels = width + 15; // max(dx >> 6) == 15 + BF(dav1d_ipred_pixel_set, neon)(&top_out[max_base_x + 1], + top_out[max_base_x], pad_pixels * base_inc); + if (upsample_above) + BF(dav1d_ipred_z1_fill2, neon)(dst, stride, top_out, width, height, + dx, max_base_x); + else + BF(dav1d_ipred_z1_fill1, neon)(dst, stride, top_out, width, height, + dx, max_base_x); +} + +void BF(dav1d_ipred_reverse, neon)(pixel *dst, const pixel *const src, + const int n); + +void BF(dav1d_ipred_z2_upsample_edge, neon)(pixel *out, const int sz, + const pixel *const in + HIGHBD_DECL_SUFFIX); + +void BF(dav1d_ipred_z2_fill1, neon)(pixel *dst, ptrdiff_t stride, + const pixel *const top, + const pixel *const left, + const int width, const int height, + const int dx, const int dy); +void BF(dav1d_ipred_z2_fill2, neon)(pixel *dst, ptrdiff_t stride, + const pixel *const top, + const pixel *const left, + const int width, const int 
height, + const int dx, const int dy); +void BF(dav1d_ipred_z2_fill3, neon)(pixel *dst, ptrdiff_t stride, + const pixel *const top, + const pixel *const left, + const int width, const int height, + const int dx, const int dy); + +static void ipred_z2_neon(pixel *dst, const ptrdiff_t stride, + const pixel *const topleft_in, + const int width, const int height, int angle, + const int max_width, const int max_height + HIGHBD_DECL_SUFFIX) +{ + const int is_sm = (angle >> 9) & 0x1; + const int enable_intra_edge_filter = angle >> 10; + angle &= 511; + assert(angle > 90 && angle < 180); + int dy = dav1d_dr_intra_derivative[(angle - 90) >> 1]; + int dx = dav1d_dr_intra_derivative[(180 - angle) >> 1]; + const int upsample_left = enable_intra_edge_filter ? + get_upsample(width + height, 180 - angle, is_sm) : 0; + const int upsample_above = enable_intra_edge_filter ? + get_upsample(width + height, angle - 90, is_sm) : 0; + pixel buf[3*(64+1)]; + pixel *left = &buf[2*(64+1)]; + // The asm can underread below the start of top[] and left[]; to avoid + // surprising behaviour, make sure this is within the allocated stack space. + pixel *top = &buf[1*(64+1)]; + pixel *flipped = &buf[0*(64+1)]; + + if (upsample_above) { + BF(dav1d_ipred_z2_upsample_edge, neon)(top, width, topleft_in + HIGHBD_TAIL_SUFFIX); + dx <<= 1; + } else { + const int filter_strength = enable_intra_edge_filter ? 
+ get_filter_strength(width + height, angle - 90, is_sm) : 0; + + if (filter_strength) { + BF(dav1d_ipred_z1_filter_edge, neon)(&top[1], imin(max_width, width), + topleft_in, width, + filter_strength); + if (max_width < width) + memcpy(&top[1 + max_width], &topleft_in[1 + max_width], + (width - max_width) * sizeof(pixel)); + } else { + pixel_copy(&top[1], &topleft_in[1], width); + } + } + if (upsample_left) { + flipped[0] = topleft_in[0]; + BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0], + height); + BF(dav1d_ipred_z2_upsample_edge, neon)(left, height, flipped + HIGHBD_TAIL_SUFFIX); + dy <<= 1; + } else { + const int filter_strength = enable_intra_edge_filter ? + get_filter_strength(width + height, 180 - angle, is_sm) : 0; + + if (filter_strength) { + flipped[0] = topleft_in[0]; + BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0], + height); + BF(dav1d_ipred_z1_filter_edge, neon)(&left[1], imin(max_height, height), + flipped, height, + filter_strength); + if (max_height < height) + memcpy(&left[1 + max_height], &flipped[1 + max_height], + (height - max_height) * sizeof(pixel)); + } else { + BF(dav1d_ipred_reverse, neon)(&left[1], &topleft_in[0], + height); + } + } + top[0] = left[0] = *topleft_in; + + assert(!(upsample_above && upsample_left)); + if (!upsample_above && !upsample_left) { + BF(dav1d_ipred_z2_fill1, neon)(dst, stride, top, left, width, height, + dx, dy); + } else if (upsample_above) { + BF(dav1d_ipred_z2_fill2, neon)(dst, stride, top, left, width, height, + dx, dy); + } else /*if (upsample_left)*/ { + BF(dav1d_ipred_z2_fill3, neon)(dst, stride, top, left, width, height, + dx, dy); + } +} + +void BF(dav1d_ipred_z3_fill1, neon)(pixel *dst, ptrdiff_t stride, + const pixel *const left, const int width, + const int height, const int dy, + const int max_base_y); +void BF(dav1d_ipred_z3_fill2, neon)(pixel *dst, ptrdiff_t stride, + const pixel *const left, const int width, + const int height, const int dy, + const int max_base_y); + +static 
void ipred_z3_neon(pixel *dst, const ptrdiff_t stride, + const pixel *const topleft_in, + const int width, const int height, int angle, + const int max_width, const int max_height + HIGHBD_DECL_SUFFIX) +{ + const int is_sm = (angle >> 9) & 0x1; + const int enable_intra_edge_filter = angle >> 10; + angle &= 511; + assert(angle > 180); + int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1]; + pixel flipped[64 + 64 + 16]; + pixel left_out[64 + 64 + (64+15)*2]; + int max_base_y; + const int upsample_left = enable_intra_edge_filter ? + get_upsample(width + height, angle - 180, is_sm) : 0; + if (upsample_left) { + flipped[0] = topleft_in[0]; + BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0], + height + imax(width, height)); + BF(dav1d_ipred_z1_upsample_edge, neon)(left_out, width + height, + flipped, + height + imin(width, height) + HIGHBD_TAIL_SUFFIX); + max_base_y = 2 * (width + height) - 2; + dy <<= 1; + } else { + const int filter_strength = enable_intra_edge_filter ? + get_filter_strength(width + height, angle - 180, is_sm) : 0; + + if (filter_strength) { + flipped[0] = topleft_in[0]; + BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0], + height + imax(width, height)); + BF(dav1d_ipred_z1_filter_edge, neon)(left_out, width + height, + flipped, + height + imin(width, height), + filter_strength); + max_base_y = width + height - 1; + } else { + BF(dav1d_ipred_reverse, neon)(left_out, &topleft_in[0], + height + imin(width, height)); + max_base_y = height + imin(width, height) - 1; + } + } + const int base_inc = 1 + upsample_left; + // The tbx based implementation needs left[] to have 64 bytes intitialized, + // the other implementation can read height + max(dy >> 6) past the end. 
+ int pad_pixels = imax(64 - max_base_y - 1, height + 15); + + BF(dav1d_ipred_pixel_set, neon)(&left_out[max_base_y + 1], + left_out[max_base_y], pad_pixels * base_inc); + if (upsample_left) + BF(dav1d_ipred_z3_fill2, neon)(dst, stride, left_out, width, height, + dy, max_base_y); + else + BF(dav1d_ipred_z3_fill1, neon)(dst, stride, left_out, width, height, + dy, max_base_y); +} +#endif + static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); @@ -65,6 +306,11 @@ static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *cons c->intra_pred[SMOOTH_PRED] = BF(dav1d_ipred_smooth, neon); c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon); c->intra_pred[SMOOTH_H_PRED] = BF(dav1d_ipred_smooth_h, neon); +#if ARCH_AARCH64 + c->intra_pred[Z1_PRED] = ipred_z1_neon; + c->intra_pred[Z2_PRED] = ipred_z2_neon; + c->intra_pred[Z3_PRED] = ipred_z3_neon; +#endif c->intra_pred[FILTER_PRED] = BF(dav1d_ipred_filter, neon); c->cfl_pred[DC_PRED] = BF(dav1d_ipred_cfl, neon); diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/arm/looprestoration.h b/prog/3rdPartyLibs/codecs/dav1d/src/arm/looprestoration.h index 7993dbff6..1ac6d5fb5 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/arm/looprestoration.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/arm/looprestoration.h @@ -105,6 +105,7 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride, } #endif +#if ARCH_ARM void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum, const pixel (*left)[4], const pixel *src, const ptrdiff_t stride, @@ -246,6 +247,853 @@ static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride, tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX); } +#else +static void rotate(int32_t **sumsq_ptrs, int16_t **sum_ptrs, int n) { + int32_t *tmp32 = sumsq_ptrs[0]; + int16_t *tmp16 = sum_ptrs[0]; + for (int i = 0; i < n - 1; i++) { + sumsq_ptrs[i] = sumsq_ptrs[i+1]; + sum_ptrs[i] = sum_ptrs[i+1]; + } + 
sumsq_ptrs[n - 1] = tmp32; + sum_ptrs[n - 1] = tmp16; +} +static void rotate5_x2(int32_t **sumsq_ptrs, int16_t **sum_ptrs) { + int32_t *tmp32[2]; + int16_t *tmp16[2]; + for (int i = 0; i < 2; i++) { + tmp32[i] = sumsq_ptrs[i]; + tmp16[i] = sum_ptrs[i]; + } + for (int i = 0; i < 3; i++) { + sumsq_ptrs[i] = sumsq_ptrs[i+2]; + sum_ptrs[i] = sum_ptrs[i+2]; + } + for (int i = 0; i < 2; i++) { + sumsq_ptrs[3 + i] = tmp32[i]; + sum_ptrs[3 + i] = tmp16[i]; + } +} + +static void rotate_ab_3(int32_t **A_ptrs, int16_t **B_ptrs) { + rotate(A_ptrs, B_ptrs, 3); +} + +static void rotate_ab_2(int32_t **A_ptrs, int16_t **B_ptrs) { + rotate(A_ptrs, B_ptrs, 2); +} + +static void rotate_ab_4(int32_t **A_ptrs, int16_t **B_ptrs) { + rotate(A_ptrs, B_ptrs, 4); +} + +void BF(dav1d_sgr_box3_row_h, neon)(int32_t *sumsq, int16_t *sum, + const pixel (*left)[4], + const pixel *src, const int w, + const enum LrEdgeFlags edges); +void BF(dav1d_sgr_box5_row_h, neon)(int32_t *sumsq, int16_t *sum, + const pixel (*left)[4], + const pixel *src, const int w, + const enum LrEdgeFlags edges); +void BF(dav1d_sgr_box35_row_h, neon)(int32_t *sumsq3, int16_t *sum3, + int32_t *sumsq5, int16_t *sum5, + const pixel (*left)[4], + const pixel *src, const int w, + const enum LrEdgeFlags edges); + +void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum, + int32_t *AA, int16_t *BB, + const int w, const int s, + const int bitdepth_max); +void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum, + int32_t *AA, int16_t *BB, + const int w, const int s, + const int bitdepth_max); + +void BF(dav1d_sgr_finish_weighted1, neon)(pixel *dst, + int32_t **A_ptrs, int16_t **B_ptrs, + const int w, const int w1 + HIGHBD_DECL_SUFFIX); +void BF(dav1d_sgr_finish_weighted2, neon)(pixel *dst, const ptrdiff_t stride, + int32_t **A_ptrs, int16_t **B_ptrs, + const int w, const int h, + const int w1 HIGHBD_DECL_SUFFIX); + +void BF(dav1d_sgr_finish_filter1_2rows, neon)(int16_t *tmp, const pixel *src, + const ptrdiff_t src_stride, 
+ int32_t **A_ptrs, + int16_t **B_ptrs, + const int w, const int h); +void BF(dav1d_sgr_finish_filter2_2rows, neon)(int16_t *tmp, const pixel *src, + const ptrdiff_t src_stride, + int32_t **A_ptrs, int16_t **B_ptrs, + const int w, const int h); +void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride, + const pixel *src, const ptrdiff_t src_stride, + const int16_t *t1, const int16_t *t2, + const int w, const int h, + const int16_t wt[2] HIGHBD_DECL_SUFFIX); + +static void sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum, + int32_t *sumsq_out, int16_t *sum_out, + const int w, int s, int bitdepth_max) { + // box3_v + calc_ab1 + dav1d_sgr_box3_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max); + rotate(sumsq, sum, 3); +} + +static void sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum, + int32_t *sumsq_out, int16_t *sum_out, + const int w, int s, int bitdepth_max) { + // box5_v + calc_ab2 + dav1d_sgr_box5_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max); + rotate5_x2(sumsq, sum); +} + +static void sgr_box3_hv_neon(int32_t **sumsq, int16_t **sum, + int32_t *AA, int16_t *BB, + const pixel (*left)[4], + const pixel *src, const int w, + const int s, + const enum LrEdgeFlags edges, + const int bitdepth_max) { + BF(dav1d_sgr_box3_row_h, neon)(sumsq[2], sum[2], left, src, w, edges); + sgr_box3_vert_neon(sumsq, sum, AA, BB, w, s, bitdepth_max); +} + + +static void sgr_finish1_neon(pixel **dst, const ptrdiff_t stride, + int32_t **A_ptrs, int16_t **B_ptrs, const int w, + const int w1 HIGHBD_DECL_SUFFIX) { + BF(dav1d_sgr_finish_weighted1, neon)(*dst, A_ptrs, B_ptrs, + w, w1 HIGHBD_TAIL_SUFFIX); + *dst += PXSTRIDE(stride); + rotate_ab_3(A_ptrs, B_ptrs); +} + +static void sgr_finish2_neon(pixel **dst, const ptrdiff_t stride, + int32_t **A_ptrs, int16_t **B_ptrs, + const int w, const int h, const int w1 + HIGHBD_DECL_SUFFIX) { + BF(dav1d_sgr_finish_weighted2, neon)(*dst, stride, A_ptrs, B_ptrs, + w, h, w1 HIGHBD_TAIL_SUFFIX); + *dst += 
2*PXSTRIDE(stride); + rotate_ab_2(A_ptrs, B_ptrs); +} + +static void sgr_finish_mix_neon(pixel **dst, const ptrdiff_t stride, + int32_t **A5_ptrs, int16_t **B5_ptrs, + int32_t **A3_ptrs, int16_t **B3_ptrs, + const int w, const int h, + const int w0, const int w1 HIGHBD_DECL_SUFFIX) { +#define FILTER_OUT_STRIDE 384 + ALIGN_STK_16(int16_t, tmp5, 2*FILTER_OUT_STRIDE,); + ALIGN_STK_16(int16_t, tmp3, 2*FILTER_OUT_STRIDE,); + + BF(dav1d_sgr_finish_filter2_2rows, neon)(tmp5, *dst, stride, + A5_ptrs, B5_ptrs, w, h); + BF(dav1d_sgr_finish_filter1_2rows, neon)(tmp3, *dst, stride, + A3_ptrs, B3_ptrs, w, h); + const int16_t wt[2] = { w0, w1 }; + BF(dav1d_sgr_weighted2, neon)(*dst, stride, *dst, stride, + tmp5, tmp3, w, h, wt HIGHBD_TAIL_SUFFIX); + *dst += h*PXSTRIDE(stride); + rotate_ab_2(A5_ptrs, B5_ptrs); + rotate_ab_4(A3_ptrs, B3_ptrs); +} + + +static void sgr_filter_3x3_neon(pixel *dst, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, + const int w, int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ +#define BUF_STRIDE (384 + 16) + ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,); + ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 3 + 16,); + int32_t *sumsq_ptrs[3], *sumsq_rows[3]; + int16_t *sum_ptrs[3], *sum_rows[3]; + for (int i = 0; i < 3; i++) { + sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE]; + sum_rows[i] = &sum_buf[i * BUF_STRIDE]; + } + + ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,); + ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 3 + 16,); + int32_t *A_ptrs[3]; + int16_t *B_ptrs[3]; + for (int i = 0; i < 3; i++) { + A_ptrs[i] = &A_buf[i * BUF_STRIDE]; + B_ptrs[i] = &B_buf[i * BUF_STRIDE]; + } + const pixel *src = dst; + const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); + + if (edges & LR_HAVE_TOP) { + sumsq_ptrs[0] = sumsq_rows[0]; + sumsq_ptrs[1] = sumsq_rows[1]; + sumsq_ptrs[2] = sumsq_rows[2]; + sum_ptrs[0] = sum_rows[0]; + sum_ptrs[1] = sum_rows[1]; + sum_ptrs[2] = sum_rows[2]; + + 
BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0], + NULL, lpf, w, edges); + lpf += PXSTRIDE(stride); + BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[1], sum_rows[1], + NULL, lpf, w, edges); + + sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], + left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); + left++; + src += PXSTRIDE(stride); + rotate_ab_3(A_ptrs, B_ptrs); + + if (--h <= 0) + goto vert_1; + + sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); + left++; + src += PXSTRIDE(stride); + rotate_ab_3(A_ptrs, B_ptrs); + + if (--h <= 0) + goto vert_2; + } else { + sumsq_ptrs[0] = sumsq_rows[0]; + sumsq_ptrs[1] = sumsq_rows[0]; + sumsq_ptrs[2] = sumsq_rows[0]; + sum_ptrs[0] = sum_rows[0]; + sum_ptrs[1] = sum_rows[0]; + sum_ptrs[2] = sum_rows[0]; + + BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_3(A_ptrs, B_ptrs); + + if (--h <= 0) + goto vert_1; + + sumsq_ptrs[2] = sumsq_rows[1]; + sum_ptrs[2] = sum_rows[1]; + + sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], + left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); + left++; + src += PXSTRIDE(stride); + rotate_ab_3(A_ptrs, B_ptrs); + + if (--h <= 0) + goto vert_2; + + sumsq_ptrs[2] = sumsq_rows[2]; + sum_ptrs[2] = sum_rows[2]; + } + + do { + sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], + left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); + left++; + src += PXSTRIDE(stride); + + sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs, + w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); + } while (--h > 0); + + if (!(edges & LR_HAVE_BOTTOM)) + goto vert_2; + + sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], + NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX); + lpf_bottom += PXSTRIDE(stride); + + sgr_finish1_neon(&dst, stride, 
A_ptrs, B_ptrs, + w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); + + sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], + NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX); + + sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs, + w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); + return; + +vert_2: + sumsq_ptrs[2] = sumsq_ptrs[1]; + sum_ptrs[2] = sum_ptrs[1]; + sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], + w, params->sgr.s1, BITDEPTH_MAX); + + sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs, + w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); + +output_1: + sumsq_ptrs[2] = sumsq_ptrs[1]; + sum_ptrs[2] = sum_ptrs[1]; + sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], + w, params->sgr.s1, BITDEPTH_MAX); + + sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs, + w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); + return; + +vert_1: + sumsq_ptrs[2] = sumsq_ptrs[1]; + sum_ptrs[2] = sum_ptrs[1]; + sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_3(A_ptrs, B_ptrs); + goto output_1; +} + +static void sgr_filter_5x5_neon(pixel *dst, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, + const int w, int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,); + ALIGN_STK_16(int16_t, sum_buf, BUF_STRIDE * 5 + 16,); + int32_t *sumsq_ptrs[5], *sumsq_rows[5]; + int16_t *sum_ptrs[5], *sum_rows[5]; + for (int i = 0; i < 5; i++) { + sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE]; + sum_rows[i] = &sum_buf[i * BUF_STRIDE]; + } + + ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,); + ALIGN_STK_16(int16_t, B_buf, BUF_STRIDE * 2 + 16,); + int32_t *A_ptrs[2]; + int16_t *B_ptrs[2]; + for (int i = 0; i < 2; i++) { + A_ptrs[i] = &A_buf[i * BUF_STRIDE]; + B_ptrs[i] = &B_buf[i * BUF_STRIDE]; + } + const pixel *src = dst; + const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); + + if (edges & LR_HAVE_TOP) { + 
sumsq_ptrs[0] = sumsq_rows[0]; + sumsq_ptrs[1] = sumsq_rows[0]; + sumsq_ptrs[2] = sumsq_rows[1]; + sumsq_ptrs[3] = sumsq_rows[2]; + sumsq_ptrs[4] = sumsq_rows[3]; + sum_ptrs[0] = sum_rows[0]; + sum_ptrs[1] = sum_rows[0]; + sum_ptrs[2] = sum_rows[1]; + sum_ptrs[3] = sum_rows[2]; + sum_ptrs[4] = sum_rows[3]; + + BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0], + NULL, lpf, w, edges); + lpf += PXSTRIDE(stride); + BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1], + NULL, lpf, w, edges); + + BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + if (--h <= 0) + goto vert_1; + + BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + rotate_ab_2(A_ptrs, B_ptrs); + + if (--h <= 0) + goto vert_2; + + // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set + // one of them to point at the previously unused rows[4]. 
+ sumsq_ptrs[3] = sumsq_rows[4]; + sum_ptrs[3] = sum_rows[4]; + } else { + sumsq_ptrs[0] = sumsq_rows[0]; + sumsq_ptrs[1] = sumsq_rows[0]; + sumsq_ptrs[2] = sumsq_rows[0]; + sumsq_ptrs[3] = sumsq_rows[0]; + sumsq_ptrs[4] = sumsq_rows[0]; + sum_ptrs[0] = sum_rows[0]; + sum_ptrs[1] = sum_rows[0]; + sum_ptrs[2] = sum_rows[0]; + sum_ptrs[3] = sum_rows[0]; + sum_ptrs[4] = sum_rows[0]; + + BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + if (--h <= 0) + goto vert_1; + + sumsq_ptrs[4] = sumsq_rows[1]; + sum_ptrs[4] = sum_rows[1]; + + BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + rotate_ab_2(A_ptrs, B_ptrs); + + if (--h <= 0) + goto vert_2; + + sumsq_ptrs[3] = sumsq_rows[2]; + sumsq_ptrs[4] = sumsq_rows[3]; + sum_ptrs[3] = sum_rows[2]; + sum_ptrs[4] = sum_rows[3]; + + BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + if (--h <= 0) + goto odd; + + BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs, + w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); + + if (--h <= 0) + goto vert_2; + + // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set + // one of them to point at the previously unused rows[4]. 
+ sumsq_ptrs[3] = sumsq_rows[4]; + sum_ptrs[3] = sum_rows[4]; + } + + do { + BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + if (--h <= 0) + goto odd; + + BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs, + w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); + } while (--h > 0); + + if (!(edges & LR_HAVE_BOTTOM)) + goto vert_2; + + BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3], + NULL, lpf_bottom, w, edges); + lpf_bottom += PXSTRIDE(stride); + BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4], + NULL, lpf_bottom, w, edges); + +output_2: + sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs, + w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); + return; + +vert_2: + // Duplicate the last row twice more + sumsq_ptrs[3] = sumsq_ptrs[2]; + sumsq_ptrs[4] = sumsq_ptrs[2]; + sum_ptrs[3] = sum_ptrs[2]; + sum_ptrs[4] = sum_ptrs[2]; + goto output_2; + +odd: + // Copy the last row as padding once + sumsq_ptrs[4] = sumsq_ptrs[3]; + sum_ptrs[4] = sum_ptrs[3]; + + sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs, + w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); + +output_1: + // Duplicate the last row twice more + sumsq_ptrs[3] = sumsq_ptrs[2]; + sumsq_ptrs[4] = sumsq_ptrs[2]; + sum_ptrs[3] = sum_ptrs[2]; + sum_ptrs[4] = sum_ptrs[2]; + + sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + // Output only one row + sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs, + w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX); + return; + +vert_1: + // Copy the last row as 
padding once + sumsq_ptrs[4] = sumsq_ptrs[3]; + sum_ptrs[4] = sum_ptrs[3]; + + sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + rotate_ab_2(A_ptrs, B_ptrs); + + goto output_1; +} + +static void sgr_filter_mix_neon(pixel *dst, const ptrdiff_t stride, + const pixel (*left)[4], const pixel *lpf, + const int w, int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +{ + ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,); + ALIGN_STK_16(int16_t, sum5_buf, BUF_STRIDE * 5 + 16,); + int32_t *sumsq5_ptrs[5], *sumsq5_rows[5]; + int16_t *sum5_ptrs[5], *sum5_rows[5]; + for (int i = 0; i < 5; i++) { + sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE]; + sum5_rows[i] = &sum5_buf[i * BUF_STRIDE]; + } + ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,); + ALIGN_STK_16(int16_t, sum3_buf, BUF_STRIDE * 3 + 16,); + int32_t *sumsq3_ptrs[3], *sumsq3_rows[3]; + int16_t *sum3_ptrs[3], *sum3_rows[3]; + for (int i = 0; i < 3; i++) { + sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE]; + sum3_rows[i] = &sum3_buf[i * BUF_STRIDE]; + } + + ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,); + ALIGN_STK_16(int16_t, B5_buf, BUF_STRIDE * 2 + 16,); + int32_t *A5_ptrs[2]; + int16_t *B5_ptrs[2]; + for (int i = 0; i < 2; i++) { + A5_ptrs[i] = &A5_buf[i * BUF_STRIDE]; + B5_ptrs[i] = &B5_buf[i * BUF_STRIDE]; + } + ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,); + ALIGN_STK_16(int16_t, B3_buf, BUF_STRIDE * 4 + 16,); + int32_t *A3_ptrs[4]; + int16_t *B3_ptrs[4]; + for (int i = 0; i < 4; i++) { + A3_ptrs[i] = &A3_buf[i * BUF_STRIDE]; + B3_ptrs[i] = &B3_buf[i * BUF_STRIDE]; + } + const pixel *src = dst; + const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); + + if (edges & LR_HAVE_TOP) { + sumsq5_ptrs[0] = sumsq5_rows[0]; + sumsq5_ptrs[1] = sumsq5_rows[0]; + sumsq5_ptrs[2] = sumsq5_rows[1]; + sumsq5_ptrs[3] = sumsq5_rows[2]; + sumsq5_ptrs[4] = sumsq5_rows[3]; + sum5_ptrs[0] = sum5_rows[0]; + 
sum5_ptrs[1] = sum5_rows[0]; + sum5_ptrs[2] = sum5_rows[1]; + sum5_ptrs[3] = sum5_rows[2]; + sum5_ptrs[4] = sum5_rows[3]; + + sumsq3_ptrs[0] = sumsq3_rows[0]; + sumsq3_ptrs[1] = sumsq3_rows[1]; + sumsq3_ptrs[2] = sumsq3_rows[2]; + sum3_ptrs[0] = sum3_rows[0]; + sum3_ptrs[1] = sum3_rows[1]; + sum3_ptrs[2] = sum3_rows[2]; + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0], + sumsq5_rows[0], sum5_rows[0], + NULL, lpf, w, edges); + lpf += PXSTRIDE(stride); + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1], + sumsq5_rows[1], sum5_rows[1], + NULL, lpf, w, edges); + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2], + sumsq5_rows[2], sum5_rows[2], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_4(A3_ptrs, B3_ptrs); + + if (--h <= 0) + goto vert_1; + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], + sumsq5_rows[3], sum5_rows[3], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + rotate_ab_2(A5_ptrs, B5_ptrs); + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_4(A3_ptrs, B3_ptrs); + + if (--h <= 0) + goto vert_2; + + // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set + // one of them to point at the previously unused rows[4]. 
+ sumsq5_ptrs[3] = sumsq5_rows[4]; + sum5_ptrs[3] = sum5_rows[4]; + } else { + sumsq5_ptrs[0] = sumsq5_rows[0]; + sumsq5_ptrs[1] = sumsq5_rows[0]; + sumsq5_ptrs[2] = sumsq5_rows[0]; + sumsq5_ptrs[3] = sumsq5_rows[0]; + sumsq5_ptrs[4] = sumsq5_rows[0]; + sum5_ptrs[0] = sum5_rows[0]; + sum5_ptrs[1] = sum5_rows[0]; + sum5_ptrs[2] = sum5_rows[0]; + sum5_ptrs[3] = sum5_rows[0]; + sum5_ptrs[4] = sum5_rows[0]; + + sumsq3_ptrs[0] = sumsq3_rows[0]; + sumsq3_ptrs[1] = sumsq3_rows[0]; + sumsq3_ptrs[2] = sumsq3_rows[0]; + sum3_ptrs[0] = sum3_rows[0]; + sum3_ptrs[1] = sum3_rows[0]; + sum3_ptrs[2] = sum3_rows[0]; + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0], + sumsq5_rows[0], sum5_rows[0], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_4(A3_ptrs, B3_ptrs); + + if (--h <= 0) + goto vert_1; + + sumsq5_ptrs[4] = sumsq5_rows[1]; + sum5_ptrs[4] = sum5_rows[1]; + + sumsq3_ptrs[2] = sumsq3_rows[1]; + sum3_ptrs[2] = sum3_rows[1]; + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1], + sumsq5_rows[1], sum5_rows[1], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + rotate_ab_2(A5_ptrs, B5_ptrs); + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_4(A3_ptrs, B3_ptrs); + + if (--h <= 0) + goto vert_2; + + sumsq5_ptrs[3] = sumsq5_rows[2]; + sumsq5_ptrs[4] = sumsq5_rows[3]; + sum5_ptrs[3] = sum5_rows[2]; + sum5_ptrs[4] = sum5_rows[3]; + + sumsq3_ptrs[2] = sumsq3_rows[2]; + sum3_ptrs[2] = sum3_rows[2]; + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2], + sumsq5_rows[2], sum5_rows[2], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, 
BITDEPTH_MAX); + rotate_ab_4(A3_ptrs, B3_ptrs); + + if (--h <= 0) + goto odd; + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], + sumsq5_rows[3], sum5_rows[3], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, + w, 2, params->sgr.w0, params->sgr.w1 + HIGHBD_TAIL_SUFFIX); + + if (--h <= 0) + goto vert_2; + + // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set + // one of them to point at the previously unused rows[4]. + sumsq5_ptrs[3] = sumsq5_rows[4]; + sum5_ptrs[3] = sum5_rows[4]; + } + + do { + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], + sumsq5_ptrs[3], sum5_ptrs[3], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_4(A3_ptrs, B3_ptrs); + + if (--h <= 0) + goto odd; + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], + sumsq5_ptrs[4], sum5_ptrs[4], + left, src, w, edges); + left++; + src += PXSTRIDE(stride); + + sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, + w, 2, params->sgr.w0, params->sgr.w1 + HIGHBD_TAIL_SUFFIX); + } while (--h > 0); + + if (!(edges & LR_HAVE_BOTTOM)) + goto vert_2; + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], + sumsq5_ptrs[3], sum5_ptrs[3], + NULL, lpf_bottom, w, edges); + lpf_bottom += PXSTRIDE(stride); + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); 
+ rotate_ab_4(A3_ptrs, B3_ptrs); + + BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], + sumsq5_ptrs[4], sum5_ptrs[4], + NULL, lpf_bottom, w, edges); + +output_2: + sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, + w, 2, params->sgr.w0, params->sgr.w1 + HIGHBD_TAIL_SUFFIX); + return; + +vert_2: + // Duplicate the last row twice more + sumsq5_ptrs[3] = sumsq5_ptrs[2]; + sumsq5_ptrs[4] = sumsq5_ptrs[2]; + sum5_ptrs[3] = sum5_ptrs[2]; + sum5_ptrs[4] = sum5_ptrs[2]; + + sumsq3_ptrs[2] = sumsq3_ptrs[1]; + sum3_ptrs[2] = sum3_ptrs[1]; + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_4(A3_ptrs, B3_ptrs); + + sumsq3_ptrs[2] = sumsq3_ptrs[1]; + sum3_ptrs[2] = sum3_ptrs[1]; + + goto output_2; + +odd: + // Copy the last row as padding once + sumsq5_ptrs[4] = sumsq5_ptrs[3]; + sum5_ptrs[4] = sum5_ptrs[3]; + + sumsq3_ptrs[2] = sumsq3_ptrs[1]; + sum3_ptrs[2] = sum3_ptrs[1]; + + sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, + w, 2, params->sgr.w0, params->sgr.w1 + HIGHBD_TAIL_SUFFIX); + +output_1: + // Duplicate the last row twice more + sumsq5_ptrs[3] = sumsq5_ptrs[2]; + sumsq5_ptrs[4] = sumsq5_ptrs[2]; + sum5_ptrs[3] = sum5_ptrs[2]; + sum5_ptrs[4] = sum5_ptrs[2]; + + sumsq3_ptrs[2] = sumsq3_ptrs[1]; + sum3_ptrs[2] = sum3_ptrs[1]; + + sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + 
rotate_ab_4(A3_ptrs, B3_ptrs); + // Output only one row + sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, + w, 1, params->sgr.w0, params->sgr.w1 + HIGHBD_TAIL_SUFFIX); + return; + +vert_1: + // Copy the last row as padding once + sumsq5_ptrs[4] = sumsq5_ptrs[3]; + sum5_ptrs[4] = sum5_ptrs[3]; + + sumsq3_ptrs[2] = sumsq3_ptrs[1]; + sum3_ptrs[2] = sum3_ptrs[1]; + + sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], + w, params->sgr.s0, BITDEPTH_MAX); + rotate_ab_2(A5_ptrs, B5_ptrs); + sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], + w, params->sgr.s1, BITDEPTH_MAX); + rotate_ab_4(A3_ptrs, B3_ptrs); + + goto output_1; +} + +#endif + + static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) { const unsigned flags = dav1d_get_cpu_flags(); diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/arm/refmvs.h b/prog/3rdPartyLibs/codecs/dav1d/src/arm/refmvs.h index 4c96fc509..1c2dc704c 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/arm/refmvs.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/arm/refmvs.h @@ -28,6 +28,7 @@ #include "src/cpu.h" #include "src/refmvs.h" +decl_save_tmvs_fn(dav1d_save_tmvs_neon); decl_splat_mv_fn(dav1d_splat_mv_neon); static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) { @@ -35,5 +36,6 @@ static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) { if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; + c->save_tmvs = dav1d_save_tmvs_neon; c->splat_mv = dav1d_splat_mv_neon; } diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/data.c b/prog/3rdPartyLibs/codecs/dav1d/src/data.c index 8a1386ad9..bbbe02e8d 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/data.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/data.c @@ -44,7 +44,7 @@ uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) { validate_input_or_ret(buf != NULL, NULL); if (sz > SIZE_MAX / 2) return NULL; - buf->ref = dav1d_ref_create(sz); + buf->ref 
= dav1d_ref_create(ALLOC_DAV1DDATA, sz); if (!buf->ref) return NULL; buf->data = buf->ref->const_data; buf->sz = sz; @@ -64,8 +64,11 @@ int dav1d_data_wrap_internal(Dav1dData *const buf, const uint8_t *const ptr, validate_input_or_ret(ptr != NULL, DAV1D_ERR(EINVAL)); validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL)); - buf->ref = dav1d_ref_wrap(ptr, free_callback, cookie); - if (!buf->ref) return DAV1D_ERR(ENOMEM); + if (sz > SIZE_MAX / 2) return DAV1D_ERR(EINVAL); + Dav1dRef *const ref = dav1d_malloc(ALLOC_DAV1DDATA, sizeof(Dav1dRef)); + if (!ref) return DAV1D_ERR(ENOMEM); + + buf->ref = dav1d_ref_init(ref, ptr, free_callback, cookie, 1); buf->data = ptr; buf->sz = sz; dav1d_data_props_set_defaults(&buf->m); @@ -83,21 +86,22 @@ int dav1d_data_wrap_user_data_internal(Dav1dData *const buf, validate_input_or_ret(buf != NULL, DAV1D_ERR(EINVAL)); validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL)); - buf->m.user_data.ref = dav1d_ref_wrap(user_data, free_callback, cookie); - if (!buf->m.user_data.ref) return DAV1D_ERR(ENOMEM); + Dav1dRef *const ref = dav1d_malloc(ALLOC_DAV1DDATA, sizeof(Dav1dRef)); + if (!ref) return DAV1D_ERR(ENOMEM); + + buf->m.user_data.ref = dav1d_ref_init(ref, user_data, free_callback, cookie, 1); buf->m.user_data.data = user_data; return 0; } - void dav1d_data_ref(Dav1dData *const dst, const Dav1dData *const src) { - validate_input(dst != NULL); - validate_input(dst->data == NULL); - validate_input(src != NULL); + assert(dst != NULL); + assert(dst->data == NULL); + assert(src != NULL); if (src->ref) { - validate_input(src->data != NULL); + assert(src->data != NULL); dav1d_ref_inc(src->ref); } if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref); diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/decode.c b/prog/3rdPartyLibs/codecs/dav1d/src/decode.c index 2ac190b2b..94ef17cb0 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/decode.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/decode.c @@ -370,142 +370,6 @@ static inline 
int findoddzero(const uint8_t *buf, int len) { return 0; } -static void read_pal_plane(Dav1dTaskContext *const t, Av1Block *const b, - const int pl, const int sz_ctx, - const int bx4, const int by4) -{ - Dav1dTileState *const ts = t->ts; - const Dav1dFrameContext *const f = t->f; - const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac, - ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2; - uint16_t cache[16], used_cache[8]; - int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4]; - int n_cache = 0; - // don't reuse above palette outside SB64 boundaries - int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0; - const uint16_t *l = t->al_pal[1][by4][pl], *a = t->al_pal[0][bx4][pl]; - - // fill/sort cache - while (l_cache && a_cache) { - if (*l < *a) { - if (!n_cache || cache[n_cache - 1] != *l) - cache[n_cache++] = *l; - l++; - l_cache--; - } else { - if (*a == *l) { - l++; - l_cache--; - } - if (!n_cache || cache[n_cache - 1] != *a) - cache[n_cache++] = *a; - a++; - a_cache--; - } - } - if (l_cache) { - do { - if (!n_cache || cache[n_cache - 1] != *l) - cache[n_cache++] = *l; - l++; - } while (--l_cache > 0); - } else if (a_cache) { - do { - if (!n_cache || cache[n_cache - 1] != *a) - cache[n_cache++] = *a; - a++; - } while (--a_cache > 0); - } - - // find reused cache entries - int i = 0; - for (int n = 0; n < n_cache && i < pal_sz; n++) - if (dav1d_msac_decode_bool_equi(&ts->msac)) - used_cache[i++] = cache[n]; - const int n_used_cache = i; - - // parse new entries - uint16_t *const pal = t->frame_thread.pass ? 
- f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + - ((t->bx >> 1) + (t->by & 1))][pl] : t->scratch.pal[pl]; - if (i < pal_sz) { - int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc); - - if (i < pal_sz) { - int bits = f->cur.p.bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2); - const int max = (1 << f->cur.p.bpc) - 1; - - do { - const int delta = dav1d_msac_decode_bools(&ts->msac, bits); - prev = pal[i++] = imin(prev + delta + !pl, max); - if (prev + !pl >= max) { - for (; i < pal_sz; i++) - pal[i] = max; - break; - } - bits = imin(bits, 1 + ulog2(max - prev - !pl)); - } while (i < pal_sz); - } - - // merge cache+new entries - int n = 0, m = n_used_cache; - for (i = 0; i < pal_sz; i++) { - if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) { - pal[i] = used_cache[n++]; - } else { - assert(m < pal_sz); - pal[i] = pal[m++]; - } - } - } else { - memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache)); - } - - if (DEBUG_BLOCK_INFO) { - printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=", - pl, pal_sz, n_cache, n_used_cache, ts->msac.rng); - for (int n = 0; n < n_cache; n++) - printf("%c%02x", n ? ' ' : '[', cache[n]); - printf("%s, pal=", n_cache ? "]" : "[]"); - for (int n = 0; n < pal_sz; n++) - printf("%c%02x", n ? ' ' : '[', pal[n]); - printf("]\n"); - } -} - -static void read_pal_uv(Dav1dTaskContext *const t, Av1Block *const b, - const int sz_ctx, const int bx4, const int by4) -{ - read_pal_plane(t, b, 1, sz_ctx, bx4, by4); - - // V pal coding - Dav1dTileState *const ts = t->ts; - const Dav1dFrameContext *const f = t->f; - uint16_t *const pal = t->frame_thread.pass ? 
- f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + - ((t->bx >> 1) + (t->by & 1))][2] : t->scratch.pal[2]; - if (dav1d_msac_decode_bool_equi(&ts->msac)) { - const int bits = f->cur.p.bpc - 4 + - dav1d_msac_decode_bools(&ts->msac, 2); - int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc); - const int max = (1 << f->cur.p.bpc) - 1; - for (int i = 1; i < b->pal_sz[1]; i++) { - int delta = dav1d_msac_decode_bools(&ts->msac, bits); - if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta; - prev = pal[i] = (prev + delta) & max; - } - } else { - for (int i = 0; i < b->pal_sz[1]; i++) - pal[i] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc); - } - if (DEBUG_BLOCK_INFO) { - printf("Post-pal[pl=2]: r=%d ", ts->msac.rng); - for (int n = 0; n < b->pal_sz[1]; n++) - printf("%c%02x", n ? ' ' : '[', pal[n]); - printf("]\n"); - } -} - // meant to be SIMD'able, so that theoretical complexity of this function // times block size goes from w4*h4 to w4+h4-1 // a and b are previous two lines containing (a) top/left entries or (b) @@ -584,7 +448,8 @@ static void read_pal_indices(Dav1dTaskContext *const t, Dav1dTileState *const ts = t->ts; const ptrdiff_t stride = bw4 * 4; assert(pal_idx); - pal_idx[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]); + pixel *const pal_tmp = t->scratch.pal_idx_uv; + pal_tmp[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]); uint16_t (*const color_map_cdf)[8] = ts->cdf.m.color_map[pl][b->pal_sz[pl] - 2]; uint8_t (*const order)[8] = t->scratch.pal_order; @@ -593,23 +458,16 @@ static void read_pal_indices(Dav1dTaskContext *const t, // top/left-to-bottom/right diagonals ("wave-front") const int first = imin(i, w4 * 4 - 1); const int last = imax(0, i - h4 * 4 + 1); - order_palette(pal_idx, stride, i, first, last, order, ctx); + order_palette(pal_tmp, stride, i, first, last, order, ctx); for (int j = first, m = 0; j >= last; j--, m++) { const int color_idx = 
dav1d_msac_decode_symbol_adapt8(&ts->msac, color_map_cdf[ctx[m]], b->pal_sz[pl] - 1); - pal_idx[(i - j) * stride + j] = order[m][color_idx]; + pal_tmp[(i - j) * stride + j] = order[m][color_idx]; } } - // fill invisible edges - if (bw4 > w4) - for (int y = 0; y < 4 * h4; y++) - memset(&pal_idx[y * stride + 4 * w4], - pal_idx[y * stride + 4 * w4 - 1], 4 * (bw4 - w4)); - if (h4 < bh4) { - const uint8_t *const src = &pal_idx[stride * (4 * h4 - 1)]; - for (int y = h4 * 4; y < bh4 * 4; y++) - memcpy(&pal_idx[y * stride], src, bw4 * 4); - } + + t->c->pal_dsp.pal_idx_finish(pal_idx, pal_tmp, bw4 * 4, bh4 * 4, + w4 * 4, h4 * 4); } static void read_vartx_tree(Dav1dTaskContext *const t, @@ -1306,7 +1164,7 @@ static int decode_b(Dav1dTaskContext *const t, if (DEBUG_BLOCK_INFO) printf("Post-y_pal[%d]: r=%d\n", use_y_pal, ts->msac.rng); if (use_y_pal) - read_pal_plane(t, b, 0, sz_ctx, bx4, by4); + f->bd_fn.read_pal_plane(t, b, 0, sz_ctx, bx4, by4); } if (has_chroma && b->uv_mode == DC_PRED) { @@ -1316,7 +1174,7 @@ static int decode_b(Dav1dTaskContext *const t, if (DEBUG_BLOCK_INFO) printf("Post-uv_pal[%d]: r=%d\n", use_uv_pal, ts->msac.rng); if (use_uv_pal) // see aomedia bug 2183 for why we use luma coordinates - read_pal_uv(t, b, sz_ctx, bx4, by4); + f->bd_fn.read_pal_uv(t, b, sz_ctx, bx4, by4); } } @@ -1341,9 +1199,9 @@ static int decode_b(Dav1dTaskContext *const t, const int p = t->frame_thread.pass & 1; assert(ts->frame_thread[p].pal_idx); pal_idx = ts->frame_thread[p].pal_idx; - ts->frame_thread[p].pal_idx += bw4 * bh4 * 16; + ts->frame_thread[p].pal_idx += bw4 * bh4 * 8; } else - pal_idx = t->scratch.pal_idx; + pal_idx = t->scratch.pal_idx_y; read_pal_indices(t, pal_idx, b, 0, w4, h4, bw4, bh4); if (DEBUG_BLOCK_INFO) printf("Post-y-pal-indices: r=%d\n", ts->msac.rng); @@ -1355,9 +1213,9 @@ static int decode_b(Dav1dTaskContext *const t, const int p = t->frame_thread.pass & 1; assert(ts->frame_thread[p].pal_idx); pal_idx = ts->frame_thread[p].pal_idx; - 
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16; + ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8; } else - pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16]; + pal_idx = t->scratch.pal_idx_uv; read_pal_indices(t, pal_idx, b, 1, cw4, ch4, cbw4, cbh4); if (DEBUG_BLOCK_INFO) printf("Post-uv-pal-indices: r=%d\n", ts->msac.rng); @@ -1430,34 +1288,16 @@ static int decode_b(Dav1dTaskContext *const t, case_set(bh4, l., 1, by4); case_set(bw4, a->, 0, bx4); #undef set_ctx - if (b->pal_sz[0]) { - uint16_t *const pal = t->frame_thread.pass ? - f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + - ((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0]; - for (int x = 0; x < bw4; x++) - memcpy(t->al_pal[0][bx4 + x][0], pal, 16); - for (int y = 0; y < bh4; y++) - memcpy(t->al_pal[1][by4 + y][0], pal, 16); - } + if (b->pal_sz[0]) + f->bd_fn.copy_pal_block_y(t, bx4, by4, bw4, bh4); if (has_chroma) { #define set_ctx(type, dir, diridx, off, mul, rep_macro) \ rep_macro(type, t->dir uvmode, off, mul * b->uv_mode) case_set(cbh4, l., 1, cby4); case_set(cbw4, a->, 0, cbx4); #undef set_ctx - if (b->pal_sz[1]) { - const uint16_t (*const pal)[8] = t->frame_thread.pass ? 
- f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * - (f->b4_stride >> 1) + ((t->bx >> 1) + (t->by & 1))] : - t->scratch.pal; - // see aomedia bug 2183 for why we use luma coordinates here - for (int pl = 1; pl <= 2; pl++) { - for (int x = 0; x < bw4; x++) - memcpy(t->al_pal[0][bx4 + x][pl], pal[pl], 16); - for (int y = 0; y < bh4; y++) - memcpy(t->al_pal[1][by4 + y][pl], pal[pl], 16); - } - } + if (b->pal_sz[1]) + f->bd_fn.copy_pal_block_uv(t, bx4, by4, bw4, bh4); } if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) splat_intraref(f->c, t, bs, bw4, bh4); @@ -2329,7 +2169,7 @@ static int decode_sb(Dav1dTaskContext *const t, const enum BlockLevel bl, if (!have_h_split && !have_v_split) { assert(bl < BL_8X8); - return decode_sb(t, bl + 1, ((const EdgeBranch *) node)->split[0]); + return decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 0)); } uint16_t *pc; @@ -2390,19 +2230,19 @@ static int decode_sb(Dav1dTaskContext *const t, const enum BlockLevel bl, if (bl == BL_8X8) { const EdgeTip *const tip = (const EdgeTip *) node; assert(hsz == 1); - if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[0])) + if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, EDGE_ALL_TR_AND_BL)) return -1; const enum Filter2d tl_filter = t->tl_4x4_filter; t->bx++; - if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[1])) + if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[0])) return -1; t->bx--; t->by++; - if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[2])) + if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[1])) return -1; t->bx++; t->tl_4x4_filter = tl_filter; - if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[3])) + if (decode_b(t, bl, BS_4x4, PARTITION_SPLIT, tip->split[2])) return -1; t->bx--; t->by--; @@ -2417,74 +2257,69 @@ static int decode_sb(Dav1dTaskContext *const t, const enum BlockLevel bl, } #endif } else { - const EdgeBranch *const branch = (const EdgeBranch *) node; - if (decode_sb(t, bl + 1, branch->split[0])) + if (decode_sb(t, bl 
+ 1, INTRA_EDGE_SPLIT(node, 0))) return 1; t->bx += hsz; - if (decode_sb(t, bl + 1, branch->split[1])) + if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 1))) return 1; t->bx -= hsz; t->by += hsz; - if (decode_sb(t, bl + 1, branch->split[2])) + if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 2))) return 1; t->bx += hsz; - if (decode_sb(t, bl + 1, branch->split[3])) + if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 3))) return 1; t->bx -= hsz; t->by -= hsz; } break; case PARTITION_T_TOP_SPLIT: { - const EdgeBranch *const branch = (const EdgeBranch *) node; - if (decode_b(t, bl, b[0], PARTITION_T_TOP_SPLIT, branch->tts[0])) + if (decode_b(t, bl, b[0], PARTITION_T_TOP_SPLIT, EDGE_ALL_TR_AND_BL)) return -1; t->bx += hsz; - if (decode_b(t, bl, b[0], PARTITION_T_TOP_SPLIT, branch->tts[1])) + if (decode_b(t, bl, b[0], PARTITION_T_TOP_SPLIT, node->v[1])) return -1; t->bx -= hsz; t->by += hsz; - if (decode_b(t, bl, b[1], PARTITION_T_TOP_SPLIT, branch->tts[2])) + if (decode_b(t, bl, b[1], PARTITION_T_TOP_SPLIT, node->h[1])) return -1; t->by -= hsz; break; } case PARTITION_T_BOTTOM_SPLIT: { - const EdgeBranch *const branch = (const EdgeBranch *) node; - if (decode_b(t, bl, b[0], PARTITION_T_BOTTOM_SPLIT, branch->tbs[0])) + if (decode_b(t, bl, b[0], PARTITION_T_BOTTOM_SPLIT, node->h[0])) return -1; t->by += hsz; - if (decode_b(t, bl, b[1], PARTITION_T_BOTTOM_SPLIT, branch->tbs[1])) + if (decode_b(t, bl, b[1], PARTITION_T_BOTTOM_SPLIT, node->v[0])) return -1; t->bx += hsz; - if (decode_b(t, bl, b[1], PARTITION_T_BOTTOM_SPLIT, branch->tbs[2])) + if (decode_b(t, bl, b[1], PARTITION_T_BOTTOM_SPLIT, 0)) return -1; t->bx -= hsz; t->by -= hsz; break; } case PARTITION_T_LEFT_SPLIT: { - const EdgeBranch *const branch = (const EdgeBranch *) node; - if (decode_b(t, bl, b[0], PARTITION_T_LEFT_SPLIT, branch->tls[0])) + if (decode_b(t, bl, b[0], PARTITION_T_LEFT_SPLIT, EDGE_ALL_TR_AND_BL)) return -1; t->by += hsz; - if (decode_b(t, bl, b[0], PARTITION_T_LEFT_SPLIT, branch->tls[1])) + if 
(decode_b(t, bl, b[0], PARTITION_T_LEFT_SPLIT, node->h[1])) return -1; t->by -= hsz; t->bx += hsz; - if (decode_b(t, bl, b[1], PARTITION_T_LEFT_SPLIT, branch->tls[2])) + if (decode_b(t, bl, b[1], PARTITION_T_LEFT_SPLIT, node->v[1])) return -1; t->bx -= hsz; break; } case PARTITION_T_RIGHT_SPLIT: { - const EdgeBranch *const branch = (const EdgeBranch *) node; - if (decode_b(t, bl, b[0], PARTITION_T_RIGHT_SPLIT, branch->trs[0])) + if (decode_b(t, bl, b[0], PARTITION_T_RIGHT_SPLIT, node->v[0])) return -1; t->bx += hsz; - if (decode_b(t, bl, b[1], PARTITION_T_RIGHT_SPLIT, branch->trs[1])) + if (decode_b(t, bl, b[1], PARTITION_T_RIGHT_SPLIT, node->h[0])) return -1; t->by += hsz; - if (decode_b(t, bl, b[1], PARTITION_T_RIGHT_SPLIT, branch->trs[2])) + if (decode_b(t, bl, b[1], PARTITION_T_RIGHT_SPLIT, 0)) return -1; t->by -= hsz; t->bx -= hsz; @@ -2492,34 +2327,34 @@ static int decode_sb(Dav1dTaskContext *const t, const enum BlockLevel bl, } case PARTITION_H4: { const EdgeBranch *const branch = (const EdgeBranch *) node; - if (decode_b(t, bl, b[0], PARTITION_H4, branch->h4[0])) + if (decode_b(t, bl, b[0], PARTITION_H4, node->h[0])) return -1; t->by += hsz >> 1; - if (decode_b(t, bl, b[0], PARTITION_H4, branch->h4[1])) + if (decode_b(t, bl, b[0], PARTITION_H4, branch->h4)) return -1; t->by += hsz >> 1; - if (decode_b(t, bl, b[0], PARTITION_H4, branch->h4[2])) + if (decode_b(t, bl, b[0], PARTITION_H4, EDGE_ALL_LEFT_HAS_BOTTOM)) return -1; t->by += hsz >> 1; if (t->by < f->bh) - if (decode_b(t, bl, b[0], PARTITION_H4, branch->h4[3])) + if (decode_b(t, bl, b[0], PARTITION_H4, node->h[1])) return -1; t->by -= hsz * 3 >> 1; break; } case PARTITION_V4: { const EdgeBranch *const branch = (const EdgeBranch *) node; - if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4[0])) + if (decode_b(t, bl, b[0], PARTITION_V4, node->v[0])) return -1; t->bx += hsz >> 1; - if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4[1])) + if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4)) return -1; 
t->bx += hsz >> 1; - if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4[2])) + if (decode_b(t, bl, b[0], PARTITION_V4, EDGE_ALL_TOP_HAS_RIGHT)) return -1; t->bx += hsz >> 1; if (t->bx < f->bw) - if (decode_b(t, bl, b[0], PARTITION_V4, branch->v4[3])) + if (decode_b(t, bl, b[0], PARTITION_V4, node->v[1])) return -1; t->bx -= hsz * 3 >> 1; break; @@ -2542,11 +2377,10 @@ static int decode_sb(Dav1dTaskContext *const t, const enum BlockLevel bl, assert(bl < BL_8X8); if (is_split) { - const EdgeBranch *const branch = (const EdgeBranch *) node; bp = PARTITION_SPLIT; - if (decode_sb(t, bl + 1, branch->split[0])) return 1; + if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 0))) return 1; t->bx += hsz; - if (decode_sb(t, bl + 1, branch->split[1])) return 1; + if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 1))) return 1; t->bx -= hsz; } else { bp = PARTITION_H; @@ -2573,11 +2407,10 @@ static int decode_sb(Dav1dTaskContext *const t, const enum BlockLevel bl, assert(bl < BL_8X8); if (is_split) { - const EdgeBranch *const branch = (const EdgeBranch *) node; bp = PARTITION_SPLIT; - if (decode_sb(t, bl + 1, branch->split[0])) return 1; + if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 0))) return 1; t->by += hsz; - if (decode_sb(t, bl + 1, branch->split[2])) return 1; + if (decode_sb(t, bl + 1, INTRA_EDGE_SPLIT(node, 2))) return 1; t->by -= hsz; } else { bp = PARTITION_V; @@ -2649,7 +2482,10 @@ static void setup_tile(Dav1dTileState *const ts, const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout]; for (int p = 0; p < 2; p++) { ts->frame_thread[p].pal_idx = f->frame_thread.pal_idx ? - &f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4] : + &f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 8] : + NULL; + ts->frame_thread[p].cbi = f->frame_thread.cbi ? + &f->frame_thread.cbi[(size_t)tile_start_off * size_mul[0] / 64] : NULL; ts->frame_thread[p].cf = f->frame_thread.cf ? 
(uint8_t*)f->frame_thread.cf + @@ -2726,9 +2562,7 @@ static void read_restoration_info(Dav1dTaskContext *const t, if (frame_type == DAV1D_RESTORATION_SWITCHABLE) { const int filter = dav1d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.restore_switchable, 2); - lr->type = filter ? filter == 2 ? DAV1D_RESTORATION_SGRPROJ : - DAV1D_RESTORATION_WIENER : - DAV1D_RESTORATION_NONE; + lr->type = filter + !!filter; /* NONE/WIENER/SGRPROJ */ } else { const unsigned type = dav1d_msac_decode_bool_adapt(&ts->msac, @@ -2767,7 +2601,7 @@ static void read_restoration_info(Dav1dTaskContext *const t, } else if (lr->type == DAV1D_RESTORATION_SGRPROJ) { const unsigned idx = dav1d_msac_decode_bools(&ts->msac, 4); const uint16_t *const sgr_params = dav1d_sgr_params[idx]; - lr->sgr_idx = idx; + lr->type += idx; lr->sgr_weights[0] = sgr_params[0] ? dav1d_msac_decode_subexp(&ts->msac, ts->lr_ref[p]->sgr_weights[0] + 96, 128, 4) - 96 : 0; lr->sgr_weights[1] = sgr_params[1] ? dav1d_msac_decode_subexp(&ts->msac, @@ -2777,7 +2611,7 @@ static void read_restoration_info(Dav1dTaskContext *const t, ts->lr_ref[p] = lr; if (DEBUG_BLOCK_INFO) printf("Post-lr_sgrproj[pl=%d,idx=%d,w[%d,%d]]: r=%d\n", - p, lr->sgr_idx, lr->sgr_weights[0], + p, idx, lr->sgr_weights[0], lr->sgr_weights[1], ts->msac.rng); } } @@ -2816,7 +2650,7 @@ int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) { { if (atomic_load_explicit(c->flush, memory_order_acquire)) return 1; - if (decode_sb(t, root_bl, c->intra_edge.root[root_bl])) + if (decode_sb(t, root_bl, dav1d_intra_edge_tree[root_bl])) return 1; if (t->bx & 16 || f->seq_hdr->sb128) t->a++; @@ -2829,9 +2663,9 @@ int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) { if (ts->msac.cnt < -15) return 1; if (f->c->n_tc > 1 && f->frame_hdr->use_ref_frame_mvs) { - dav1d_refmvs_load_tmvs(&f->rf, ts->tiling.row, - ts->tiling.col_start >> 1, ts->tiling.col_end >> 1, - t->by >> 1, (t->by + sb_step) >> 1); + f->c->refmvs_dsp.load_tmvs(&f->rf, ts->tiling.row, + 
ts->tiling.col_start >> 1, ts->tiling.col_end >> 1, + t->by >> 1, (t->by + sb_step) >> 1); } memset(t->pal_sz_uv[1], 0, sizeof(*t->pal_sz_uv)); const int sb128y = t->by >> 5; @@ -2905,7 +2739,7 @@ int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) { read_restoration_info(t, lr, p, frame_type); } } - if (decode_sb(t, root_bl, c->intra_edge.root[root_bl])) + if (decode_sb(t, root_bl, dav1d_intra_edge_tree[root_bl])) return 1; if (t->bx & 16 || f->seq_hdr->sb128) { t->a++; @@ -2914,7 +2748,7 @@ int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) { } if (f->seq_hdr->ref_frame_mvs && f->c->n_tc > 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) { - dav1d_refmvs_save_tmvs(&t->rt, + dav1d_refmvs_save_tmvs(&f->c->refmvs_dsp, &t->rt, ts->tiling.col_start >> 1, ts->tiling.col_end >> 1, t->by >> 1, (t->by + sb_step) >> 1); } @@ -2941,8 +2775,8 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { int retval = DAV1D_ERR(ENOMEM); if (f->sbh > f->lf.start_of_tile_row_sz) { - free(f->lf.start_of_tile_row); - f->lf.start_of_tile_row = malloc(f->sbh * sizeof(uint8_t)); + dav1d_free(f->lf.start_of_tile_row); + f->lf.start_of_tile_row = dav1d_malloc(ALLOC_TILE, f->sbh * sizeof(uint8_t)); if (!f->lf.start_of_tile_row) { f->lf.start_of_tile_row_sz = 0; goto error; @@ -2959,24 +2793,24 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { const int n_ts = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows; if (n_ts != f->n_ts) { if (c->n_fc > 1) { - freep(&f->frame_thread.tile_start_off); + dav1d_free(f->frame_thread.tile_start_off); f->frame_thread.tile_start_off = - malloc(sizeof(*f->frame_thread.tile_start_off) * n_ts); + dav1d_malloc(ALLOC_TILE, sizeof(*f->frame_thread.tile_start_off) * n_ts); if (!f->frame_thread.tile_start_off) { f->n_ts = 0; goto error; } } dav1d_free_aligned(f->ts); - f->ts = dav1d_alloc_aligned(sizeof(*f->ts) * n_ts, 32); + f->ts = dav1d_alloc_aligned(ALLOC_TILE, sizeof(*f->ts) * n_ts, 32); if (!f->ts) goto error; f->n_ts = n_ts; } const int a_sz 
= f->sb128w * f->frame_hdr->tiling.rows * (1 + (c->n_fc > 1 && c->n_tc > 1)); if (a_sz != f->a_sz) { - freep(&f->a); - f->a = malloc(sizeof(*f->a) * a_sz); + dav1d_free(f->a); + f->a = dav1d_malloc(ALLOC_TILE, sizeof(*f->a) * a_sz); if (!f->a) { f->a_sz = 0; goto error; @@ -3002,9 +2836,10 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { const int lowest_pixel_mem_sz = f->frame_hdr->tiling.cols * f->sbh; if (lowest_pixel_mem_sz != f->tile_thread.lowest_pixel_mem_sz) { - free(f->tile_thread.lowest_pixel_mem); + dav1d_free(f->tile_thread.lowest_pixel_mem); f->tile_thread.lowest_pixel_mem = - malloc(lowest_pixel_mem_sz * sizeof(*f->tile_thread.lowest_pixel_mem)); + dav1d_malloc(ALLOC_TILE, lowest_pixel_mem_sz * + sizeof(*f->tile_thread.lowest_pixel_mem)); if (!f->tile_thread.lowest_pixel_mem) { f->tile_thread.lowest_pixel_mem_sz = 0; goto error; @@ -3023,11 +2858,24 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { } } + const int cbi_sz = num_sb128 * size_mul[0]; + if (cbi_sz != f->frame_thread.cbi_sz) { + dav1d_free_aligned(f->frame_thread.cbi); + f->frame_thread.cbi = + dav1d_alloc_aligned(ALLOC_BLOCK, sizeof(*f->frame_thread.cbi) * + cbi_sz * 32 * 32 / 4, 64); + if (!f->frame_thread.cbi) { + f->frame_thread.cbi_sz = 0; + goto error; + } + f->frame_thread.cbi_sz = cbi_sz; + } + const int cf_sz = (num_sb128 * size_mul[0]) << hbd; if (cf_sz != f->frame_thread.cf_sz) { - dav1d_freep_aligned(&f->frame_thread.cf); + dav1d_free_aligned(f->frame_thread.cf); f->frame_thread.cf = - dav1d_alloc_aligned((size_t)cf_sz * 128 * 128 / 2, 64); + dav1d_alloc_aligned(ALLOC_COEF, (size_t)cf_sz * 128 * 128 / 2, 64); if (!f->frame_thread.cf) { f->frame_thread.cf_sz = 0; goto error; @@ -3037,24 +2885,25 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { } if (f->frame_hdr->allow_screen_content_tools) { - if (num_sb128 != f->frame_thread.pal_sz) { - dav1d_freep_aligned(&f->frame_thread.pal); + const int pal_sz = num_sb128 << hbd; + if (pal_sz != 
f->frame_thread.pal_sz) { + dav1d_free_aligned(f->frame_thread.pal); f->frame_thread.pal = - dav1d_alloc_aligned(sizeof(*f->frame_thread.pal) * - num_sb128 * 16 * 16, 64); + dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal) * + pal_sz * 16 * 16, 64); if (!f->frame_thread.pal) { f->frame_thread.pal_sz = 0; goto error; } - f->frame_thread.pal_sz = num_sb128; + f->frame_thread.pal_sz = pal_sz; } const int pal_idx_sz = num_sb128 * size_mul[1]; if (pal_idx_sz != f->frame_thread.pal_idx_sz) { - dav1d_freep_aligned(&f->frame_thread.pal_idx); + dav1d_free_aligned(f->frame_thread.pal_idx); f->frame_thread.pal_idx = - dav1d_alloc_aligned(sizeof(*f->frame_thread.pal_idx) * - pal_idx_sz * 128 * 128 / 4, 64); + dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal_idx) * + pal_idx_sz * 128 * 128 / 8, 64); if (!f->frame_thread.pal_idx) { f->frame_thread.pal_idx_sz = 0; goto error; @@ -3081,7 +2930,7 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { size_t alloc_sz = 64; alloc_sz += (size_t)llabs(y_stride) * 4 * f->sbh << need_cdef_lpf_copy; alloc_sz += (size_t)llabs(uv_stride) * 8 * f->sbh << need_cdef_lpf_copy; - uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(alloc_sz, 32); + uint8_t *ptr = f->lf.cdef_line_buf = dav1d_alloc_aligned(ALLOC_CDEF, alloc_sz, 32); if (!ptr) { f->lf.cdef_buf_plane_sz[0] = f->lf.cdef_buf_plane_sz[1] = 0; goto error; @@ -3141,7 +2990,7 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { size_t alloc_sz = 128; alloc_sz += (size_t)llabs(y_stride) * num_lines; alloc_sz += (size_t)llabs(uv_stride) * num_lines * 2; - uint8_t *ptr = f->lf.lr_line_buf = dav1d_alloc_aligned(alloc_sz, 64); + uint8_t *ptr = f->lf.lr_line_buf = dav1d_alloc_aligned(ALLOC_LR, alloc_sz, 64); if (!ptr) { f->lf.lr_buf_plane_sz[0] = f->lf.lr_buf_plane_sz[1] = 0; goto error; @@ -3167,24 +3016,21 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { // update allocation for loopfilter masks if (num_sb128 != f->lf.mask_sz) { - 
freep(&f->lf.mask); - freep(&f->lf.level); - f->lf.mask = malloc(sizeof(*f->lf.mask) * num_sb128); + dav1d_free(f->lf.mask); + dav1d_free(f->lf.level); + f->lf.mask = dav1d_malloc(ALLOC_LF, sizeof(*f->lf.mask) * num_sb128); // over-allocate by 3 bytes since some of the SIMD implementations // index this from the level type and can thus over-read by up to 3 - f->lf.level = malloc(sizeof(*f->lf.level) * num_sb128 * 32 * 32 + 3); + f->lf.level = dav1d_malloc(ALLOC_LF, sizeof(*f->lf.level) * num_sb128 * 32 * 32 + 3); if (!f->lf.mask || !f->lf.level) { f->lf.mask_sz = 0; goto error; } if (c->n_fc > 1) { - freep(&f->frame_thread.b); - freep(&f->frame_thread.cbi); - f->frame_thread.b = malloc(sizeof(*f->frame_thread.b) * - num_sb128 * 32 * 32); - f->frame_thread.cbi = malloc(sizeof(*f->frame_thread.cbi) * - num_sb128 * 32 * 32); - if (!f->frame_thread.b || !f->frame_thread.cbi) { + dav1d_free(f->frame_thread.b); + f->frame_thread.b = dav1d_malloc(ALLOC_BLOCK, sizeof(*f->frame_thread.b) * + num_sb128 * 32 * 32); + if (!f->frame_thread.b) { f->lf.mask_sz = 0; goto error; } @@ -3195,8 +3041,8 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { f->sr_sb128w = (f->sr_cur.p.p.w + 127) >> 7; const int lr_mask_sz = f->sr_sb128w * f->sb128h; if (lr_mask_sz != f->lf.lr_mask_sz) { - freep(&f->lf.lr_mask); - f->lf.lr_mask = malloc(sizeof(*f->lf.lr_mask) * lr_mask_sz); + dav1d_free(f->lf.lr_mask); + f->lf.lr_mask = dav1d_malloc(ALLOC_LR, sizeof(*f->lf.lr_mask) * lr_mask_sz); if (!f->lf.lr_mask) { f->lf.lr_mask_sz = 0; goto error; @@ -3216,9 +3062,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { const int ipred_edge_sz = f->sbh * f->sb128w << hbd; if (ipred_edge_sz != f->ipred_edge_sz) { - dav1d_freep_aligned(&f->ipred_edge[0]); + dav1d_free_aligned(f->ipred_edge[0]); uint8_t *ptr = f->ipred_edge[0] = - dav1d_alloc_aligned(ipred_edge_sz * 128 * 3, 64); + dav1d_alloc_aligned(ALLOC_IPRED, ipred_edge_sz * 128 * 3, 64); if (!ptr) { f->ipred_edge_sz = 0; goto error; 
@@ -3230,8 +3076,8 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { const int re_sz = f->sb128h * f->frame_hdr->tiling.cols; if (re_sz != f->lf.re_sz) { - freep(&f->lf.tx_lpf_right_edge[0]); - f->lf.tx_lpf_right_edge[0] = malloc(re_sz * 32 * 2); + dav1d_free(f->lf.tx_lpf_right_edge[0]); + f->lf.tx_lpf_right_edge[0] = dav1d_malloc(ALLOC_LF, re_sz * 32 * 2); if (!f->lf.tx_lpf_right_edge[0]) { f->lf.re_sz = 0; goto error; @@ -3303,7 +3149,6 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) { * dereference those pointers so it doesn't really matter what they * point at, as long as the pointers are valid. */ const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400; - f->lf.mask_ptr = f->lf.mask; f->lf.p[0] = f->cur.data[0]; f->lf.p[1] = f->cur.data[has_chroma ? 1 : 0]; f->lf.p[2] = f->cur.data[has_chroma ? 2 : 0]; @@ -3394,15 +3239,16 @@ int dav1d_decode_frame_main(Dav1dFrameContext *const f) { t->by = sby << (4 + f->seq_hdr->sb128); const int by_end = (t->by + f->sb_step) >> 1; if (f->frame_hdr->use_ref_frame_mvs) { - dav1d_refmvs_load_tmvs(&f->rf, tile_row, - 0, f->bw >> 1, t->by >> 1, by_end); + f->c->refmvs_dsp.load_tmvs(&f->rf, tile_row, + 0, f->bw >> 1, t->by >> 1, by_end); } for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) { t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col]; if (dav1d_decode_tile_sbrow(t)) goto error; } if (IS_INTER_OR_SWITCH(f->frame_hdr)) { - dav1d_refmvs_save_tmvs(&t->rt, 0, f->bw >> 1, t->by >> 1, by_end); + dav1d_refmvs_save_tmvs(&f->c->refmvs_dsp, &t->rt, + 0, f->bw >> 1, t->by >> 1, by_end); } // loopfilter + cdef + restoration @@ -3426,7 +3272,7 @@ void dav1d_decode_frame_exit(Dav1dFrameContext *const f, const int retval) { (size_t)f->frame_thread.cf_sz * 128 * 128 / 2); } for (int i = 0; i < 7; i++) { - if (f->refp[i].p.data[0]) + if (f->refp[i].p.frame_hdr) dav1d_thread_picture_unref(&f->refp[i]); dav1d_ref_dec(&f->ref_mvs_ref[i]); } @@ -3592,7 +3438,11 @@ int 
dav1d_submit_frame(Dav1dContext *const c) { f->bd_fn.filter_sbrow_resize = dav1d_filter_sbrow_resize_##bd##bpc; \ f->bd_fn.filter_sbrow_lr = dav1d_filter_sbrow_lr_##bd##bpc; \ f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \ - f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc + f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc; \ + f->bd_fn.copy_pal_block_y = dav1d_copy_pal_block_y_##bd##bpc; \ + f->bd_fn.copy_pal_block_uv = dav1d_copy_pal_block_uv_##bd##bpc; \ + f->bd_fn.read_pal_plane = dav1d_read_pal_plane_##bd##bpc; \ + f->bd_fn.read_pal_uv = dav1d_read_pal_uv_##bd##bpc if (!f->seq_hdr->hbd) { #if CONFIG_8BPC assign_bitdepth_case(8); @@ -3665,9 +3515,9 @@ int dav1d_submit_frame(Dav1dContext *const c) { // FIXME qsort so tiles are in order (for frame threading) if (f->n_tile_data_alloc < c->n_tile_data) { - freep(&f->tile); + dav1d_free(f->tile); assert(c->n_tile_data < INT_MAX / (int)sizeof(*f->tile)); - f->tile = malloc(c->n_tile_data * sizeof(*f->tile)); + f->tile = dav1d_malloc(ALLOC_TILE, c->n_tile_data * sizeof(*f->tile)); if (!f->tile) { f->n_tile_data_alloc = f->n_tile_data = 0; res = DAV1D_ERR(ENOMEM); @@ -3832,7 +3682,7 @@ int dav1d_submit_frame(Dav1dContext *const c) { const unsigned refresh_frame_flags = f->frame_hdr->refresh_frame_flags; for (int i = 0; i < 8; i++) { if (refresh_frame_flags & (1 << i)) { - if (c->refs[i].p.p.data[0]) + if (c->refs[i].p.p.frame_hdr) dav1d_thread_picture_unref(&c->refs[i].p); dav1d_thread_picture_ref(&c->refs[i].p, &f->sr_cur); @@ -3862,7 +3712,7 @@ int dav1d_submit_frame(Dav1dContext *const c) { dav1d_thread_picture_unref(&c->out); for (int i = 0; i < 8; i++) { if (refresh_frame_flags & (1 << i)) { - if (c->refs[i].p.p.data[0]) + if (c->refs[i].p.p.frame_hdr) dav1d_thread_picture_unref(&c->refs[i].p); dav1d_cdf_thread_unref(&c->cdf[i]); dav1d_ref_dec(&c->refs[i].segmap); @@ -3883,7 +3733,7 @@ int dav1d_submit_frame(Dav1dContext *const c) { if (f->frame_hdr->refresh_context) 
dav1d_cdf_thread_unref(&f->out_cdf); for (int i = 0; i < 7; i++) { - if (f->refp[i].p.data[0]) + if (f->refp[i].p.frame_hdr) dav1d_thread_picture_unref(&f->refp[i]); dav1d_ref_dec(&f->ref_mvs_ref[i]); } diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/fg_apply_tmpl.c b/prog/3rdPartyLibs/codecs/dav1d/src/fg_apply_tmpl.c index 581bcb72f..044e257de 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/fg_apply_tmpl.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/fg_apply_tmpl.c @@ -126,7 +126,6 @@ void bitfn(dav1d_prep_grain)(const Dav1dFilmGrainDSPContext *const dsp, generate_scaling(in->p.bpc, data->uv_points[1], data->num_uv_points[1], scaling[2]); // Copy over the non-modified planes - // TODO: eliminate in favor of per-plane refs assert(out->stride[0] == in->stride[0]); if (!data->num_y_points) { const ptrdiff_t stride = out->stride[0]; @@ -173,14 +172,14 @@ void bitfn(dav1d_apply_grain_row)(const Dav1dFilmGrainDSPContext *const dsp, const int cpw = (out->p.w + ss_x) >> ss_x; const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY; pixel *const luma_src = - ((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]); + ((pixel *) in->data[0]) + row * FG_BLOCK_SIZE * PXSTRIDE(in->stride[0]); #if BITDEPTH != 8 const int bitdepth_max = (1 << out->p.bpc) - 1; #endif if (data->num_y_points) { - const int bh = imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE); - dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]), + const int bh = imin(out->p.h - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE); + dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * FG_BLOCK_SIZE * PXSTRIDE(out->stride[0]), luma_src, out->stride[0], data, out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX); } @@ -191,7 +190,7 @@ void bitfn(dav1d_apply_grain_row)(const Dav1dFilmGrainDSPContext *const dsp, return; } - const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y; + const int bh = (imin(out->p.h - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE) + ss_y) >> 
ss_y; // extend padding pixels if (out->p.w & ss_x) { @@ -202,7 +201,7 @@ void bitfn(dav1d_apply_grain_row)(const Dav1dFilmGrainDSPContext *const dsp, } } - const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y; + const ptrdiff_t uv_off = row * FG_BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y; if (data->chroma_scaling_from_luma) { for (int pl = 0; pl < 2; pl++) dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off, @@ -233,7 +232,7 @@ void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp, #else uint8_t scaling[3][SCALING_SIZE]; #endif - const int rows = (out->p.h + 31) >> 5; + const int rows = (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE; bitfn(dav1d_prep_grain)(dsp, out, in, scaling, grain_lut); for (int row = 0; row < rows; row++) diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/filmgrain.h b/prog/3rdPartyLibs/codecs/dav1d/src/filmgrain.h index a5d6be6d4..1509bb67e 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/filmgrain.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/filmgrain.h @@ -34,7 +34,7 @@ #define GRAIN_WIDTH 82 #define GRAIN_HEIGHT 73 -#define BLOCK_SIZE 32 +#define FG_BLOCK_SIZE 32 #if !defined(BITDEPTH) || BITDEPTH == 8 #define SCALING_SIZE 256 typedef int8_t entry; diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/filmgrain_tmpl.c b/prog/3rdPartyLibs/codecs/dav1d/src/filmgrain_tmpl.c index 0986ac2a5..12e91dd66 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/filmgrain_tmpl.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/filmgrain_tmpl.c @@ -162,8 +162,8 @@ static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH], const int randval = offsets[bx][by]; const int offx = 3 + (2 >> subx) * (3 + (randval >> 4)); const int offy = 3 + (2 >> suby) * (3 + (randval & 0xF)); - return grain_lut[offy + y + (BLOCK_SIZE >> suby) * by] - [offx + x + (BLOCK_SIZE >> subx) * bx]; + return grain_lut[offy + y + (FG_BLOCK_SIZE >> suby) * by] + [offx + x + (FG_BLOCK_SIZE >> subx) * bx]; } static void 
fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row, @@ -195,13 +195,13 @@ static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row, seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); } - assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0); + assert(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0); int offsets[2 /* col offset */][2 /* row offset */]; - // process this row in BLOCK_SIZE^2 blocks - for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) { - const int bw = imin(BLOCK_SIZE, (int) pw - bx); + // process this row in FG_BLOCK_SIZE^2 blocks + for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE) { + const int bw = imin(FG_BLOCK_SIZE, (int) pw - bx); if (data->overlap_flag && bx) { // shift previous offsets left @@ -306,13 +306,13 @@ fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row, seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); } - assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0); + assert(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0); int offsets[2 /* col offset */][2 /* row offset */]; - // process this row in BLOCK_SIZE^2 blocks (subsampled) - for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { - const int bw = imin(BLOCK_SIZE >> sx, (int)(pw - bx)); + // process this row in FG_BLOCK_SIZE^2 blocks (subsampled) + for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE >> sx) { + const int bw = imin(FG_BLOCK_SIZE >> sx, (int)(pw - bx)); if (data->overlap_flag && bx) { // shift previous offsets left for (int i = 0; i < rows; i++) diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/getbits.c b/prog/3rdPartyLibs/codecs/dav1d/src/getbits.c index 7bb20140e..03776285d 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/getbits.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/getbits.c @@ -36,51 +36,62 @@ void dav1d_init_get_bits(GetBits *const c, const uint8_t *const data, const size_t sz) { - // If sz were 0, c->eof would need to be initialized to 1. 
assert(sz); c->ptr = c->ptr_start = data; c->ptr_end = &c->ptr_start[sz]; - c->bits_left = 0; c->state = 0; + c->bits_left = 0; c->error = 0; - c->eof = 0; } -static void refill(GetBits *const c, const unsigned n) { - assert(c->bits_left <= 56); - uint64_t state = 0; - do { - state <<= 8; - c->bits_left += 8; - if (!c->eof) - state |= *c->ptr++; +unsigned dav1d_get_bit(GetBits *const c) { + if (!c->bits_left) { if (c->ptr >= c->ptr_end) { - c->error = c->eof; - c->eof = 1; + c->error = 1; + } else { + const unsigned state = *c->ptr++; + c->bits_left = 7; + c->state = (uint64_t) state << 57; + return state >> 7; } - } while (n > c->bits_left); - c->state |= state << (64 - c->bits_left); -} - -unsigned dav1d_get_bits(GetBits *const c, const unsigned n) { - assert(n <= 32 /* can go up to 57 if we change return type */); - assert(n /* can't shift state by 64 */); - - if (n > c->bits_left) refill(c, n); + } const uint64_t state = c->state; - c->bits_left -= n; - c->state <<= n; + c->bits_left--; + c->state = state << 1; + return (unsigned) (state >> 63); +} - return (unsigned) (state >> (64 - n)); +static inline void refill(GetBits *const c, const int n) { + assert(c->bits_left >= 0 && c->bits_left < 32); + unsigned state = 0; + do { + if (c->ptr >= c->ptr_end) { + c->error = 1; + if (state) break; + return; + } + state = (state << 8) | *c->ptr++; + c->bits_left += 8; + } while (n > c->bits_left); + c->state |= (uint64_t) state << (64 - c->bits_left); } -int dav1d_get_sbits(GetBits *const c, const unsigned n) { - const int shift = 31 - n; - const int res = dav1d_get_bits(c, n + 1) << shift; - return res >> shift; +#define GET_BITS(name, type, type64) \ +type name(GetBits *const c, const int n) { \ + assert(n > 0 && n <= 32); \ + /* Unsigned cast avoids refill after eob */ \ + if ((unsigned) n > (unsigned) c->bits_left) \ + refill(c, n); \ + const uint64_t state = c->state; \ + c->bits_left -= n; \ + c->state = state << n; \ + return (type) ((type64) state >> (64 - n)); 
\ } +GET_BITS(dav1d_get_bits, unsigned, uint64_t) +GET_BITS(dav1d_get_sbits, int, int64_t) + unsigned dav1d_get_uleb128(GetBits *const c) { uint64_t val = 0; unsigned i = 0, more; @@ -108,15 +119,20 @@ unsigned dav1d_get_uniform(GetBits *const c, const unsigned max) { assert(l > 1); const unsigned m = (1U << l) - max; const unsigned v = dav1d_get_bits(c, l - 1); - return v < m ? v : (v << 1) - m + dav1d_get_bits(c, 1); + return v < m ? v : (v << 1) - m + dav1d_get_bit(c); } unsigned dav1d_get_vlc(GetBits *const c) { + if (dav1d_get_bit(c)) + return 0; + int n_bits = 0; - while (!dav1d_get_bits(c, 1)) + do { if (++n_bits == 32) return 0xFFFFFFFFU; - return n_bits ? ((1U << n_bits) - 1) + dav1d_get_bits(c, n_bits) : 0; + } while (!dav1d_get_bit(c)); + + return ((1U << n_bits) - 1) + dav1d_get_bits(c, n_bits); } static unsigned get_bits_subexp_u(GetBits *const c, const unsigned ref, @@ -132,7 +148,7 @@ static unsigned get_bits_subexp_u(GetBits *const c, const unsigned ref, break; } - if (!dav1d_get_bits(c, 1)) { + if (!dav1d_get_bit(c)) { v += dav1d_get_bits(c, b); break; } @@ -146,17 +162,3 @@ static unsigned get_bits_subexp_u(GetBits *const c, const unsigned ref, int dav1d_get_bits_subexp(GetBits *const c, const int ref, const unsigned n) { return (int) get_bits_subexp_u(c, ref + (1 << n), 2 << n) - (1 << n); } - -void dav1d_bytealign_get_bits(GetBits *c) { - // bits_left is never more than 7, because it is only incremented - // by refill(), called by dav1d_get_bits and that never reads more - // than 7 bits more than it needs. - // - // If this wasn't true, we would need to work out how many bits to - // discard (bits_left % 8), subtract that from bits_left and then - // shift state right by that amount. 
- assert(c->bits_left <= 7); - - c->bits_left = 0; - c->state = 0; -} diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/getbits.h b/prog/3rdPartyLibs/codecs/dav1d/src/getbits.h index fc382148b..67925943c 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/getbits.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/getbits.h @@ -32,15 +32,15 @@ #include typedef struct GetBits { - int error, eof; uint64_t state; - unsigned bits_left; + int bits_left, error; const uint8_t *ptr, *ptr_start, *ptr_end; } GetBits; void dav1d_init_get_bits(GetBits *c, const uint8_t *data, size_t sz); -unsigned dav1d_get_bits(GetBits *c, unsigned n); -int dav1d_get_sbits(GetBits *c, unsigned n); +unsigned dav1d_get_bit(GetBits *c); +unsigned dav1d_get_bits(GetBits *c, int n); +int dav1d_get_sbits(GetBits *c, int n); unsigned dav1d_get_uleb128(GetBits *c); // Output in range 0..max-1 @@ -49,7 +49,19 @@ unsigned dav1d_get_vlc(GetBits *c); int dav1d_get_bits_subexp(GetBits *c, int ref, unsigned n); // Discard bits from the buffer until we're next byte-aligned. -void dav1d_bytealign_get_bits(GetBits *c); +static inline void dav1d_bytealign_get_bits(GetBits *c) { + // bits_left is never more than 7, because it is only incremented + // by refill(), called by dav1d_get_bits and that never reads more + // than 7 bits more than it needs. + // + // If this wasn't true, we would need to work out how many bits to + // discard (bits_left % 8), subtract that from bits_left and then + // shift state right by that amount. + assert(c->bits_left <= 7); + + c->bits_left = 0; + c->state = 0; +} // Return the current bit position relative to the start of the buffer. 
static inline unsigned dav1d_get_bits_pos(const GetBits *c) { diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/internal.h b/prog/3rdPartyLibs/codecs/dav1d/src/internal.h index 29d07b803..b1a94f826 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/internal.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/internal.h @@ -53,6 +53,7 @@ typedef struct Dav1dTask Dav1dTask; #include "src/looprestoration.h" #include "src/mc.h" #include "src/msac.h" +#include "src/pal.h" #include "src/picture.h" #include "src/recon.h" #include "src/refmvs.h" @@ -115,6 +116,7 @@ struct Dav1dContext { Dav1dMasteringDisplay *mastering_display; Dav1dRef *itut_t35_ref; Dav1dITUTT35 *itut_t35; + int n_itut_t35; // decoded output picture queue Dav1dData in; @@ -173,17 +175,9 @@ struct Dav1dContext { CdfThreadContext cdf[8]; Dav1dDSPContext dsp[3 /* 8, 10, 12 bits/component */]; + Dav1dPalDSPContext pal_dsp; Dav1dRefmvsDSPContext refmvs_dsp; - // tree to keep track of which edges are available - struct { - EdgeNode *root[2 /* BL_128X128 vs. 
BL_64X64 */]; - EdgeBranch branch_sb128[1 + 4 + 16 + 64]; - EdgeBranch branch_sb64[1 + 4 + 16]; - EdgeTip tip_sb128[256]; - EdgeTip tip_sb64[64]; - } intra_edge; - Dav1dPicAllocator allocator; int apply_grain; int operating_point; @@ -194,6 +188,7 @@ struct Dav1dContext { int strict_std_compliance; int output_invisible_frames; enum Dav1dInloopFilterType inloop_filters; + enum Dav1dDecodeFrameType decode_frame_type; int drain; enum PictureFlags frame_flags; enum Dav1dEventFlags event_flags; @@ -203,6 +198,7 @@ struct Dav1dContext { Dav1dLogger logger; Dav1dMemPool *picture_pool; + Dav1dMemPool *pic_ctx_pool; }; struct Dav1dTask { @@ -259,6 +255,10 @@ struct Dav1dFrameContext { filter_sbrow_fn filter_sbrow_lr; backup_ipred_edge_fn backup_ipred_edge; read_coef_blocks_fn read_coef_blocks; + copy_pal_block_fn copy_pal_block_y; + copy_pal_block_fn copy_pal_block_uv; + read_pal_plane_fn read_pal_plane; + read_pal_uv_fn read_pal_uv; } bd_fn; int ipred_edge_sz; @@ -280,17 +280,14 @@ struct Dav1dFrameContext { atomic_uint *frame_progress, *copy_lpf_progress; // indexed using t->by * f->b4_stride + t->bx Av1Block *b; - struct CodedBlockInfo { - int16_t eob[3 /* plane */]; - uint8_t txtp[3 /* plane */]; - } *cbi; + int16_t *cbi; /* bits 0-4: txtp, bits 5-15: eob */ // indexed using (t->by >> 1) * (f->b4_stride >> 1) + (t->bx >> 1) - uint16_t (*pal)[3 /* plane */][8 /* idx */]; + pixel (*pal)[3 /* plane */][8 /* idx */]; // iterated over inside tile state uint8_t *pal_idx; coef *cf; int prog_sz; - int pal_sz, pal_idx_sz, cf_sz; + int cbi_sz, pal_sz, pal_idx_sz, cf_sz; // start offsets per tile int *tile_start_off; } frame_thread; @@ -319,7 +316,6 @@ struct Dav1dFrameContext { int start_of_tile_row_sz; int need_cdef_lpf_copy; pixel *p[3], *sr_p[3]; - Av1Filter *mask_ptr, *prev_mask_ptr; int restore_planes; // enum LrRestorePlanes } lf; @@ -368,6 +364,7 @@ struct Dav1dTileState { atomic_int progress[2 /* 0: reconstruction, 1: entropy */]; struct { uint8_t *pal_idx; + int16_t 
*cbi; coef *cf; } frame_thread[2 /* 0: reconstruction, 1: entropy */]; @@ -397,11 +394,11 @@ struct Dav1dTaskContext { int16_t cf_8bpc [32 * 32]; int32_t cf_16bpc[32 * 32]; }; - // FIXME types can be changed to pixel (and dynamically allocated) - // which would make copy/assign operations slightly faster? - uint16_t al_pal[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */]; + union { + uint8_t al_pal_8bpc [2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */]; + uint16_t al_pal_16bpc[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */]; + }; uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */]; - uint8_t txtp_map[32 * 32]; // inter-only ALIGN(union, 64) { struct { union { @@ -426,17 +423,22 @@ struct Dav1dTaskContext { uint8_t pal_ctx[64]; }; }; - int16_t ac[32 * 32]; - uint8_t pal_idx[2 * 64 * 64]; - uint16_t pal[3 /* plane */][8 /* palette_idx */]; - ALIGN(union, 64) { + union { + int16_t ac[32 * 32]; // intra-only + uint8_t txtp_map[32 * 32]; // inter-only + }; + uint8_t pal_idx_y[32 * 64]; + uint8_t pal_idx_uv[64 * 64]; /* also used as pre-pack scratch buffer */ + union { struct { uint8_t interintra_8bpc[64 * 64]; uint8_t edge_8bpc[257]; + ALIGN(uint8_t pal_8bpc[3 /* plane */][8 /* palette_idx */], 8); }; struct { uint16_t interintra_16bpc[64 * 64]; uint16_t edge_16bpc[257]; + ALIGN(uint16_t pal_16bpc[3 /* plane */][8 /* palette_idx */], 16); }; }; }; diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/intra_edge.c b/prog/3rdPartyLibs/codecs/dav1d/src/intra_edge.c index 684d113fa..e9261e6cb 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/intra_edge.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/intra_edge.c @@ -1,6 +1,6 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors - * Copyright © 2018, Two Orioles, LLC + * Copyright © 2018-2023, VideoLAN and dav1d authors + * Copyright © 2018-2023, Two Orioles, LLC * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -39,97 +39,83 @@ struct ModeSelMem { EdgeTip *nt; }; -static void init_edges(EdgeNode *const node, - const enum BlockLevel bl, - const enum EdgeFlags edge_flags) +/* Because we're using 16-bit offsets to refer to other nodes those arrays + * are placed in a struct to ensure they're consecutive in memory. */ +static struct { + EdgeBranch branch_sb128[1 + 4 + 16 + 64]; + EdgeTip tip_sb128[256]; + EdgeBranch branch_sb64[1 + 4 + 16]; + EdgeTip tip_sb64[64]; +} ALIGN(nodes, 16); + +const EdgeNode *dav1d_intra_edge_tree[2] = { + (EdgeNode*)nodes.branch_sb128, (EdgeNode*)nodes.branch_sb64 +}; + +static COLD void init_edges(EdgeNode *const node, + const enum BlockLevel bl, + const enum EdgeFlags edge_flags) { node->o = edge_flags; + node->h[0] = edge_flags | EDGE_ALL_LEFT_HAS_BOTTOM; + node->v[0] = edge_flags | EDGE_ALL_TOP_HAS_RIGHT; -#define ALL_FL(t) (EDGE_I444_##t | EDGE_I422_##t | EDGE_I420_##t) if (bl == BL_8X8) { EdgeTip *const nt = (EdgeTip *) node; - node->h[0] = edge_flags | ALL_FL(LEFT_HAS_BOTTOM); - node->h[1] = edge_flags & (ALL_FL(LEFT_HAS_BOTTOM) | + node->h[1] = edge_flags & (EDGE_ALL_LEFT_HAS_BOTTOM | EDGE_I420_TOP_HAS_RIGHT); - - node->v[0] = edge_flags | ALL_FL(TOP_HAS_RIGHT); - node->v[1] = edge_flags & (ALL_FL(TOP_HAS_RIGHT) | + node->v[1] = edge_flags & (EDGE_ALL_TOP_HAS_RIGHT | EDGE_I420_LEFT_HAS_BOTTOM | EDGE_I422_LEFT_HAS_BOTTOM); - nt->split[0] = ALL_FL(TOP_HAS_RIGHT) | ALL_FL(LEFT_HAS_BOTTOM); - nt->split[1] = (edge_flags & ALL_FL(TOP_HAS_RIGHT)) | + nt->split[0] = (edge_flags & EDGE_ALL_TOP_HAS_RIGHT) | EDGE_I422_LEFT_HAS_BOTTOM; - nt->split[2] = edge_flags | EDGE_I444_TOP_HAS_RIGHT; - nt->split[3] = edge_flags & (EDGE_I420_TOP_HAS_RIGHT | + nt->split[1] = edge_flags | EDGE_I444_TOP_HAS_RIGHT; + nt->split[2] = edge_flags & (EDGE_I420_TOP_HAS_RIGHT | EDGE_I420_LEFT_HAS_BOTTOM | EDGE_I422_LEFT_HAS_BOTTOM); } else { EdgeBranch *const nwc = (EdgeBranch *) node; - node->h[0] = 
edge_flags | ALL_FL(LEFT_HAS_BOTTOM); - node->h[1] = edge_flags & ALL_FL(LEFT_HAS_BOTTOM); - - node->v[0] = edge_flags | ALL_FL(TOP_HAS_RIGHT); - node->v[1] = edge_flags & ALL_FL(TOP_HAS_RIGHT); - - nwc->h4[0] = edge_flags | ALL_FL(LEFT_HAS_BOTTOM); - nwc->h4[1] = - nwc->h4[2] = ALL_FL(LEFT_HAS_BOTTOM); - nwc->h4[3] = edge_flags & ALL_FL(LEFT_HAS_BOTTOM); - if (bl == BL_16X16) - nwc->h4[1] |= edge_flags & EDGE_I420_TOP_HAS_RIGHT; - - nwc->v4[0] = edge_flags | ALL_FL(TOP_HAS_RIGHT); - nwc->v4[1] = - nwc->v4[2] = ALL_FL(TOP_HAS_RIGHT); - nwc->v4[3] = edge_flags & ALL_FL(TOP_HAS_RIGHT); - if (bl == BL_16X16) - nwc->v4[1] |= edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM | - EDGE_I422_LEFT_HAS_BOTTOM); - - nwc->tls[0] = ALL_FL(TOP_HAS_RIGHT) | ALL_FL(LEFT_HAS_BOTTOM); - nwc->tls[1] = edge_flags & ALL_FL(LEFT_HAS_BOTTOM); - nwc->tls[2] = edge_flags & ALL_FL(TOP_HAS_RIGHT); - - nwc->trs[0] = edge_flags | ALL_FL(TOP_HAS_RIGHT); - nwc->trs[1] = edge_flags | ALL_FL(LEFT_HAS_BOTTOM); - nwc->trs[2] = 0; - - nwc->tts[0] = ALL_FL(TOP_HAS_RIGHT) | ALL_FL(LEFT_HAS_BOTTOM); - nwc->tts[1] = edge_flags & ALL_FL(TOP_HAS_RIGHT); - nwc->tts[2] = edge_flags & ALL_FL(LEFT_HAS_BOTTOM); - - nwc->tbs[0] = edge_flags | ALL_FL(LEFT_HAS_BOTTOM); - nwc->tbs[1] = edge_flags | ALL_FL(TOP_HAS_RIGHT); - nwc->tbs[2] = 0; + node->h[1] = edge_flags & EDGE_ALL_LEFT_HAS_BOTTOM; + node->v[1] = edge_flags & EDGE_ALL_TOP_HAS_RIGHT; + + nwc->h4 = EDGE_ALL_LEFT_HAS_BOTTOM; + nwc->v4 = EDGE_ALL_TOP_HAS_RIGHT; + if (bl == BL_16X16) { + nwc->h4 |= edge_flags & EDGE_I420_TOP_HAS_RIGHT; + nwc->v4 |= edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM | + EDGE_I422_LEFT_HAS_BOTTOM); + } } } -static void init_mode_node(EdgeBranch *const nwc, - const enum BlockLevel bl, - struct ModeSelMem *const mem, - const int top_has_right, - const int left_has_bottom) +#define PTR_OFFSET(a, b) ((uint16_t)((uintptr_t)(b) - (uintptr_t)(a))) + +static COLD void init_mode_node(EdgeBranch *const nwc, + const enum BlockLevel bl, + struct ModeSelMem 
*const mem, + const int top_has_right, + const int left_has_bottom) { init_edges(&nwc->node, bl, - (top_has_right ? ALL_FL(TOP_HAS_RIGHT) : 0) | - (left_has_bottom ? ALL_FL(LEFT_HAS_BOTTOM) : 0)); + (top_has_right ? EDGE_ALL_TOP_HAS_RIGHT : 0) | + (left_has_bottom ? EDGE_ALL_LEFT_HAS_BOTTOM : 0)); if (bl == BL_16X16) { for (int n = 0; n < 4; n++) { EdgeTip *const nt = mem->nt++; - nwc->split[n] = &nt->node; + nwc->split_offset[n] = PTR_OFFSET(nwc, nt); init_edges(&nt->node, bl + 1, ((n == 3 || (n == 1 && !top_has_right)) ? 0 : - ALL_FL(TOP_HAS_RIGHT)) | + EDGE_ALL_TOP_HAS_RIGHT) | (!(n == 0 || (n == 2 && left_has_bottom)) ? 0 : - ALL_FL(LEFT_HAS_BOTTOM))); + EDGE_ALL_LEFT_HAS_BOTTOM)); } } else { for (int n = 0; n < 4; n++) { EdgeBranch *const nwc_child = mem->nwc[bl]++; - nwc->split[n] = &nwc_child->node; + nwc->split_offset[n] = PTR_OFFSET(nwc, nwc_child); init_mode_node(nwc_child, bl + 1, mem, !(n == 3 || (n == 1 && !top_has_right)), n == 0 || (n == 2 && left_has_bottom)); @@ -137,29 +123,26 @@ static void init_mode_node(EdgeBranch *const nwc, } } -void dav1d_init_mode_tree(EdgeNode *const root_node, EdgeTip *const nt, - const int allow_sb128) -{ - EdgeBranch *const root = (EdgeBranch *) root_node; +COLD void dav1d_init_intra_edge_tree(void) { + // This function is guaranteed to be called only once struct ModeSelMem mem; - mem.nt = nt; - - if (allow_sb128) { - mem.nwc[BL_128X128] = &root[1]; - mem.nwc[BL_64X64] = &root[1 + 4]; - mem.nwc[BL_32X32] = &root[1 + 4 + 16]; - init_mode_node(root, BL_128X128, &mem, 1, 0); - assert(mem.nwc[BL_128X128] == &root[1 + 4]); - assert(mem.nwc[BL_64X64] == &root[1 + 4 + 16]); - assert(mem.nwc[BL_32X32] == &root[1 + 4 + 16 + 64]); - assert(mem.nt == &nt[256]); - } else { - mem.nwc[BL_128X128] = NULL; - mem.nwc[BL_64X64] = &root[1]; - mem.nwc[BL_32X32] = &root[1 + 4]; - init_mode_node(root, BL_64X64, &mem, 1, 0); - assert(mem.nwc[BL_64X64] == &root[1 + 4]); - assert(mem.nwc[BL_32X32] == &root[1 + 4 + 16]); - assert(mem.nt == 
&nt[64]); - } + + mem.nwc[BL_128X128] = &nodes.branch_sb128[1]; + mem.nwc[BL_64X64] = &nodes.branch_sb128[1 + 4]; + mem.nwc[BL_32X32] = &nodes.branch_sb128[1 + 4 + 16]; + mem.nt = nodes.tip_sb128; + init_mode_node(nodes.branch_sb128, BL_128X128, &mem, 1, 0); + assert(mem.nwc[BL_128X128] == &nodes.branch_sb128[1 + 4]); + assert(mem.nwc[BL_64X64] == &nodes.branch_sb128[1 + 4 + 16]); + assert(mem.nwc[BL_32X32] == &nodes.branch_sb128[1 + 4 + 16 + 64]); + assert(mem.nt == &nodes.tip_sb128[256]); + + mem.nwc[BL_128X128] = NULL; + mem.nwc[BL_64X64] = &nodes.branch_sb64[1]; + mem.nwc[BL_32X32] = &nodes.branch_sb64[1 + 4]; + mem.nt = nodes.tip_sb64; + init_mode_node(nodes.branch_sb64, BL_64X64, &mem, 1, 0); + assert(mem.nwc[BL_64X64] == &nodes.branch_sb64[1 + 4]); + assert(mem.nwc[BL_32X32] == &nodes.branch_sb64[1 + 4 + 16]); + assert(mem.nt == &nodes.tip_sb64[64]); } diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/intra_edge.h b/prog/3rdPartyLibs/codecs/dav1d/src/intra_edge.h index 8b4e15018..ecfb3de56 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/intra_edge.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/intra_edge.h @@ -1,6 +1,6 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors - * Copyright © 2018, Two Orioles, LLC + * Copyright © 2018-2023, VideoLAN and dav1d authors + * Copyright © 2018-2023, Two Orioles, LLC * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -28,30 +28,46 @@ #ifndef DAV1D_SRC_INTRA_EDGE_H #define DAV1D_SRC_INTRA_EDGE_H +#include + enum EdgeFlags { - EDGE_I444_TOP_HAS_RIGHT = 1 << 0, - EDGE_I422_TOP_HAS_RIGHT = 1 << 1, - EDGE_I420_TOP_HAS_RIGHT = 1 << 2, + EDGE_I444_TOP_HAS_RIGHT = 1 << 0, + EDGE_I422_TOP_HAS_RIGHT = 1 << 1, + EDGE_I420_TOP_HAS_RIGHT = 1 << 2, EDGE_I444_LEFT_HAS_BOTTOM = 1 << 3, EDGE_I422_LEFT_HAS_BOTTOM = 1 << 4, EDGE_I420_LEFT_HAS_BOTTOM = 1 << 5, + EDGE_ALL_TOP_HAS_RIGHT = EDGE_I444_TOP_HAS_RIGHT | + EDGE_I422_TOP_HAS_RIGHT | + EDGE_I420_TOP_HAS_RIGHT, + EDGE_ALL_LEFT_HAS_BOTTOM = EDGE_I444_LEFT_HAS_BOTTOM | + EDGE_I422_LEFT_HAS_BOTTOM | + EDGE_I420_LEFT_HAS_BOTTOM, + EDGE_ALL_TR_AND_BL = EDGE_ALL_TOP_HAS_RIGHT | + EDGE_ALL_LEFT_HAS_BOTTOM, }; -typedef struct EdgeNode EdgeNode; -struct EdgeNode { - enum EdgeFlags o, h[2], v[2]; -}; +#define INTRA_EDGE_SPLIT(n, i) \ + ((const EdgeNode*)((uintptr_t)(n) + ((const EdgeBranch*)(n))->split_offset[i])) + +typedef struct EdgeNode { + uint8_t /* enum EdgeFlags */ o, h[2], v[2]; +} EdgeNode; + typedef struct EdgeTip { EdgeNode node; - enum EdgeFlags split[4]; + uint8_t /* enum EdgeFlags */ split[3]; } EdgeTip; + typedef struct EdgeBranch { EdgeNode node; - enum EdgeFlags tts[3], tbs[3], tls[3], trs[3], h4[4], v4[4]; - EdgeNode *split[4]; + uint8_t /* enum EdgeFlags */ h4, v4; + uint16_t split_offset[4]; /* relative to the address of this node */ } EdgeBranch; -void dav1d_init_mode_tree(EdgeNode *const root, EdgeTip *const nt, - const int allow_sb128); +/* Tree to keep track of which edges are available. 
*/ +EXTERN const EdgeNode *dav1d_intra_edge_tree[2 /* BL_128X128, BL_64X64 */]; + +void dav1d_init_intra_edge_tree(void); #endif /* DAV1D_SRC_INTRA_EDGE_H */ diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/ipred.h b/prog/3rdPartyLibs/codecs/dav1d/src/ipred.h index 739ef1a26..35adb02ed 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/ipred.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/ipred.h @@ -74,7 +74,7 @@ typedef decl_cfl_pred_fn(*cfl_pred_fn); * - only 16-byte alignment is guaranteed for idx. */ #define decl_pal_pred_fn(name) \ -void (name)(pixel *dst, ptrdiff_t stride, const uint16_t *pal, \ +void (name)(pixel *dst, ptrdiff_t stride, const pixel *pal, \ const uint8_t *idx, int w, int h) typedef decl_pal_pred_fn(*pal_pred_fn); diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/ipred_tmpl.c b/prog/3rdPartyLibs/codecs/dav1d/src/ipred_tmpl.c index 151d4842a..997581674 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/ipred_tmpl.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/ipred_tmpl.c @@ -715,13 +715,16 @@ cfl_ac_fn(422, 1, 0) cfl_ac_fn(444, 0, 0) static void pal_pred_c(pixel *dst, const ptrdiff_t stride, - const uint16_t *const pal, const uint8_t *idx, + const pixel *const pal, const uint8_t *idx, const int w, const int h) { for (int y = 0; y < h; y++) { - for (int x = 0; x < w; x++) - dst[x] = (pixel) pal[idx[x]]; - idx += w; + for (int x = 0; x < w; x += 2) { + const int i = *idx++; + assert(!(i & 0x88)); + dst[x + 0] = pal[i & 7]; + dst[x + 1] = pal[i >> 4]; + } dst += PXSTRIDE(stride); } } diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/lf_mask.c b/prog/3rdPartyLibs/codecs/dav1d/src/lf_mask.c index 91fe4a02c..062ba6737 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/lf_mask.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/lf_mask.c @@ -301,6 +301,7 @@ void dav1d_create_lf_mask_intra(Av1Filter *const lflvl, const int bh4 = imin(ih - by, b_dim[1]); const int bx4 = bx & 31; const int by4 = by & 31; + assert(bw4 >= 0 && bh4 >= 0); if (bw4 && bh4) { uint8_t (*level_cache_ptr)[4] 
= level_cache + by * b4_stride + bx; @@ -323,6 +324,7 @@ void dav1d_create_lf_mask_intra(Av1Filter *const lflvl, (b_dim[0] + ss_hor) >> ss_hor); const int cbh4 = imin(((ih + ss_ver) >> ss_ver) - (by >> ss_ver), (b_dim[1] + ss_ver) >> ss_ver); + assert(cbw4 >= 0 && cbh4 >= 0); if (!cbw4 || !cbh4) return; @@ -362,6 +364,7 @@ void dav1d_create_lf_mask_inter(Av1Filter *const lflvl, const int bh4 = imin(ih - by, b_dim[1]); const int bx4 = bx & 31; const int by4 = by & 31; + assert(bw4 >= 0 && bh4 >= 0); if (bw4 && bh4) { uint8_t (*level_cache_ptr)[4] = level_cache + by * b4_stride + bx; @@ -385,6 +388,7 @@ void dav1d_create_lf_mask_inter(Av1Filter *const lflvl, (b_dim[0] + ss_hor) >> ss_hor); const int cbh4 = imin(((ih + ss_ver) >> ss_ver) - (by >> ss_ver), (b_dim[1] + ss_ver) >> ss_ver); + assert(cbw4 >= 0 && cbh4 >= 0); if (!cbw4 || !cbh4) return; diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/lf_mask.h b/prog/3rdPartyLibs/codecs/dav1d/src/lf_mask.h index 5edf4a093..8991ed418 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/lf_mask.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/lf_mask.h @@ -40,10 +40,10 @@ typedef struct Av1FilterLUT { } Av1FilterLUT; typedef struct Av1RestorationUnit { + /* SGR: type = DAV1D_RESTORATION_SGRPROJ + sgr_idx */ uint8_t /* enum Dav1dRestorationType */ type; int8_t filter_h[3]; int8_t filter_v[3]; - uint8_t sgr_idx; int8_t sgr_weights[2]; } Av1RestorationUnit; diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/lib.c b/prog/3rdPartyLibs/codecs/dav1d/src/lib.c index 6cd8d5446..3807efdcc 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/lib.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/lib.c @@ -52,16 +52,22 @@ static COLD void init_internal(void) { dav1d_init_cpu(); - dav1d_init_interintra_masks(); + dav1d_init_ii_wedge_masks(); + dav1d_init_intra_edge_tree(); dav1d_init_qm_tables(); dav1d_init_thread(); - dav1d_init_wedge_masks(); } COLD const char *dav1d_version(void) { return DAV1D_VERSION; } +COLD unsigned dav1d_version_api(void) { + return 
(DAV1D_API_VERSION_MAJOR << 16) | + (DAV1D_API_VERSION_MINOR << 8) | + (DAV1D_API_VERSION_PATCH << 0); +} + COLD void dav1d_default_settings(Dav1dSettings *const s) { s->n_threads = 0; s->max_frame_delay = 0; @@ -77,6 +83,7 @@ COLD void dav1d_default_settings(Dav1dSettings *const s) { s->strict_std_compliance = 0; s->output_invisible_frames = 0; s->inloop_filters = DAV1D_INLOOPFILTER_ALL; + s->decode_frame_type = DAV1D_DECODEFRAMETYPE_ALL; } static void close_internal(Dav1dContext **const c_out, int flush); @@ -144,6 +151,8 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { DAV1D_ERR(EINVAL)); validate_input_or_ret(s->operating_point >= 0 && s->operating_point <= 31, DAV1D_ERR(EINVAL)); + validate_input_or_ret(s->decode_frame_type >= DAV1D_DECODEFRAMETYPE_ALL && + s->decode_frame_type <= DAV1D_DECODEFRAMETYPE_KEY, DAV1D_ERR(EINVAL)); pthread_attr_t thread_attr; if (pthread_attr_init(&thread_attr)) return DAV1D_ERR(ENOMEM); @@ -151,13 +160,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { pthread_attr_setstacksize(&thread_attr, stack_size); -#if defined(_TARGET_C1) - -#elif defined(_TARGET_C2) - -#endif - - Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 64); + Dav1dContext *const c = *c_out = dav1d_alloc_aligned(ALLOC_COMMON_CTX, sizeof(*c), 64); if (!c) goto error; memset(c, 0, sizeof(*c)); @@ -170,14 +173,16 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { c->strict_std_compliance = s->strict_std_compliance; c->output_invisible_frames = s->output_invisible_frames; c->inloop_filters = s->inloop_filters; + c->decode_frame_type = s->decode_frame_type; dav1d_data_props_set_defaults(&c->cached_error_props); - if (dav1d_mem_pool_init(&c->seq_hdr_pool) || - dav1d_mem_pool_init(&c->frame_hdr_pool) || - dav1d_mem_pool_init(&c->segmap_pool) || - dav1d_mem_pool_init(&c->refmvs_pool) || - dav1d_mem_pool_init(&c->cdf_pool)) + if 
(dav1d_mem_pool_init(ALLOC_OBU_HDR, &c->seq_hdr_pool) || + dav1d_mem_pool_init(ALLOC_OBU_HDR, &c->frame_hdr_pool) || + dav1d_mem_pool_init(ALLOC_SEGMAP, &c->segmap_pool) || + dav1d_mem_pool_init(ALLOC_REFMVS, &c->refmvs_pool) || + dav1d_mem_pool_init(ALLOC_PIC_CTX, &c->pic_ctx_pool) || + dav1d_mem_pool_init(ALLOC_CDF, &c->cdf_pool)) { goto error; } @@ -186,7 +191,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { c->allocator.release_picture_callback == dav1d_default_picture_release) { if (c->allocator.cookie) goto error; - if (dav1d_mem_pool_init(&c->picture_pool)) goto error; + if (dav1d_mem_pool_init(ALLOC_PIC, &c->picture_pool)) goto error; c->allocator.cookie = c->picture_pool; } else if (c->allocator.alloc_picture_callback == dav1d_default_picture_alloc || c->allocator.release_picture_callback == dav1d_default_picture_release) @@ -210,11 +215,11 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { get_num_threads(c, s, &c->n_tc, &c->n_fc); - c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * c->n_fc, 32); + c->fc = dav1d_alloc_aligned(ALLOC_THREAD_CTX, sizeof(*c->fc) * c->n_fc, 32); if (!c->fc) goto error; memset(c->fc, 0, sizeof(*c->fc) * c->n_fc); - c->tc = dav1d_alloc_aligned(sizeof(*c->tc) * c->n_tc, 64); + c->tc = dav1d_alloc_aligned(ALLOC_THREAD_CTX, sizeof(*c->tc) * c->n_tc, 64); if (!c->tc) goto error; memset(c->tc, 0, sizeof(*c->tc) * c->n_tc); if (c->n_tc > 1) { @@ -235,9 +240,11 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { } if (c->n_fc > 1) { + const size_t out_delayed_sz = sizeof(*c->frame_thread.out_delayed) * c->n_fc; c->frame_thread.out_delayed = - calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed)); + dav1d_malloc(ALLOC_THREAD_CTX, out_delayed_sz); if (!c->frame_thread.out_delayed) goto error; + memset(c->frame_thread.out_delayed, 0, out_delayed_sz); } for (unsigned n = 0; n < c->n_fc; n++) { Dav1dFrameContext *const f = &c->fc[n]; @@ -279,14 
+286,9 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { t->task_thread.td.inited = 1; } } + dav1d_pal_dsp_init(&c->pal_dsp); dav1d_refmvs_dsp_init(&c->refmvs_dsp); - // intra edge tree - c->intra_edge.root[BL_128X128] = &c->intra_edge.branch_sb128[0].node; - dav1d_init_mode_tree(c->intra_edge.root[BL_128X128], c->intra_edge.tip_sb128, 1); - c->intra_edge.root[BL_64X64] = &c->intra_edge.branch_sb64[0].node; - dav1d_init_mode_tree(c->intra_edge.root[BL_64X64], c->intra_edge.tip_sb64, 0); - pthread_attr_destroy(&thread_attr); return 0; @@ -297,56 +299,6 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { return DAV1D_ERR(ENOMEM); } -static void dummy_free(const uint8_t *const data, void *const user_data) { - assert(data && !user_data); -} - -int dav1d_parse_sequence_header(Dav1dSequenceHeader *const out, - const uint8_t *const ptr, const size_t sz) -{ - Dav1dData buf = { 0 }; - int res; - - validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL)); - - Dav1dSettings s; - dav1d_default_settings(&s); - s.n_threads = 1; - s.logger.callback = NULL; - - Dav1dContext *c; - res = dav1d_open(&c, &s); - if (res < 0) return res; - - if (ptr) { - res = dav1d_data_wrap_internal(&buf, ptr, sz, dummy_free, NULL); - if (res < 0) goto error; - } - - while (buf.sz > 0) { - res = dav1d_parse_obus(c, &buf, 1); - if (res < 0) goto error; - - assert((size_t)res <= buf.sz); - buf.sz -= res; - buf.data += res; - } - - if (!c->seq_hdr) { - res = DAV1D_ERR(ENOENT); - goto error; - } - - memcpy(out, c->seq_hdr, sizeof(*out)); - - res = 0; -error: - dav1d_data_unref_internal(&buf); - dav1d_close(&c); - - return res; -} - static int has_grain(const Dav1dPicture *const pic) { const Dav1dFilmGrainData *fgdata = &pic->frame_hdr->film_grain.data; @@ -399,6 +351,7 @@ static int output_picture_ready(Dav1dContext *const c, const int drain) { static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) { unsigned drain_count = 0; + 
int drained = 0; do { const unsigned next = c->frame_thread.next; Dav1dFrameContext *const f = &c->fc[next]; @@ -418,6 +371,10 @@ static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) { &first, UINT_MAX); if (c->task_thread.cur && c->task_thread.cur < c->n_fc) c->task_thread.cur--; + drained = 1; + } else if (drained) { + pthread_mutex_unlock(&c->task_thread.lock); + break; } if (++c->frame_thread.next == c->n_fc) c->frame_thread.next = 0; @@ -453,14 +410,13 @@ static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) { static int gen_picture(Dav1dContext *const c) { - int res; Dav1dData *const in = &c->in; if (output_picture_ready(c, 0)) return 0; while (in->sz > 0) { - res = dav1d_parse_obus(c, in, 0); + const ptrdiff_t res = dav1d_parse_obus(c, in); if (res < 0) { dav1d_data_unref_internal(in); } else { @@ -472,7 +428,7 @@ static int gen_picture(Dav1dContext *const c) if (output_picture_ready(c, 0)) break; if (res < 0) - return res; + return (int)res; } return 0; @@ -482,10 +438,11 @@ int dav1d_send_data(Dav1dContext *const c, Dav1dData *const in) { validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL)); validate_input_or_ret(in != NULL, DAV1D_ERR(EINVAL)); - validate_input_or_ret(in->data == NULL || in->sz, DAV1D_ERR(EINVAL)); - if (in->data) + if (in->data) { + validate_input_or_ret(in->sz > 0 && in->sz <= SIZE_MAX / 2, DAV1D_ERR(EINVAL)); c->drain = 0; + } if (c->in.data) return DAV1D_ERR(EAGAIN); dav1d_data_ref(&c->in, in); @@ -567,16 +524,16 @@ int dav1d_apply_grain(Dav1dContext *const c, Dav1dPicture *const out, void dav1d_flush(Dav1dContext *const c) { dav1d_data_unref_internal(&c->in); - if (c->out.p.data[0]) + if (c->out.p.frame_hdr) dav1d_thread_picture_unref(&c->out); - if (c->cache.p.data[0]) + if (c->cache.p.frame_hdr) dav1d_thread_picture_unref(&c->cache); c->drain = 0; c->cached_error = 0; for (int i = 0; i < 8; i++) { - if (c->refs[i].p.p.data[0]) + if (c->refs[i].p.p.frame_hdr) 
dav1d_thread_picture_unref(&c->refs[i].p); dav1d_ref_dec(&c->refs[i].segmap); dav1d_ref_dec(&c->refs[i].refmvs); @@ -589,6 +546,7 @@ void dav1d_flush(Dav1dContext *const c) { c->mastering_display = NULL; c->content_light = NULL; c->itut_t35 = NULL; + c->n_itut_t35 = 0; dav1d_ref_dec(&c->mastering_display_ref); dav1d_ref_dec(&c->content_light_ref); dav1d_ref_dec(&c->itut_t35_ref); @@ -631,7 +589,7 @@ void dav1d_flush(Dav1dContext *const c) { f->n_tile_data = 0; f->task_thread.retval = 0; Dav1dThreadPicture *out_delayed = &c->frame_thread.out_delayed[next]; - if (out_delayed->p.data[0]) { + if (out_delayed->p.frame_hdr) { dav1d_thread_picture_unref(out_delayed); } } @@ -642,6 +600,9 @@ void dav1d_flush(Dav1dContext *const c) { COLD void dav1d_close(Dav1dContext **const c_out) { validate_input(c_out != NULL); +#if TRACK_HEAP_ALLOCATIONS + dav1d_log_alloc_stats(*c_out); +#endif close_internal(c_out, 1); } @@ -678,31 +639,31 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) { // clean-up threading stuff if (c->n_fc > 1) { - freep(&f->tile_thread.lowest_pixel_mem); - freep(&f->frame_thread.b); - dav1d_freep_aligned(&f->frame_thread.pal_idx); - dav1d_freep_aligned(&f->frame_thread.cf); - freep(&f->frame_thread.tile_start_off); - dav1d_freep_aligned(&f->frame_thread.pal); - freep(&f->frame_thread.cbi); + dav1d_free(f->tile_thread.lowest_pixel_mem); + dav1d_free(f->frame_thread.b); + dav1d_free_aligned(f->frame_thread.cbi); + dav1d_free_aligned(f->frame_thread.pal_idx); + dav1d_free_aligned(f->frame_thread.cf); + dav1d_free(f->frame_thread.tile_start_off); + dav1d_free_aligned(f->frame_thread.pal); } if (c->n_tc > 1) { pthread_mutex_destroy(&f->task_thread.pending_tasks.lock); pthread_cond_destroy(&f->task_thread.cond); pthread_mutex_destroy(&f->task_thread.lock); } - freep(&f->frame_thread.frame_progress); - freep(&f->task_thread.tasks); - freep(&f->task_thread.tile_tasks[0]); + dav1d_free(f->frame_thread.frame_progress); + 
dav1d_free(f->task_thread.tasks); + dav1d_free(f->task_thread.tile_tasks[0]); dav1d_free_aligned(f->ts); dav1d_free_aligned(f->ipred_edge[0]); - free(f->a); - free(f->tile); - free(f->lf.mask); - free(f->lf.lr_mask); - free(f->lf.level); - free(f->lf.tx_lpf_right_edge[0]); - free(f->lf.start_of_tile_row); + dav1d_free(f->a); + dav1d_free(f->tile); + dav1d_free(f->lf.mask); + dav1d_free(f->lf.level); + dav1d_free(f->lf.lr_mask); + dav1d_free(f->lf.tx_lpf_right_edge[0]); + dav1d_free(f->lf.start_of_tile_row); dav1d_refmvs_clear(&f->rf); dav1d_free_aligned(f->lf.cdef_line_buf); dav1d_free_aligned(f->lf.lr_line_buf); @@ -710,16 +671,16 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) { dav1d_free_aligned(c->fc); if (c->n_fc > 1 && c->frame_thread.out_delayed) { for (unsigned n = 0; n < c->n_fc; n++) - if (c->frame_thread.out_delayed[n].p.data[0]) + if (c->frame_thread.out_delayed[n].p.frame_hdr) dav1d_thread_picture_unref(&c->frame_thread.out_delayed[n]); - free(c->frame_thread.out_delayed); + dav1d_free(c->frame_thread.out_delayed); } for (int n = 0; n < c->n_tile_data; n++) dav1d_data_unref_internal(&c->tile[n].data); - free(c->tile); + dav1d_free(c->tile); for (int n = 0; n < 8; n++) { dav1d_cdf_thread_unref(&c->cdf[n]); - if (c->refs[n].p.p.data[0]) + if (c->refs[n].p.p.frame_hdr) dav1d_thread_picture_unref(&c->refs[n].p); dav1d_ref_dec(&c->refs[n].refmvs); dav1d_ref_dec(&c->refs[n].segmap); @@ -737,6 +698,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) { dav1d_mem_pool_end(c->refmvs_pool); dav1d_mem_pool_end(c->cdf_pool); dav1d_mem_pool_end(c->picture_pool); + dav1d_mem_pool_end(c->pic_ctx_pool); dav1d_freep_aligned(c_out); } diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/log.c b/prog/3rdPartyLibs/codecs/dav1d/src/log.c index de6776a61..a08f6eb68 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/log.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/log.c @@ -44,7 +44,7 @@ COLD void dav1d_log_default_callback(void *const 
cookie, } COLD void dav1d_log(Dav1dContext *const c, const char *const format, ...) { - validate_input(c != NULL); + assert(c != NULL); if (!c->logger.callback) return; diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/lr_apply_tmpl.c b/prog/3rdPartyLibs/codecs/dav1d/src/lr_apply_tmpl.c index c517f8982..ec0acdf60 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/lr_apply_tmpl.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/lr_apply_tmpl.c @@ -71,8 +71,9 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p, lr_fn = dsp->lr.wiener[!(filter[0][0] | filter[1][0])]; } else { - assert(lr->type == DAV1D_RESTORATION_SGRPROJ); - const uint16_t *const sgr_params = dav1d_sgr_params[lr->sgr_idx]; + assert(lr->type >= DAV1D_RESTORATION_SGRPROJ); + const int sgr_idx = lr->type - DAV1D_RESTORATION_SGRPROJ; + const uint16_t *const sgr_params = dav1d_sgr_params[sgr_idx]; params.sgr.s0 = sgr_params[0]; params.sgr.s1 = sgr_params[1]; params.sgr.w0 = lr->sgr_weights[0]; diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/mem.c b/prog/3rdPartyLibs/codecs/dav1d/src/mem.c index 558bc01ca..7e6eb4c06 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/mem.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/mem.c @@ -31,9 +31,208 @@ #include "src/internal.h" +#if TRACK_HEAP_ALLOCATIONS +#include + +#include "src/log.h" + +#define DEFAULT_ALIGN 16 + +typedef struct { + size_t sz; + unsigned align; + enum AllocationType type; +} Dav1dAllocationData; + +typedef struct { + size_t curr_sz; + size_t peak_sz; + unsigned num_allocs; + unsigned num_reuses; +} AllocStats; + +static AllocStats tracked_allocs[N_ALLOC_TYPES]; +static size_t curr_total_sz; +static size_t peak_total_sz; +static pthread_mutex_t track_alloc_mutex = PTHREAD_MUTEX_INITIALIZER; + +static void *track_alloc(const enum AllocationType type, char *ptr, + const size_t sz, const size_t align) +{ + assert(align >= sizeof(Dav1dAllocationData)); + if (ptr) { + ptr += align; + Dav1dAllocationData *const d = &((Dav1dAllocationData*)ptr)[-1]; + 
AllocStats *const s = &tracked_allocs[type]; + + d->sz = sz; + d->align = (unsigned)align; + d->type = type; + + pthread_mutex_lock(&track_alloc_mutex); + s->num_allocs++; + s->curr_sz += sz; + if (s->curr_sz > s->peak_sz) + s->peak_sz = s->curr_sz; + + curr_total_sz += sz; + if (curr_total_sz > peak_total_sz) + peak_total_sz = curr_total_sz; + pthread_mutex_unlock(&track_alloc_mutex); + } + return ptr; +} + +static void *track_free(char *const ptr) { + const Dav1dAllocationData *const d = &((Dav1dAllocationData*)ptr)[-1]; + const size_t sz = d->sz; + + pthread_mutex_lock(&track_alloc_mutex); + tracked_allocs[d->type].curr_sz -= sz; + curr_total_sz -= sz; + pthread_mutex_unlock(&track_alloc_mutex); + + return ptr - d->align; +} + +static void dav1d_track_reuse(const enum AllocationType type) { + pthread_mutex_lock(&track_alloc_mutex); + tracked_allocs[type].num_reuses++; + pthread_mutex_unlock(&track_alloc_mutex); +} + +void *dav1d_malloc(const enum AllocationType type, const size_t sz) { + void *const ptr = malloc(sz + DEFAULT_ALIGN); + return track_alloc(type, ptr, sz, DEFAULT_ALIGN); +} + +void *dav1d_alloc_aligned(const enum AllocationType type, + const size_t sz, const size_t align) +{ + assert(!(align & (align - 1))); + void *ptr; +#ifdef _WIN32 + ptr = _aligned_malloc(sz + align, align); +#elif defined(HAVE_POSIX_MEMALIGN) + if (posix_memalign(&ptr, align, sz + align)) return NULL; +#else + ptr = memalign(align, sz + align); +#endif + + return track_alloc(type, ptr, sz, align); +} + +void *dav1d_realloc(const enum AllocationType type, + void *ptr, const size_t sz) +{ + if (!ptr) + return dav1d_malloc(type, sz); + ptr = realloc((char*)ptr - DEFAULT_ALIGN, sz + DEFAULT_ALIGN); + if (ptr) + ptr = track_free((char*)ptr + DEFAULT_ALIGN); + return track_alloc(type, ptr, sz, DEFAULT_ALIGN); +} + +void dav1d_free(void *ptr) { + if (ptr) + free(track_free(ptr)); +} + +void dav1d_free_aligned(void *ptr) { + if (ptr) { + ptr = track_free(ptr); +#ifdef _WIN32 + 
_aligned_free(ptr); +#else + free(ptr); +#endif + } +} + +static COLD int cmp_stats(const void *const a, const void *const b) { + const size_t a_sz = ((const AllocStats*)a)->peak_sz; + const size_t b_sz = ((const AllocStats*)b)->peak_sz; + return a_sz < b_sz ? -1 : a_sz > b_sz; +} + +/* Insert spaces as thousands separators for better readability */ +static COLD int format_tsep(char *const s, const size_t n, const size_t value) { + if (value < 1000) + return snprintf(s, n, "%u", (unsigned)value); + + const int len = format_tsep(s, n, value / 1000); + assert((size_t)len < n); + return len + snprintf(s + len, n - len, " %03u", (unsigned)(value % 1000)); +} + +COLD void dav1d_log_alloc_stats(Dav1dContext *const c) { + static const char *const type_names[N_ALLOC_TYPES] = { + [ALLOC_BLOCK ] = "Block data", + [ALLOC_CDEF ] = "CDEF line buffers", + [ALLOC_CDF ] = "CDF contexts", + [ALLOC_COEF ] = "Coefficient data", + [ALLOC_COMMON_CTX] = "Common context data", + [ALLOC_DAV1DDATA ] = "Dav1dData", + [ALLOC_IPRED ] = "Intra pred edges", + [ALLOC_LF ] = "Loopfilter data", + [ALLOC_LR ] = "Looprestoration data", + [ALLOC_OBU_HDR ] = "OBU headers", + [ALLOC_OBU_META ] = "OBU metadata", + [ALLOC_PAL ] = "Palette data", + [ALLOC_PIC ] = "Picture buffers", + [ALLOC_PIC_CTX ] = "Picture context data", + [ALLOC_REFMVS ] = "Reference mv data", + [ALLOC_SEGMAP ] = "Segmentation maps", + [ALLOC_THREAD_CTX] = "Thread context data", + [ALLOC_TILE ] = "Tile data", + }; + + struct { + AllocStats stats; + enum AllocationType type; + } data[N_ALLOC_TYPES]; + unsigned total_allocs = 0; + unsigned total_reuses = 0; + + pthread_mutex_lock(&track_alloc_mutex); + for (int i = 0; i < N_ALLOC_TYPES; i++) { + AllocStats *const s = &data[i].stats; + *s = tracked_allocs[i]; + data[i].type = i; + total_allocs += s->num_allocs; + total_reuses += s->num_reuses; + } + size_t total_sz = peak_total_sz; + pthread_mutex_unlock(&track_alloc_mutex); + + /* Sort types by memory usage */ + qsort(&data, 
N_ALLOC_TYPES, sizeof(*data), cmp_stats); + + const double inv_total_share = 100.0 / total_sz; + char total_sz_buf[32]; + const int sz_len = 4 + format_tsep(total_sz_buf, sizeof(total_sz_buf), total_sz); + + dav1d_log(c, "\n Type Allocs Reuses Share Peak size\n" + "---------------------------------------------------------------------\n"); + for (int i = N_ALLOC_TYPES - 1; i >= 0; i--) { + const AllocStats *const s = &data[i].stats; + if (s->num_allocs) { + const double share = s->peak_sz * inv_total_share; + char sz_buf[32]; + format_tsep(sz_buf, sizeof(sz_buf), s->peak_sz); + dav1d_log(c, " %-20s%10u%10u%8.1f%%%*s\n", type_names[data[i].type], + s->num_allocs, s->num_reuses, share, sz_len, sz_buf); + } + } + dav1d_log(c, "---------------------------------------------------------------------\n" + "%31u%10u %s\n", + total_allocs, total_reuses, total_sz_buf); +} +#endif /* TRACK_HEAP_ALLOCATIONS */ + static COLD void mem_pool_destroy(Dav1dMemPool *const pool) { pthread_mutex_destroy(&pool->lock); - free(pool); + dav1d_free(pool); } void dav1d_mem_pool_push(Dav1dMemPool *const pool, Dav1dMemPoolBuffer *const buf) { @@ -66,10 +265,14 @@ Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *const pool, const size_t si dav1d_free_aligned(data); goto alloc; } +#if TRACK_HEAP_ALLOCATIONS + dav1d_track_reuse(pool->type); +#endif } else { pthread_mutex_unlock(&pool->lock); alloc: - data = dav1d_alloc_aligned(size + sizeof(Dav1dMemPoolBuffer), 64); + data = dav1d_alloc_aligned(pool->type, + size + sizeof(Dav1dMemPoolBuffer), 64); if (!data) { pthread_mutex_lock(&pool->lock); const int ref_cnt = --pool->ref_cnt; @@ -84,17 +287,23 @@ Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *const pool, const size_t si return buf; } -COLD int dav1d_mem_pool_init(Dav1dMemPool **const ppool) { - Dav1dMemPool *const pool = malloc(sizeof(Dav1dMemPool)); +COLD int dav1d_mem_pool_init(const enum AllocationType type, + Dav1dMemPool **const ppool) +{ + Dav1dMemPool *const pool = 
dav1d_malloc(ALLOC_COMMON_CTX, + sizeof(Dav1dMemPool)); if (pool) { if (!pthread_mutex_init(&pool->lock, NULL)) { pool->buf = NULL; pool->ref_cnt = 1; pool->end = 0; +#if TRACK_HEAP_ALLOCATIONS + pool->type = type; +#endif *ppool = pool; return 0; } - free(pool); + dav1d_free(pool); } *ppool = NULL; return DAV1D_ERR(ENOMEM); diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/mem.h b/prog/3rdPartyLibs/codecs/dav1d/src/mem.h index 41ae47a2f..0a8c18d70 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/mem.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/mem.h @@ -28,16 +28,42 @@ #ifndef DAV1D_SRC_MEM_H #define DAV1D_SRC_MEM_H +#define TRACK_HEAP_ALLOCATIONS 0 + #include -#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN) +#if defined(_WIN32) || !defined(HAVE_POSIX_MEMALIGN) #include #endif +#include "dav1d/dav1d.h" + #include "common/attributes.h" #include "src/thread.h" +enum AllocationType { + ALLOC_BLOCK, + ALLOC_CDEF, + ALLOC_CDF, + ALLOC_COEF, + ALLOC_COMMON_CTX, + ALLOC_DAV1DDATA, + ALLOC_IPRED, + ALLOC_LF, + ALLOC_LR, + ALLOC_OBU_HDR, + ALLOC_OBU_META, + ALLOC_PAL, + ALLOC_PIC, + ALLOC_PIC_CTX, + ALLOC_REFMVS, + ALLOC_SEGMAP, + ALLOC_THREAD_CTX, + ALLOC_TILE, + N_ALLOC_TYPES, +}; + typedef struct Dav1dMemPoolBuffer { void *data; struct Dav1dMemPoolBuffer *next; @@ -48,54 +74,62 @@ typedef struct Dav1dMemPool { Dav1dMemPoolBuffer *buf; int ref_cnt; int end; +#if TRACK_HEAP_ALLOCATIONS + enum AllocationType type; +#endif } Dav1dMemPool; -void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf); -Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *pool, size_t size); -int dav1d_mem_pool_init(Dav1dMemPool **pool); -void dav1d_mem_pool_end(Dav1dMemPool *pool); + +#if TRACK_HEAP_ALLOCATIONS +void *dav1d_malloc(enum AllocationType type, size_t sz); +void *dav1d_realloc(enum AllocationType type, void *ptr, size_t sz); +void *dav1d_alloc_aligned(enum AllocationType type, size_t sz, size_t align); +void dav1d_free(void *ptr); +void dav1d_free_aligned(void 
*ptr); +void dav1d_log_alloc_stats(Dav1dContext *c); +#else +#define dav1d_mem_pool_init(type, pool) dav1d_mem_pool_init(pool) +#define dav1d_malloc(type, sz) malloc(sz) +#define dav1d_realloc(type, ptr, sz) realloc(ptr, sz) +#define dav1d_free(ptr) free(ptr) /* * Allocate align-byte aligned memory. The return value can be released * by calling the dav1d_free_aligned() function. */ -static inline void *dav1d_alloc_aligned(size_t sz, size_t align) { +static inline void *dav1d_alloc_aligned(const size_t sz, const size_t align) { assert(!(align & (align - 1))); -#ifdef HAVE_POSIX_MEMALIGN +#ifdef _WIN32 + return _aligned_malloc(sz, align); +#elif defined(HAVE_POSIX_MEMALIGN) void *ptr; if (posix_memalign(&ptr, align, sz)) return NULL; return ptr; -#elif defined(HAVE_ALIGNED_MALLOC) - return _aligned_malloc(sz, align); -#elif defined(HAVE_MEMALIGN) - return memalign(align, sz); #else -#error Missing aligned alloc implementation + return memalign(align, sz); #endif } +#define dav1d_alloc_aligned(type, sz, align) dav1d_alloc_aligned(sz, align) -static inline void dav1d_free_aligned(void* ptr) { -#ifdef HAVE_POSIX_MEMALIGN - free(ptr); -#elif defined(HAVE_ALIGNED_MALLOC) +static inline void dav1d_free_aligned(void *ptr) { +#ifdef _WIN32 _aligned_free(ptr); -#elif defined(HAVE_MEMALIGN) +#else free(ptr); #endif } -static inline void dav1d_freep_aligned(void* ptr) { - void **mem = (void **) ptr; - if (*mem) { - dav1d_free_aligned(*mem); - *mem = NULL; - } -} +#endif /* TRACK_HEAP_ALLOCATIONS */ + +void dav1d_mem_pool_push(Dav1dMemPool *pool, Dav1dMemPoolBuffer *buf); +Dav1dMemPoolBuffer *dav1d_mem_pool_pop(Dav1dMemPool *pool, size_t size); +int dav1d_mem_pool_init(enum AllocationType type, Dav1dMemPool **pool); +void dav1d_mem_pool_end(Dav1dMemPool *pool); -static inline void freep(void *ptr) { +static inline void dav1d_freep_aligned(void *ptr) { void **mem = (void **) ptr; if (*mem) { - free(*mem); + dav1d_free_aligned(*mem); *mem = NULL; } } diff --git 
a/prog/3rdPartyLibs/codecs/dav1d/src/meson.build b/prog/3rdPartyLibs/codecs/dav1d/src/meson.build index 719015496..3a34e76a8 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/meson.build +++ b/prog/3rdPartyLibs/codecs/dav1d/src/meson.build @@ -37,16 +37,19 @@ libdav1d_sources = files( 'intra_edge.c', 'itx_1d.c', 'lf_mask.c', + 'lib.c', 'log.c', 'mem.c', 'msac.c', 'obu.c', + 'pal.c', 'picture.c', 'qm.c', 'ref.c', 'refmvs.c', 'scan.c', 'tables.c', + 'thread_task.c', 'warpmv.c', 'wedge.c', ) @@ -74,14 +77,6 @@ libdav1d_arch_tmpl_sources = [] libdav1d_bitdepth_objs = [] -# libdav1d entrypoint source files -# These source files contain library entry points and are -# built with the stack-realign flag set, where necessary. -libdav1d_entrypoints_sources = files( - 'lib.c', - 'thread_task.c' -) - # ASM specific sources libdav1d_asm_objs = [] # Arch-specific flags @@ -173,6 +168,7 @@ if is_asm_enabled libdav1d_sources_asm = files( 'x86/cpuid.asm', 'x86/msac.asm', + 'x86/pal.asm', 'x86/refmvs.asm', 'x86/itx_avx512.asm', 'x86/cdef_avx2.asm', @@ -245,7 +241,7 @@ endif libdav1d_rc_obj = [] -libdav1d_flags = [stackalign_flag] +libdav1d_flags = [] api_export_flags = [] # @@ -280,18 +276,6 @@ endif # Library definitions # -# Helper library for dav1d entrypoints -libdav1d_entrypoints_objs = static_library('dav1d_entrypoint', - libdav1d_entrypoints_sources, - rev_target, config_h_target, - - include_directories : dav1d_inc_dirs, - dependencies: [stdatomic_dependencies], - c_args : [libdav1d_flags, stackrealign_flag, api_export_flags], - install : false, - build_by_default : false, -).extract_all_objects(recursive: true) - # Helper library for each bitdepth libdav1d_bitdepth_objs = [] foreach bitdepth : dav1d_bitdepths @@ -330,10 +314,11 @@ libdav1d = library('dav1d', libdav1d_sources, libdav1d_asm_objs, libdav1d_rc_obj, + rev_target, + config_h_target, objects : [ libdav1d_bitdepth_objs, - libdav1d_entrypoints_objs ], include_directories : dav1d_inc_dirs, diff --git 
a/prog/3rdPartyLibs/codecs/dav1d/src/obu.c b/prog/3rdPartyLibs/codecs/dav1d/src/obu.c index bd117944a..78d652b4c 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/obu.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/obu.c @@ -35,6 +35,7 @@ #include "common/frame.h" #include "common/intops.h" +#include "common/validate.h" #include "src/decode.h" #include "src/getbits.h" @@ -44,8 +45,33 @@ #include "src/ref.h" #include "src/thread_task.h" -static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, - Dav1dSequenceHeader *const hdr) +static int check_trailing_bits(GetBits *const gb, + const int strict_std_compliance) +{ + const int trailing_one_bit = dav1d_get_bit(gb); + + if (gb->error) + return DAV1D_ERR(EINVAL); + + if (!strict_std_compliance) + return 0; + + if (!trailing_one_bit || gb->state) + return DAV1D_ERR(EINVAL); + + ptrdiff_t size = gb->ptr_end - gb->ptr; + while (size > 0 && gb->ptr[size - 1] == 0) + size--; + + if (size) + return DAV1D_ERR(EINVAL); + + return 0; +} + +static NOINLINE int parse_seq_hdr(Dav1dSequenceHeader *const hdr, + GetBits *const gb, + const int strict_std_compliance) { #define DEBUG_SEQ_HDR 0 @@ -53,6 +79,7 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, const unsigned init_bit_pos = dav1d_get_bits_pos(gb); #endif + memset(hdr, 0, sizeof(*hdr)); hdr->profile = dav1d_get_bits(gb, 3); if (hdr->profile > 2) goto error; #if DEBUG_SEQ_HDR @@ -60,8 +87,8 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, dav1d_get_bits_pos(gb) - init_bit_pos); #endif - hdr->still_picture = dav1d_get_bits(gb, 1); - hdr->reduced_still_picture_header = dav1d_get_bits(gb, 1); + hdr->still_picture = dav1d_get_bit(gb); + hdr->reduced_still_picture_header = dav1d_get_bit(gb); if (hdr->reduced_still_picture_header && !hdr->still_picture) goto error; #if DEBUG_SEQ_HDR printf("SEQHDR: post-stillpicture_flags: off=%u\n", @@ -69,22 +96,18 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, #endif if 
(hdr->reduced_still_picture_header) { - hdr->timing_info_present = 0; - hdr->decoder_model_info_present = 0; - hdr->display_model_info_present = 0; hdr->num_operating_points = 1; - hdr->operating_points[0].idc = 0; hdr->operating_points[0].major_level = dav1d_get_bits(gb, 3); hdr->operating_points[0].minor_level = dav1d_get_bits(gb, 2); - hdr->operating_points[0].tier = 0; - hdr->operating_points[0].decoder_model_param_present = 0; - hdr->operating_points[0].display_model_param_present = 0; + hdr->operating_points[0].initial_display_delay = 10; } else { - hdr->timing_info_present = dav1d_get_bits(gb, 1); + hdr->timing_info_present = dav1d_get_bit(gb); if (hdr->timing_info_present) { hdr->num_units_in_tick = dav1d_get_bits(gb, 32); hdr->time_scale = dav1d_get_bits(gb, 32); - hdr->equal_picture_interval = dav1d_get_bits(gb, 1); + if (strict_std_compliance && (!hdr->num_units_in_tick || !hdr->time_scale)) + goto error; + hdr->equal_picture_interval = dav1d_get_bit(gb); if (hdr->equal_picture_interval) { const unsigned num_ticks_per_picture = dav1d_get_vlc(gb); if (num_ticks_per_picture == 0xFFFFFFFFU) @@ -92,22 +115,22 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, hdr->num_ticks_per_picture = num_ticks_per_picture + 1; } - hdr->decoder_model_info_present = dav1d_get_bits(gb, 1); + hdr->decoder_model_info_present = dav1d_get_bit(gb); if (hdr->decoder_model_info_present) { hdr->encoder_decoder_buffer_delay_length = dav1d_get_bits(gb, 5) + 1; hdr->num_units_in_decoding_tick = dav1d_get_bits(gb, 32); + if (strict_std_compliance && !hdr->num_units_in_decoding_tick) + goto error; hdr->buffer_removal_delay_length = dav1d_get_bits(gb, 5) + 1; hdr->frame_presentation_delay_length = dav1d_get_bits(gb, 5) + 1; } - } else { - hdr->decoder_model_info_present = 0; } #if DEBUG_SEQ_HDR printf("SEQHDR: post-timinginfo: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif - hdr->display_model_info_present = dav1d_get_bits(gb, 1); + 
hdr->display_model_info_present = dav1d_get_bit(gb); hdr->num_operating_points = dav1d_get_bits(gb, 5) + 1; for (int i = 0; i < hdr->num_operating_points; i++) { struct Dav1dSequenceHeaderOperatingPoint *const op = @@ -117,23 +140,24 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, goto error; op->major_level = 2 + dav1d_get_bits(gb, 3); op->minor_level = dav1d_get_bits(gb, 2); - op->tier = op->major_level > 3 ? dav1d_get_bits(gb, 1) : 0; - op->decoder_model_param_present = - hdr->decoder_model_info_present && dav1d_get_bits(gb, 1); - if (op->decoder_model_param_present) { - struct Dav1dSequenceHeaderOperatingParameterInfo *const opi = - &hdr->operating_parameter_info[i]; - opi->decoder_buffer_delay = - dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length); - opi->encoder_buffer_delay = - dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length); - opi->low_delay_mode = dav1d_get_bits(gb, 1); - } - op->display_model_param_present = - hdr->display_model_info_present && dav1d_get_bits(gb, 1); - if (op->display_model_param_present) { - op->initial_display_delay = dav1d_get_bits(gb, 4) + 1; + if (op->major_level > 3) + op->tier = dav1d_get_bit(gb); + if (hdr->decoder_model_info_present) { + op->decoder_model_param_present = dav1d_get_bit(gb); + if (op->decoder_model_param_present) { + struct Dav1dSequenceHeaderOperatingParameterInfo *const opi = + &hdr->operating_parameter_info[i]; + opi->decoder_buffer_delay = + dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length); + opi->encoder_buffer_delay = + dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length); + opi->low_delay_mode = dav1d_get_bit(gb); + } } + if (hdr->display_model_info_present) + op->display_model_param_present = dav1d_get_bit(gb); + op->initial_display_delay = + op->display_model_param_present ? 
dav1d_get_bits(gb, 4) + 1 : 10; } #if DEBUG_SEQ_HDR printf("SEQHDR: post-operating-points: off=%u\n", @@ -141,12 +165,6 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, #endif } - const int op_idx = - c->operating_point < hdr->num_operating_points ? c->operating_point : 0; - c->operating_point_idc = hdr->operating_points[op_idx].idc; - const unsigned spatial_mask = c->operating_point_idc >> 8; - c->max_spatial_id = spatial_mask ? ulog2(spatial_mask) : 0; - hdr->width_n_bits = dav1d_get_bits(gb, 4) + 1; hdr->height_n_bits = dav1d_get_bits(gb, 4) + 1; hdr->max_width = dav1d_get_bits(gb, hdr->width_n_bits) + 1; @@ -155,67 +173,58 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, printf("SEQHDR: post-size: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif - hdr->frame_id_numbers_present = - hdr->reduced_still_picture_header ? 0 : dav1d_get_bits(gb, 1); - if (hdr->frame_id_numbers_present) { - hdr->delta_frame_id_n_bits = dav1d_get_bits(gb, 4) + 2; - hdr->frame_id_n_bits = dav1d_get_bits(gb, 3) + hdr->delta_frame_id_n_bits + 1; + if (!hdr->reduced_still_picture_header) { + hdr->frame_id_numbers_present = dav1d_get_bit(gb); + if (hdr->frame_id_numbers_present) { + hdr->delta_frame_id_n_bits = dav1d_get_bits(gb, 4) + 2; + hdr->frame_id_n_bits = dav1d_get_bits(gb, 3) + hdr->delta_frame_id_n_bits + 1; + } } #if DEBUG_SEQ_HDR printf("SEQHDR: post-frame-id-numbers-present: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif - hdr->sb128 = dav1d_get_bits(gb, 1); - hdr->filter_intra = dav1d_get_bits(gb, 1); - hdr->intra_edge_filter = dav1d_get_bits(gb, 1); + hdr->sb128 = dav1d_get_bit(gb); + hdr->filter_intra = dav1d_get_bit(gb); + hdr->intra_edge_filter = dav1d_get_bit(gb); if (hdr->reduced_still_picture_header) { - hdr->inter_intra = 0; - hdr->masked_compound = 0; - hdr->warped_motion = 0; - hdr->dual_filter = 0; - hdr->order_hint = 0; - hdr->jnt_comp = 0; - hdr->ref_frame_mvs = 0; - hdr->order_hint_n_bits = 0; 
hdr->screen_content_tools = DAV1D_ADAPTIVE; hdr->force_integer_mv = DAV1D_ADAPTIVE; } else { - hdr->inter_intra = dav1d_get_bits(gb, 1); - hdr->masked_compound = dav1d_get_bits(gb, 1); - hdr->warped_motion = dav1d_get_bits(gb, 1); - hdr->dual_filter = dav1d_get_bits(gb, 1); - hdr->order_hint = dav1d_get_bits(gb, 1); + hdr->inter_intra = dav1d_get_bit(gb); + hdr->masked_compound = dav1d_get_bit(gb); + hdr->warped_motion = dav1d_get_bit(gb); + hdr->dual_filter = dav1d_get_bit(gb); + hdr->order_hint = dav1d_get_bit(gb); if (hdr->order_hint) { - hdr->jnt_comp = dav1d_get_bits(gb, 1); - hdr->ref_frame_mvs = dav1d_get_bits(gb, 1); - } else { - hdr->jnt_comp = 0; - hdr->ref_frame_mvs = 0; - hdr->order_hint_n_bits = 0; + hdr->jnt_comp = dav1d_get_bit(gb); + hdr->ref_frame_mvs = dav1d_get_bit(gb); } - hdr->screen_content_tools = dav1d_get_bits(gb, 1) ? DAV1D_ADAPTIVE : dav1d_get_bits(gb, 1); + hdr->screen_content_tools = dav1d_get_bit(gb) ? DAV1D_ADAPTIVE : dav1d_get_bit(gb); #if DEBUG_SEQ_HDR printf("SEQHDR: post-screentools: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif hdr->force_integer_mv = hdr->screen_content_tools ? - dav1d_get_bits(gb, 1) ? DAV1D_ADAPTIVE : dav1d_get_bits(gb, 1) : 2; + dav1d_get_bit(gb) ? DAV1D_ADAPTIVE : dav1d_get_bit(gb) : 2; if (hdr->order_hint) hdr->order_hint_n_bits = dav1d_get_bits(gb, 3) + 1; } - hdr->super_res = dav1d_get_bits(gb, 1); - hdr->cdef = dav1d_get_bits(gb, 1); - hdr->restoration = dav1d_get_bits(gb, 1); + hdr->super_res = dav1d_get_bit(gb); + hdr->cdef = dav1d_get_bit(gb); + hdr->restoration = dav1d_get_bit(gb); #if DEBUG_SEQ_HDR printf("SEQHDR: post-featurebits: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif - hdr->hbd = dav1d_get_bits(gb, 1); - if (hdr->profile == 2 && hdr->hbd) hdr->hbd += dav1d_get_bits(gb, 1); - hdr->monochrome = hdr->profile != 1 ? 
dav1d_get_bits(gb, 1) : 0; - hdr->color_description_present = dav1d_get_bits(gb, 1); + hdr->hbd = dav1d_get_bit(gb); + if (hdr->profile == 2 && hdr->hbd) + hdr->hbd += dav1d_get_bit(gb); + if (hdr->profile != 1) + hdr->monochrome = dav1d_get_bit(gb); + hdr->color_description_present = dav1d_get_bit(gb); if (hdr->color_description_present) { hdr->pri = dav1d_get_bits(gb, 8); hdr->trc = dav1d_get_bits(gb, 8); @@ -226,76 +235,109 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, hdr->mtrx = DAV1D_MC_UNKNOWN; } if (hdr->monochrome) { - hdr->color_range = dav1d_get_bits(gb, 1); + hdr->color_range = dav1d_get_bit(gb); hdr->layout = DAV1D_PIXEL_LAYOUT_I400; hdr->ss_hor = hdr->ss_ver = 1; hdr->chr = DAV1D_CHR_UNKNOWN; - hdr->separate_uv_delta_q = 0; } else if (hdr->pri == DAV1D_COLOR_PRI_BT709 && hdr->trc == DAV1D_TRC_SRGB && hdr->mtrx == DAV1D_MC_IDENTITY) { hdr->layout = DAV1D_PIXEL_LAYOUT_I444; - hdr->ss_hor = hdr->ss_ver = 0; hdr->color_range = 1; if (hdr->profile != 1 && !(hdr->profile == 2 && hdr->hbd == 2)) goto error; } else { - hdr->color_range = dav1d_get_bits(gb, 1); + hdr->color_range = dav1d_get_bit(gb); switch (hdr->profile) { case 0: hdr->layout = DAV1D_PIXEL_LAYOUT_I420; hdr->ss_hor = hdr->ss_ver = 1; break; case 1: hdr->layout = DAV1D_PIXEL_LAYOUT_I444; - hdr->ss_hor = hdr->ss_ver = 0; break; case 2: if (hdr->hbd == 2) { - hdr->ss_hor = dav1d_get_bits(gb, 1); - hdr->ss_ver = hdr->ss_hor && dav1d_get_bits(gb, 1); - } else { + hdr->ss_hor = dav1d_get_bit(gb); + if (hdr->ss_hor) + hdr->ss_ver = dav1d_get_bit(gb); + } else hdr->ss_hor = 1; - hdr->ss_ver = 0; - } hdr->layout = hdr->ss_hor ? hdr->ss_ver ? DAV1D_PIXEL_LAYOUT_I420 : DAV1D_PIXEL_LAYOUT_I422 : DAV1D_PIXEL_LAYOUT_I444; break; } - hdr->chr = hdr->ss_hor == 1 && hdr->ss_ver == 1 ? + hdr->chr = (hdr->ss_hor & hdr->ss_ver) ? 
dav1d_get_bits(gb, 2) : DAV1D_CHR_UNKNOWN; } - if (c->strict_std_compliance && + if (strict_std_compliance && hdr->mtrx == DAV1D_MC_IDENTITY && hdr->layout != DAV1D_PIXEL_LAYOUT_I444) { goto error; } - hdr->separate_uv_delta_q = !hdr->monochrome && dav1d_get_bits(gb, 1); + if (!hdr->monochrome) + hdr->separate_uv_delta_q = dav1d_get_bit(gb); #if DEBUG_SEQ_HDR printf("SEQHDR: post-colorinfo: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif - hdr->film_grain_present = dav1d_get_bits(gb, 1); + hdr->film_grain_present = dav1d_get_bit(gb); #if DEBUG_SEQ_HDR printf("SEQHDR: post-filmgrain: off=%u\n", dav1d_get_bits_pos(gb) - init_bit_pos); #endif - dav1d_get_bits(gb, 1); // dummy bit - // We needn't bother flushing the OBU here: we'll check we didn't // overrun in the caller and will then discard gb, so there's no // point in setting its position properly. - return 0; + return check_trailing_bits(gb, strict_std_compliance); error: - dav1d_log(c, "Error parsing sequence header\n"); return DAV1D_ERR(EINVAL); } +int dav1d_parse_sequence_header(Dav1dSequenceHeader *const out, + const uint8_t *const ptr, const size_t sz) +{ + validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL)); + validate_input_or_ret(ptr != NULL, DAV1D_ERR(EINVAL)); + validate_input_or_ret(sz > 0 && sz <= SIZE_MAX / 2, DAV1D_ERR(EINVAL)); + + GetBits gb; + dav1d_init_get_bits(&gb, ptr, sz); + int res = DAV1D_ERR(ENOENT); + + do { + dav1d_get_bit(&gb); // obu_forbidden_bit + const enum Dav1dObuType type = dav1d_get_bits(&gb, 4); + const int has_extension = dav1d_get_bit(&gb); + const int has_length_field = dav1d_get_bit(&gb); + dav1d_get_bits(&gb, 1 + 8 * has_extension); // ignore + + const uint8_t *obu_end = gb.ptr_end; + if (has_length_field) { + const size_t len = dav1d_get_uleb128(&gb); + if (len > (size_t)(obu_end - gb.ptr)) return DAV1D_ERR(EINVAL); + obu_end = gb.ptr + len; + } + + if (type == DAV1D_OBU_SEQ_HDR) { + if ((res = parse_seq_hdr(out, &gb, 0)) < 0) return res; + if (gb.ptr > 
obu_end) return DAV1D_ERR(EINVAL); + dav1d_bytealign_get_bits(&gb); + } + + if (gb.error) return DAV1D_ERR(EINVAL); + assert(gb.state == 0 && gb.bits_left == 0); + gb.ptr = obu_end; + } while (gb.ptr < gb.ptr_end); + + return res; +} + static int read_frame_size(Dav1dContext *const c, GetBits *const gb, const int use_ref) { @@ -304,15 +346,15 @@ static int read_frame_size(Dav1dContext *const c, GetBits *const gb, if (use_ref) { for (int i = 0; i < 7; i++) { - if (dav1d_get_bits(gb, 1)) { + if (dav1d_get_bit(gb)) { const Dav1dThreadPicture *const ref = &c->refs[c->frame_hdr->refidx[i]].p; - if (!ref->p.data[0]) return -1; - hdr->width[1] = ref->p.p.w; - hdr->height = ref->p.p.h; + if (!ref->p.frame_hdr) return -1; + hdr->width[1] = ref->p.frame_hdr->width[1]; + hdr->height = ref->p.frame_hdr->height; hdr->render_width = ref->p.frame_hdr->render_width; hdr->render_height = ref->p.frame_hdr->render_height; - hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bits(gb, 1); + hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bit(gb); if (hdr->super_res.enabled) { const int d = hdr->super_res.width_scale_denominator = 9 + dav1d_get_bits(gb, 3); @@ -334,7 +376,7 @@ static int read_frame_size(Dav1dContext *const c, GetBits *const gb, hdr->width[1] = seqhdr->max_width; hdr->height = seqhdr->max_height; } - hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bits(gb, 1); + hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bit(gb); if (hdr->super_res.enabled) { const int d = hdr->super_res.width_scale_denominator = 9 + dav1d_get_bits(gb, 3); hdr->width[0] = imax((hdr->width[1] * 8 + (d >> 1)) / d, imin(16, hdr->width[1])); @@ -342,7 +384,7 @@ static int read_frame_size(Dav1dContext *const c, GetBits *const gb, hdr->super_res.width_scale_denominator = 8; hdr->width[0] = hdr->width[1]; } - hdr->have_render_size = dav1d_get_bits(gb, 1); + hdr->have_render_size = dav1d_get_bit(gb); if (hdr->have_render_size) { hdr->render_width = dav1d_get_bits(gb, 16) + 1; 
hdr->render_height = dav1d_get_bits(gb, 16) + 1; @@ -374,7 +416,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { Dav1dFrameHeader *const hdr = c->frame_hdr; hdr->show_existing_frame = - !seqhdr->reduced_still_picture_header && dav1d_get_bits(gb, 1); + !seqhdr->reduced_still_picture_header && dav1d_get_bit(gb); #if DEBUG_FRAME_HDR printf("HDR: post-show_existing_frame: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); @@ -392,27 +434,27 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { } hdr->frame_type = seqhdr->reduced_still_picture_header ? DAV1D_FRAME_TYPE_KEY : dav1d_get_bits(gb, 2); - hdr->show_frame = seqhdr->reduced_still_picture_header || dav1d_get_bits(gb, 1); + hdr->show_frame = seqhdr->reduced_still_picture_header || dav1d_get_bit(gb); if (hdr->show_frame) { if (seqhdr->decoder_model_info_present && !seqhdr->equal_picture_interval) hdr->frame_presentation_delay = dav1d_get_bits(gb, seqhdr->frame_presentation_delay_length); hdr->showable_frame = hdr->frame_type != DAV1D_FRAME_TYPE_KEY; } else - hdr->showable_frame = dav1d_get_bits(gb, 1); + hdr->showable_frame = dav1d_get_bit(gb); hdr->error_resilient_mode = (hdr->frame_type == DAV1D_FRAME_TYPE_KEY && hdr->show_frame) || hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH || - seqhdr->reduced_still_picture_header || dav1d_get_bits(gb, 1); + seqhdr->reduced_still_picture_header || dav1d_get_bit(gb); #if DEBUG_FRAME_HDR printf("HDR: post-frametype_bits: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif - hdr->disable_cdf_update = dav1d_get_bits(gb, 1); + hdr->disable_cdf_update = dav1d_get_bit(gb); hdr->allow_screen_content_tools = seqhdr->screen_content_tools == DAV1D_ADAPTIVE ? - dav1d_get_bits(gb, 1) : seqhdr->screen_content_tools; + dav1d_get_bit(gb) : seqhdr->screen_content_tools; if (hdr->allow_screen_content_tools) hdr->force_integer_mv = seqhdr->force_integer_mv == DAV1D_ADAPTIVE ? 
- dav1d_get_bits(gb, 1) : seqhdr->force_integer_mv; + dav1d_get_bit(gb) : seqhdr->force_integer_mv; else hdr->force_integer_mv = 0; @@ -423,7 +465,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits); hdr->frame_size_override = seqhdr->reduced_still_picture_header ? 0 : - hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ? 1 : dav1d_get_bits(gb, 1); + hdr->frame_type == DAV1D_FRAME_TYPE_SWITCH ? 1 : dav1d_get_bit(gb); #if DEBUG_FRAME_HDR printf("HDR: post-frame_size_override_flag: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); @@ -434,7 +476,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { dav1d_get_bits(gb, 3) : DAV1D_PRIMARY_REF_NONE; if (seqhdr->decoder_model_info_present) { - hdr->buffer_removal_time_present = dav1d_get_bits(gb, 1); + hdr->buffer_removal_time_present = dav1d_get_bit(gb); if (hdr->buffer_removal_time_present) { for (int i = 0; i < c->seq_hdr->num_operating_points; i++) { const struct Dav1dSequenceHeaderOperatingPoint *const seqop = &seqhdr->operating_points[i]; @@ -462,7 +504,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { } if (read_frame_size(c, gb, 0) < 0) goto error; hdr->allow_intrabc = hdr->allow_screen_content_tools && - !hdr->super_res.enabled && dav1d_get_bits(gb, 1); + !hdr->super_res.enabled && dav1d_get_bit(gb); hdr->use_ref_frame_mvs = 0; } else { hdr->allow_intrabc = 0; @@ -472,7 +514,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { for (int i = 0; i < 8; i++) dav1d_get_bits(gb, seqhdr->order_hint_n_bits); hdr->frame_ref_short_signaling = - seqhdr->order_hint && dav1d_get_bits(gb, 1); + seqhdr->order_hint && dav1d_get_bit(gb); if (hdr->frame_ref_short_signaling) { // FIXME: Nearly verbatim copy from section 7.8 hdr->refidx[0] = dav1d_get_bits(gb, 3); hdr->refidx[1] = hdr->refidx[2] = -1; @@ -567,8 +609,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits 
*const gb) { if (!hdr->frame_ref_short_signaling) hdr->refidx[i] = dav1d_get_bits(gb, 3); if (seqhdr->frame_id_numbers_present) { - const int delta_ref_frame_id_minus_1 = dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits); - const int ref_frame_id = (hdr->frame_id + (1 << seqhdr->frame_id_n_bits) - delta_ref_frame_id_minus_1 - 1) & ((1 << seqhdr->frame_id_n_bits) - 1); + const unsigned delta_ref_frame_id = dav1d_get_bits(gb, seqhdr->delta_frame_id_n_bits) + 1; + const unsigned ref_frame_id = (hdr->frame_id + (1 << seqhdr->frame_id_n_bits) - delta_ref_frame_id) & ((1 << seqhdr->frame_id_n_bits) - 1); Dav1dFrameHeader *const ref_frame_hdr = c->refs[hdr->refidx[i]].p.p.frame_hdr; if (!ref_frame_hdr || ref_frame_hdr->frame_id != ref_frame_id) goto error; } @@ -576,13 +618,13 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { const int use_ref = !hdr->error_resilient_mode && hdr->frame_size_override; if (read_frame_size(c, gb, use_ref) < 0) goto error; - hdr->hp = !hdr->force_integer_mv && dav1d_get_bits(gb, 1); - hdr->subpel_filter_mode = dav1d_get_bits(gb, 1) ? DAV1D_FILTER_SWITCHABLE : + hdr->hp = !hdr->force_integer_mv && dav1d_get_bit(gb); + hdr->subpel_filter_mode = dav1d_get_bit(gb) ? 
DAV1D_FILTER_SWITCHABLE : dav1d_get_bits(gb, 2); - hdr->switchable_motion_mode = dav1d_get_bits(gb, 1); + hdr->switchable_motion_mode = dav1d_get_bit(gb); hdr->use_ref_frame_mvs = !hdr->error_resilient_mode && seqhdr->ref_frame_mvs && seqhdr->order_hint && - IS_INTER_OR_SWITCH(hdr) && dav1d_get_bits(gb, 1); + IS_INTER_OR_SWITCH(hdr) && dav1d_get_bit(gb); } #if DEBUG_FRAME_HDR printf("HDR: post-frametype-specific-bits: off=%td\n", @@ -590,14 +632,14 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { #endif hdr->refresh_context = !seqhdr->reduced_still_picture_header && - !hdr->disable_cdf_update && !dav1d_get_bits(gb, 1); + !hdr->disable_cdf_update && !dav1d_get_bit(gb); #if DEBUG_FRAME_HDR printf("HDR: post-refresh_context: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif // tile data - hdr->tiling.uniform = dav1d_get_bits(gb, 1); + hdr->tiling.uniform = dav1d_get_bit(gb); const int sbsz_min1 = (64 << seqhdr->sb128) - 1; const int sbsz_log2 = 6 + seqhdr->sb128; const int sbw = (hdr->width[0] + sbsz_min1) >> sbsz_log2; @@ -611,7 +653,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { hdr->tiling.min_log2_cols); if (hdr->tiling.uniform) { for (hdr->tiling.log2_cols = hdr->tiling.min_log2_cols; - hdr->tiling.log2_cols < hdr->tiling.max_log2_cols && dav1d_get_bits(gb, 1); + hdr->tiling.log2_cols < hdr->tiling.max_log2_cols && dav1d_get_bit(gb); hdr->tiling.log2_cols++) ; const int tile_w = 1 + ((sbw - 1) >> hdr->tiling.log2_cols); hdr->tiling.cols = 0; @@ -621,7 +663,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { imax(min_log2_tiles - hdr->tiling.log2_cols, 0); for (hdr->tiling.log2_rows = hdr->tiling.min_log2_rows; - hdr->tiling.log2_rows < hdr->tiling.max_log2_rows && dav1d_get_bits(gb, 1); + hdr->tiling.log2_rows < hdr->tiling.max_log2_rows && dav1d_get_bit(gb); hdr->tiling.log2_rows++) ; const int tile_h = 1 + ((sbh - 1) >> hdr->tiling.log2_rows); hdr->tiling.rows = 0; @@ 
-663,7 +705,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { goto error; hdr->tiling.n_bytes = dav1d_get_bits(gb, 2) + 1; } else { - hdr->tiling.n_bytes = hdr->tiling.update = 0; + hdr->tiling.n_bytes = 0; + hdr->tiling.update = 0; } #if DEBUG_FRAME_HDR printf("HDR: post-tiling: off=%td\n", @@ -672,17 +715,17 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { // quant data hdr->quant.yac = dav1d_get_bits(gb, 8); - hdr->quant.ydc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0; + hdr->quant.ydc_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0; if (!seqhdr->monochrome) { // If the sequence header says that delta_q might be different // for U, V, we must check whether it actually is for this // frame. - const int diff_uv_delta = seqhdr->separate_uv_delta_q ? dav1d_get_bits(gb, 1) : 0; - hdr->quant.udc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0; - hdr->quant.uac_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0; + const int diff_uv_delta = seqhdr->separate_uv_delta_q ? dav1d_get_bit(gb) : 0; + hdr->quant.udc_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0; + hdr->quant.uac_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0; if (diff_uv_delta) { - hdr->quant.vdc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0; - hdr->quant.vac_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0; + hdr->quant.vdc_delta = dav1d_get_bit(gb) ? dav1d_get_sbits(gb, 7) : 0; + hdr->quant.vac_delta = dav1d_get_bit(gb) ? 
dav1d_get_sbits(gb, 7) : 0; } else { hdr->quant.vdc_delta = hdr->quant.udc_delta; hdr->quant.vac_delta = hdr->quant.uac_delta; @@ -692,12 +735,12 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { printf("HDR: post-quant: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif - hdr->quant.qm = dav1d_get_bits(gb, 1); + hdr->quant.qm = dav1d_get_bit(gb); if (hdr->quant.qm) { hdr->quant.qm_y = dav1d_get_bits(gb, 4); hdr->quant.qm_u = dav1d_get_bits(gb, 4); hdr->quant.qm_v = - seqhdr->separate_uv_delta_q ? (int)dav1d_get_bits(gb, 4) : + seqhdr->separate_uv_delta_q ? dav1d_get_bits(gb, 4) : hdr->quant.qm_u; } #if DEBUG_FRAME_HDR @@ -706,17 +749,17 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { #endif // segmentation data - hdr->segmentation.enabled = dav1d_get_bits(gb, 1); + hdr->segmentation.enabled = dav1d_get_bit(gb); if (hdr->segmentation.enabled) { if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) { hdr->segmentation.update_map = 1; hdr->segmentation.temporal = 0; hdr->segmentation.update_data = 1; } else { - hdr->segmentation.update_map = dav1d_get_bits(gb, 1); + hdr->segmentation.update_map = dav1d_get_bit(gb); hdr->segmentation.temporal = - hdr->segmentation.update_map ? dav1d_get_bits(gb, 1) : 0; - hdr->segmentation.update_data = dav1d_get_bits(gb, 1); + hdr->segmentation.update_map ? 
dav1d_get_bit(gb) : 0; + hdr->segmentation.update_data = dav1d_get_bit(gb); } if (hdr->segmentation.update_data) { @@ -725,48 +768,48 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++) { Dav1dSegmentationData *const seg = &hdr->segmentation.seg_data.d[i]; - if (dav1d_get_bits(gb, 1)) { - seg->delta_q = dav1d_get_sbits(gb, 8); + if (dav1d_get_bit(gb)) { + seg->delta_q = dav1d_get_sbits(gb, 9); hdr->segmentation.seg_data.last_active_segid = i; } else { seg->delta_q = 0; } - if (dav1d_get_bits(gb, 1)) { - seg->delta_lf_y_v = dav1d_get_sbits(gb, 6); + if (dav1d_get_bit(gb)) { + seg->delta_lf_y_v = dav1d_get_sbits(gb, 7); hdr->segmentation.seg_data.last_active_segid = i; } else { seg->delta_lf_y_v = 0; } - if (dav1d_get_bits(gb, 1)) { - seg->delta_lf_y_h = dav1d_get_sbits(gb, 6); + if (dav1d_get_bit(gb)) { + seg->delta_lf_y_h = dav1d_get_sbits(gb, 7); hdr->segmentation.seg_data.last_active_segid = i; } else { seg->delta_lf_y_h = 0; } - if (dav1d_get_bits(gb, 1)) { - seg->delta_lf_u = dav1d_get_sbits(gb, 6); + if (dav1d_get_bit(gb)) { + seg->delta_lf_u = dav1d_get_sbits(gb, 7); hdr->segmentation.seg_data.last_active_segid = i; } else { seg->delta_lf_u = 0; } - if (dav1d_get_bits(gb, 1)) { - seg->delta_lf_v = dav1d_get_sbits(gb, 6); + if (dav1d_get_bit(gb)) { + seg->delta_lf_v = dav1d_get_sbits(gb, 7); hdr->segmentation.seg_data.last_active_segid = i; } else { seg->delta_lf_v = 0; } - if (dav1d_get_bits(gb, 1)) { + if (dav1d_get_bit(gb)) { seg->ref = dav1d_get_bits(gb, 3); hdr->segmentation.seg_data.last_active_segid = i; hdr->segmentation.seg_data.preskip = 1; } else { seg->ref = -1; } - if ((seg->skip = dav1d_get_bits(gb, 1))) { + if ((seg->skip = dav1d_get_bit(gb))) { hdr->segmentation.seg_data.last_active_segid = i; hdr->segmentation.seg_data.preskip = 1; } - if ((seg->globalmv = dav1d_get_bits(gb, 1))) { + if ((seg->globalmv = dav1d_get_bit(gb))) { hdr->segmentation.seg_data.last_active_segid = i; 
hdr->segmentation.seg_data.preskip = 1; } @@ -791,12 +834,12 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { #endif // delta q - hdr->delta.q.present = hdr->quant.yac ? dav1d_get_bits(gb, 1) : 0; + hdr->delta.q.present = hdr->quant.yac ? dav1d_get_bit(gb) : 0; hdr->delta.q.res_log2 = hdr->delta.q.present ? dav1d_get_bits(gb, 2) : 0; hdr->delta.lf.present = hdr->delta.q.present && !hdr->allow_intrabc && - dav1d_get_bits(gb, 1); + dav1d_get_bit(gb); hdr->delta.lf.res_log2 = hdr->delta.lf.present ? dav1d_get_bits(gb, 2) : 0; - hdr->delta.lf.multi = hdr->delta.lf.present ? dav1d_get_bits(gb, 1) : 0; + hdr->delta.lf.multi = hdr->delta.lf.present ? dav1d_get_bit(gb) : 0; #if DEBUG_FRAME_HDR printf("HDR: post-delta_q_lf_flags: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); @@ -842,18 +885,18 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { hdr->loopfilter.mode_ref_deltas = c->refs[ref].p.p.frame_hdr->loopfilter.mode_ref_deltas; } - hdr->loopfilter.mode_ref_delta_enabled = dav1d_get_bits(gb, 1); + hdr->loopfilter.mode_ref_delta_enabled = dav1d_get_bit(gb); if (hdr->loopfilter.mode_ref_delta_enabled) { - hdr->loopfilter.mode_ref_delta_update = dav1d_get_bits(gb, 1); + hdr->loopfilter.mode_ref_delta_update = dav1d_get_bit(gb); if (hdr->loopfilter.mode_ref_delta_update) { for (int i = 0; i < 8; i++) - if (dav1d_get_bits(gb, 1)) + if (dav1d_get_bit(gb)) hdr->loopfilter.mode_ref_deltas.ref_delta[i] = - dav1d_get_sbits(gb, 6); + dav1d_get_sbits(gb, 7); for (int i = 0; i < 2; i++) - if (dav1d_get_bits(gb, 1)) + if (dav1d_get_bit(gb)) hdr->loopfilter.mode_ref_deltas.mode_delta[i] = - dav1d_get_sbits(gb, 6); + dav1d_get_sbits(gb, 7); } } } @@ -899,16 +942,16 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { { // Log2 of the restoration unit size. 
hdr->restoration.unit_size[0] = 6 + seqhdr->sb128; - if (dav1d_get_bits(gb, 1)) { + if (dav1d_get_bit(gb)) { hdr->restoration.unit_size[0]++; if (!seqhdr->sb128) - hdr->restoration.unit_size[0] += dav1d_get_bits(gb, 1); + hdr->restoration.unit_size[0] += dav1d_get_bit(gb); } hdr->restoration.unit_size[1] = hdr->restoration.unit_size[0]; if ((hdr->restoration.type[1] || hdr->restoration.type[2]) && seqhdr->ss_hor == 1 && seqhdr->ss_ver == 1) { - hdr->restoration.unit_size[1] -= dav1d_get_bits(gb, 1); + hdr->restoration.unit_size[1] -= dav1d_get_bit(gb); } } else { hdr->restoration.unit_size[0] = 8; @@ -924,12 +967,12 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { #endif hdr->txfm_mode = hdr->all_lossless ? DAV1D_TX_4X4_ONLY : - dav1d_get_bits(gb, 1) ? DAV1D_TX_SWITCHABLE : DAV1D_TX_LARGEST; + dav1d_get_bit(gb) ? DAV1D_TX_SWITCHABLE : DAV1D_TX_LARGEST; #if DEBUG_FRAME_HDR printf("HDR: post-txfmmode: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif - hdr->switchable_comp_refs = IS_INTER_OR_SWITCH(hdr) ? dav1d_get_bits(gb, 1) : 0; + hdr->switchable_comp_refs = IS_INTER_OR_SWITCH(hdr) ? 
dav1d_get_bit(gb) : 0; #if DEBUG_FRAME_HDR printf("HDR: post-refmode: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); @@ -941,7 +984,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { int off_after = -1; int off_before_idx, off_after_idx; for (int i = 0; i < 7; i++) { - if (!c->refs[hdr->refidx[i]].p.p.data[0]) goto error; + if (!c->refs[hdr->refidx[i]].p.p.frame_hdr) goto error; const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset; const int diff = get_poc_diff(seqhdr->order_hint_n_bits, refpoc, poc); @@ -969,7 +1012,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { unsigned off_before2 = 0xFFFFFFFFU; int off_before2_idx; for (int i = 0; i < 7; i++) { - if (!c->refs[hdr->refidx[i]].p.p.data[0]) goto error; + if (!c->refs[hdr->refidx[i]].p.p.frame_hdr) goto error; const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset; if (get_poc_diff(seqhdr->order_hint_n_bits, refpoc, off_before) < 0) { @@ -990,18 +1033,18 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { } } } - hdr->skip_mode_enabled = hdr->skip_mode_allowed ? dav1d_get_bits(gb, 1) : 0; + hdr->skip_mode_enabled = hdr->skip_mode_allowed ? 
dav1d_get_bit(gb) : 0; #if DEBUG_FRAME_HDR printf("HDR: post-extskip: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif hdr->warp_motion = !hdr->error_resilient_mode && IS_INTER_OR_SWITCH(hdr) && - seqhdr->warped_motion && dav1d_get_bits(gb, 1); + seqhdr->warped_motion && dav1d_get_bit(gb); #if DEBUG_FRAME_HDR printf("HDR: post-warpmotionbit: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); #endif - hdr->reduced_txtp_set = dav1d_get_bits(gb, 1); + hdr->reduced_txtp_set = dav1d_get_bit(gb); #if DEBUG_FRAME_HDR printf("HDR: post-reducedtxtpset: off=%td\n", (gb->ptr - init_ptr) * 8 - gb->bits_left); @@ -1012,9 +1055,9 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { if (IS_INTER_OR_SWITCH(hdr)) { for (int i = 0; i < 7; i++) { - hdr->gmv[i].type = !dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_IDENTITY : - dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_ROT_ZOOM : - dav1d_get_bits(gb, 1) ? DAV1D_WM_TYPE_TRANSLATION : + hdr->gmv[i].type = !dav1d_get_bit(gb) ? DAV1D_WM_TYPE_IDENTITY : + dav1d_get_bit(gb) ? DAV1D_WM_TYPE_ROT_ZOOM : + dav1d_get_bit(gb) ? 
DAV1D_WM_TYPE_TRANSLATION : DAV1D_WM_TYPE_AFFINE; if (hdr->gmv[i].type == DAV1D_WM_TYPE_IDENTITY) continue; @@ -1063,10 +1106,10 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { hdr->film_grain.present = seqhdr->film_grain_present && (hdr->show_frame || hdr->showable_frame) && - dav1d_get_bits(gb, 1); + dav1d_get_bit(gb); if (hdr->film_grain.present) { const unsigned seed = dav1d_get_bits(gb, 16); - hdr->film_grain.update = hdr->frame_type != DAV1D_FRAME_TYPE_INTER || dav1d_get_bits(gb, 1); + hdr->film_grain.update = hdr->frame_type != DAV1D_FRAME_TYPE_INTER || dav1d_get_bit(gb); if (!hdr->film_grain.update) { const int refidx = dav1d_get_bits(gb, 3); int i; @@ -1090,7 +1133,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { } fgd->chroma_scaling_from_luma = - !seqhdr->monochrome && dav1d_get_bits(gb, 1); + !seqhdr->monochrome && dav1d_get_bit(gb); if (seqhdr->monochrome || fgd->chroma_scaling_from_luma || (seqhdr->ss_ver == 1 && seqhdr->ss_hor == 1 && !fgd->num_y_points)) { @@ -1134,8 +1177,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { fgd->uv_luma_mult[pl] = dav1d_get_bits(gb, 8) - 128; fgd->uv_offset[pl] = dav1d_get_bits(gb, 9) - 256; } - fgd->overlap_flag = dav1d_get_bits(gb, 1); - fgd->clip_to_restricted_range = dav1d_get_bits(gb, 1); + fgd->overlap_flag = dav1d_get_bit(gb); + fgd->clip_to_restricted_range = dav1d_get_bit(gb); } } else { memset(&hdr->film_grain.data, 0, sizeof(hdr->film_grain.data)); @@ -1154,7 +1197,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { static void parse_tile_hdr(Dav1dContext *const c, GetBits *const gb) { const int n_tiles = c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows; - const int have_tile_pos = n_tiles > 1 ? dav1d_get_bits(gb, 1) : 0; + const int have_tile_pos = n_tiles > 1 ? 
dav1d_get_bit(gb) : 0; if (have_tile_pos) { const int n_bits = c->frame_hdr->tiling.log2_cols + @@ -1167,44 +1210,19 @@ static void parse_tile_hdr(Dav1dContext *const c, GetBits *const gb) { } } -// Check that we haven't read more than obu_len bytes from the buffer -// since init_bit_pos. -static int check_for_overrun(Dav1dContext *const c, GetBits *const gb, - const unsigned init_bit_pos, - const unsigned obu_len) -{ - // Make sure we haven't actually read past the end of the gb buffer - if (gb->error) { - dav1d_log(c, "Overrun in OBU bit buffer\n"); - return 1; - } - - const unsigned pos = dav1d_get_bits_pos(gb); - - // We assume that init_bit_pos was the bit position of the buffer - // at some point in the past, so cannot be smaller than pos. - assert (init_bit_pos <= pos); - - if (pos - init_bit_pos > 8 * obu_len) { - dav1d_log(c, "Overrun in OBU bit buffer into next OBU\n"); - return 1; - } - - return 0; -} - -int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int global) { +ptrdiff_t dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) { GetBits gb; int res; dav1d_init_get_bits(&gb, in->data, in->sz); // obu header - dav1d_get_bits(&gb, 1); // obu_forbidden_bit + const int obu_forbidden_bit = dav1d_get_bit(&gb); + if (c->strict_std_compliance && obu_forbidden_bit) goto error; const enum Dav1dObuType type = dav1d_get_bits(&gb, 4); - const int has_extension = dav1d_get_bits(&gb, 1); - const int has_length_field = dav1d_get_bits(&gb, 1); - dav1d_get_bits(&gb, 1); // reserved + const int has_extension = dav1d_get_bit(&gb); + const int has_length_field = dav1d_get_bit(&gb); + dav1d_get_bit(&gb); // reserved int temporal_id = 0, spatial_id = 0; if (has_extension) { @@ -1213,27 +1231,17 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa dav1d_get_bits(&gb, 3); // reserved } - // obu length field - const unsigned len = has_length_field ? 
- dav1d_get_uleb128(&gb) : (unsigned) in->sz - 1 - has_extension; + if (has_length_field) { + const size_t len = dav1d_get_uleb128(&gb); + if (len > (size_t)(gb.ptr_end - gb.ptr)) goto error; + gb.ptr_end = gb.ptr + len; + } if (gb.error) goto error; - const unsigned init_bit_pos = dav1d_get_bits_pos(&gb); - const unsigned init_byte_pos = init_bit_pos >> 3; - // We must have read a whole number of bytes at this point (1 byte // for the header and whole bytes at a time when reading the // leb128 length field). - assert((init_bit_pos & 7) == 0); - - // We also know that we haven't tried to read more than in->sz - // bytes yet (otherwise the error flag would have been set by the - // code in getbits.c) - assert(in->sz >= init_byte_pos); - - // Make sure that there are enough bits left in the buffer for the - // rest of the OBU. - if (len > in->sz - init_byte_pos) goto error; + assert(gb.bits_left == 0); // skip obu not belonging to the selected temporal/spatial layer if (type != DAV1D_OBU_SEQ_HDR && type != DAV1D_OBU_TD && @@ -1242,7 +1250,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa const int in_temporal_layer = (c->operating_point_idc >> temporal_id) & 1; const int in_spatial_layer = (c->operating_point_idc >> (spatial_id + 8)) & 1; if (!in_temporal_layer || !in_spatial_layer) - return len + init_byte_pos; + return gb.ptr_end - gb.ptr_start; } switch (type) { @@ -1251,15 +1259,18 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa sizeof(Dav1dSequenceHeader)); if (!ref) return DAV1D_ERR(ENOMEM); Dav1dSequenceHeader *seq_hdr = ref->data; - memset(seq_hdr, 0, sizeof(*seq_hdr)); - if ((res = parse_seq_hdr(c, &gb, seq_hdr)) < 0) { - dav1d_ref_dec(&ref); - goto error; - } - if (check_for_overrun(c, &gb, init_bit_pos, len)) { + if ((res = parse_seq_hdr(seq_hdr, &gb, c->strict_std_compliance)) < 0) { + dav1d_log(c, "Error parsing sequence header\n"); dav1d_ref_dec(&ref); goto error; } + + const int 
op_idx = + c->operating_point < seq_hdr->num_operating_points ? c->operating_point : 0; + c->operating_point_idc = seq_hdr->operating_points[op_idx].idc; + const unsigned spatial_mask = c->operating_point_idc >> 8; + c->max_spatial_id = spatial_mask ? ulog2(spatial_mask) : 0; + // If we have read a sequence header which is different from // the old one, this is a new video sequence and can't use any // previous state. Free that state. @@ -1276,7 +1287,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa dav1d_ref_dec(&c->mastering_display_ref); dav1d_ref_dec(&c->content_light_ref); for (int i = 0; i < 8; i++) { - if (c->refs[i].p.p.data[0]) + if (c->refs[i].p.p.frame_hdr) dav1d_thread_picture_unref(&c->refs[i].p); dav1d_ref_dec(&c->refs[i].segmap); dav1d_ref_dec(&c->refs[i].refmvs); @@ -1299,7 +1310,6 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa // fall-through case DAV1D_OBU_FRAME: case DAV1D_OBU_FRAME_HDR: - if (global) break; if (!c->seq_hdr) goto error; if (!c->frame_hdr_ref) { c->frame_hdr_ref = dav1d_ref_create_using_pool(c->frame_hdr_pool, @@ -1325,8 +1335,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa if (type != DAV1D_OBU_FRAME) { // This is actually a frame header OBU so read the // trailing bit and check for overrun. 
- dav1d_get_bits(&gb, 1); - if (check_for_overrun(c, &gb, init_bit_pos, len)) { + if (check_trailing_bits(&gb, c->strict_std_compliance) < 0) { c->frame_hdr = NULL; goto error; } @@ -1355,11 +1364,11 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa dav1d_bytealign_get_bits(&gb); // fall-through case DAV1D_OBU_TILE_GRP: { - if (global) break; if (!c->frame_hdr) goto error; if (c->n_tile_data_alloc < c->n_tile_data + 1) { if ((c->n_tile_data + 1) > INT_MAX / (int)sizeof(*c->tile)) goto error; - struct Dav1dTileGroup *tile = realloc(c->tile, (c->n_tile_data + 1) * sizeof(*c->tile)); + struct Dav1dTileGroup *tile = dav1d_realloc(ALLOC_TILE, c->tile, + (c->n_tile_data + 1) * sizeof(*c->tile)); if (!tile) goto error; c->tile = tile; memset(c->tile + c->n_tile_data, 0, sizeof(*c->tile)); @@ -1368,18 +1377,11 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa parse_tile_hdr(c, &gb); // Align to the next byte boundary and check for overrun. dav1d_bytealign_get_bits(&gb); - if (check_for_overrun(c, &gb, init_bit_pos, len)) - goto error; - // The current bit position is a multiple of 8 (because we - // just aligned it) and less than 8*pkt_bytelen because - // otherwise the overrun check would have fired. 
- const unsigned pkt_bytelen = init_byte_pos + len; - const unsigned bit_pos = dav1d_get_bits_pos(&gb); - assert((bit_pos & 7) == 0); - assert(pkt_bytelen >= (bit_pos >> 3)); + if (gb.error) goto error; + dav1d_data_ref(&c->tile[c->n_tile_data].data, in); - c->tile[c->n_tile_data].data.data += bit_pos >> 3; - c->tile[c->n_tile_data].data.sz = pkt_bytelen - (bit_pos >> 3); + c->tile[c->n_tile_data].data.data = gb.ptr; + c->tile[c->n_tile_data].data.sz = (size_t)(gb.ptr_end - gb.ptr); // ensure tile groups are in order and sane, see 6.10.1 if (c->tile[c->n_tile_data].start > c->tile[c->n_tile_data].end || c->tile[c->n_tile_data].start != c->n_tiles) @@ -1402,12 +1404,12 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa #endif // obu metadta type field const enum ObuMetaType meta_type = dav1d_get_uleb128(&gb); - const int meta_type_len = (dav1d_get_bits_pos(&gb) - init_bit_pos) >> 3; if (gb.error) goto error; switch (meta_type) { case OBU_META_HDR_CLL: { - Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dContentLightLevel)); + Dav1dRef *ref = dav1d_ref_create(ALLOC_OBU_META, + sizeof(Dav1dContentLightLevel)); if (!ref) return DAV1D_ERR(ENOMEM); Dav1dContentLightLevel *const content_light = ref->data; @@ -1424,10 +1426,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa (gb.ptr - init_ptr) * 8 - gb.bits_left); #endif - // Skip the trailing bit, align to the next byte boundary and check for overrun. 
- dav1d_get_bits(&gb, 1); - dav1d_bytealign_get_bits(&gb); - if (check_for_overrun(c, &gb, init_bit_pos, len)) { + if (check_trailing_bits(&gb, c->strict_std_compliance) < 0) { dav1d_ref_dec(&ref); goto error; } @@ -1438,7 +1437,8 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa break; } case OBU_META_HDR_MDCV: { - Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dMasteringDisplay)); + Dav1dRef *ref = dav1d_ref_create(ALLOC_OBU_META, + sizeof(Dav1dMasteringDisplay)); if (!ref) return DAV1D_ERR(ENOMEM); Dav1dMasteringDisplay *const mastering_display = ref->data; @@ -1476,10 +1476,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa mastering_display->min_luminance, (gb.ptr - init_ptr) * 8 - gb.bits_left); #endif - // Skip the trailing bit, align to the next byte boundary and check for overrun. - dav1d_get_bits(&gb, 1); - dav1d_bytealign_get_bits(&gb); - if (check_for_overrun(c, &gb, init_bit_pos, len)) { + if (check_trailing_bits(&gb, c->strict_std_compliance) < 0) { dav1d_ref_dec(&ref); goto error; } @@ -1490,15 +1487,12 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa break; } case OBU_META_ITUT_T35: { - int payload_size = len; + ptrdiff_t payload_size = gb.ptr_end - gb.ptr; // Don't take into account all the trailing bits for payload_size - while (payload_size > 0 && !in->data[init_byte_pos + payload_size - 1]) + while (payload_size > 0 && !gb.ptr[payload_size - 1]) payload_size--; // trailing_zero_bit x 8 payload_size--; // trailing_one_bit + trailing_zero_bit x 7 - // Don't take into account meta_type bytes - payload_size -= meta_type_len; - int country_code_extension_byte = 0; const int country_code = dav1d_get_bits(&gb, 8); payload_size--; @@ -1507,27 +1501,47 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa payload_size--; } - if (payload_size <= 0) { + if (payload_size <= 0 || gb.ptr[payload_size] != 0x80) { dav1d_log(c, 
"Malformed ITU-T T.35 metadata message format\n"); break; } - Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dITUTT35) + payload_size * sizeof(uint8_t)); - if (!ref) return DAV1D_ERR(ENOMEM); - Dav1dITUTT35 *const itut_t35_metadata = ref->data; + if ((c->n_itut_t35 + 1) > INT_MAX / (int)sizeof(*c->itut_t35)) goto error; + struct Dav1dITUTT35 *itut_t35 = dav1d_realloc(ALLOC_OBU_META, c->itut_t35, + (c->n_itut_t35 + 1) * sizeof(*c->itut_t35)); + if (!itut_t35) goto error; + c->itut_t35 = itut_t35; + memset(c->itut_t35 + c->n_itut_t35, 0, sizeof(*c->itut_t35)); + + struct itut_t35_ctx_context *itut_t35_ctx; + if (!c->n_itut_t35) { + assert(!c->itut_t35_ref); + itut_t35_ctx = dav1d_malloc(ALLOC_OBU_META, sizeof(struct itut_t35_ctx_context)); + if (!itut_t35_ctx) goto error; + c->itut_t35_ref = dav1d_ref_init(&itut_t35_ctx->ref, c->itut_t35, + dav1d_picture_free_itut_t35, itut_t35_ctx, 0); + } else { + assert(c->itut_t35_ref && atomic_load(&c->itut_t35_ref->ref_cnt) == 1); + itut_t35_ctx = c->itut_t35_ref->user_data; + c->itut_t35_ref->const_data = (uint8_t *)c->itut_t35; + } + itut_t35_ctx->itut_t35 = c->itut_t35; + itut_t35_ctx->n_itut_t35 = c->n_itut_t35 + 1; + + Dav1dITUTT35 *const itut_t35_metadata = &c->itut_t35[c->n_itut_t35]; + itut_t35_metadata->payload = dav1d_malloc(ALLOC_OBU_META, payload_size); + if (!itut_t35_metadata->payload) goto error; - // We need our public headers to be C++ compatible, so payload can't be - // a flexible array member - itut_t35_metadata->payload = (uint8_t *) &itut_t35_metadata[1]; itut_t35_metadata->country_code = country_code; itut_t35_metadata->country_code_extension_byte = country_code_extension_byte; - for (int i = 0; i < payload_size; i++) - itut_t35_metadata->payload[i] = dav1d_get_bits(&gb, 8); itut_t35_metadata->payload_size = payload_size; - dav1d_ref_dec(&c->itut_t35_ref); - c->itut_t35 = itut_t35_metadata; - c->itut_t35_ref = ref; + // We know that we've read a whole number of bytes and that the + // payload is within the 
OBU boundaries, so just use memcpy() + assert(gb.bits_left == 0); + memcpy(itut_t35_metadata->payload, gb.ptr, payload_size); + + c->n_itut_t35++; break; } case OBU_META_SCALABILITY: @@ -1550,12 +1564,26 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa break; default: // print a warning but don't fail for unknown types - dav1d_log(c, "Unknown OBU type %d of size %u\n", type, len); + dav1d_log(c, "Unknown OBU type %d of size %td\n", type, gb.ptr_end - gb.ptr); break; } if (c->seq_hdr && c->frame_hdr) { if (c->frame_hdr->show_existing_frame) { + if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr) goto error; + switch (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type) { + case DAV1D_FRAME_TYPE_INTER: + case DAV1D_FRAME_TYPE_SWITCH: + if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_REFERENCE) + goto skip; + break; + case DAV1D_FRAME_TYPE_INTRA: + if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_INTRA) + goto skip; + // fall-through + default: + break; + } if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) goto error; if (c->strict_std_compliance && !c->refs[c->frame_hdr->existing_frame_idx].p.showable) @@ -1565,7 +1593,15 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa if (c->n_fc == 1) { dav1d_thread_picture_ref(&c->out, &c->refs[c->frame_hdr->existing_frame_idx].p); - dav1d_data_props_copy(&c->out.p.m, &in->m); + dav1d_picture_copy_props(&c->out.p, + c->content_light, c->content_light_ref, + c->mastering_display, c->mastering_display_ref, + c->itut_t35, c->itut_t35_ref, c->n_itut_t35, + &in->m); + // Must be removed from the context after being attached to the frame + dav1d_ref_dec(&c->itut_t35_ref); + c->itut_t35 = NULL; + c->n_itut_t35 = 0; c->event_flags |= dav1d_picture_get_event_flags(&c->refs[c->frame_hdr->existing_frame_idx].p); } else { pthread_mutex_lock(&c->task_thread.lock); @@ -1611,7 +1647,16 @@ int dav1d_parse_obus(Dav1dContext *const c, 
Dav1dData *const in, const int globa dav1d_thread_picture_ref(out_delayed, &c->refs[c->frame_hdr->existing_frame_idx].p); out_delayed->visible = 1; - dav1d_data_props_copy(&out_delayed->p.m, &in->m); + dav1d_picture_copy_props(&out_delayed->p, + c->content_light, c->content_light_ref, + c->mastering_display, c->mastering_display_ref, + c->itut_t35, c->itut_t35_ref, c->n_itut_t35, + &in->m); + // Must be removed from the context after being attached to the frame + dav1d_ref_dec(&c->itut_t35_ref); + c->itut_t35 = NULL; + c->n_itut_t35 = 0; + pthread_mutex_unlock(&c->task_thread.lock); } if (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY) { @@ -1620,7 +1665,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa for (int i = 0; i < 8; i++) { if (i == r) continue; - if (c->refs[i].p.p.data[0]) + if (c->refs[i].p.p.frame_hdr) dav1d_thread_picture_unref(&c->refs[i].p); dav1d_thread_picture_ref(&c->refs[i].p, &c->refs[r].p); @@ -1636,6 +1681,23 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa } c->frame_hdr = NULL; } else if (c->n_tiles == c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows) { + switch (c->frame_hdr->frame_type) { + case DAV1D_FRAME_TYPE_INTER: + case DAV1D_FRAME_TYPE_SWITCH: + if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_REFERENCE || + (c->decode_frame_type == DAV1D_DECODEFRAMETYPE_REFERENCE && + !c->frame_hdr->refresh_frame_flags)) + goto skip; + break; + case DAV1D_FRAME_TYPE_INTRA: + if (c->decode_frame_type > DAV1D_DECODEFRAMETYPE_INTRA || + (c->decode_frame_type == DAV1D_DECODEFRAMETYPE_REFERENCE && + !c->frame_hdr->refresh_frame_flags)) + goto skip; + // fall-through + default: + break; + } if (!c->n_tile_data) goto error; if ((res = dav1d_submit_frame(c)) < 0) @@ -1646,10 +1708,31 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa } } - return len + init_byte_pos; + return gb.ptr_end - gb.ptr_start; + 
+skip: + // update refs with only the headers in case we skip the frame + for (int i = 0; i < 8; i++) { + if (c->frame_hdr->refresh_frame_flags & (1 << i)) { + dav1d_thread_picture_unref(&c->refs[i].p); + c->refs[i].p.p.frame_hdr = c->frame_hdr; + c->refs[i].p.p.seq_hdr = c->seq_hdr; + c->refs[i].p.p.frame_hdr_ref = c->frame_hdr_ref; + c->refs[i].p.p.seq_hdr_ref = c->seq_hdr_ref; + dav1d_ref_inc(c->frame_hdr_ref); + dav1d_ref_inc(c->seq_hdr_ref); + } + } + + dav1d_ref_dec(&c->frame_hdr_ref); + c->frame_hdr = NULL; + c->n_tiles = 0; + + return gb.ptr_end - gb.ptr_start; error: dav1d_data_props_copy(&c->cached_error_props, &in->m); - dav1d_log(c, "Error parsing OBU data\n"); + dav1d_log(c, gb.error ? "Overrun in OBU bit buffer\n" : + "Error parsing OBU data\n"); return DAV1D_ERR(EINVAL); } diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/obu.h b/prog/3rdPartyLibs/codecs/dav1d/src/obu.h index aa79b5277..22901f020 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/obu.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/obu.h @@ -31,6 +31,6 @@ #include "dav1d/data.h" #include "src/internal.h" -int dav1d_parse_obus(Dav1dContext *c, Dav1dData *in, int global); +ptrdiff_t dav1d_parse_obus(Dav1dContext *c, Dav1dData *in); #endif /* DAV1D_SRC_OBU_H */ diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/pal.c b/prog/3rdPartyLibs/codecs/dav1d/src/pal.c new file mode 100644 index 000000000..f50c7aa21 --- /dev/null +++ b/prog/3rdPartyLibs/codecs/dav1d/src/pal.c @@ -0,0 +1,77 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "config.h" + +#include + +#include "common/attributes.h" + +#include "src/pal.h" + +// fill invisible edges and pack to 4-bit (2 pixels per byte) +static void pal_idx_finish_c(uint8_t *dst, const uint8_t *src, + const int bw, const int bh, + const int w, const int h) +{ + assert(bw >= 4 && bw <= 64 && !(bw & (bw - 1))); + assert(bh >= 4 && bh <= 64 && !(bh & (bh - 1))); + assert(w >= 4 && w <= bw && !(w & 3)); + assert(h >= 4 && h <= bh && !(h & 3)); + + const int dst_w = w / 2; + const int dst_bw = bw / 2; + + for (int y = 0; y < h; y++, src += bw, dst += dst_bw) { + for (int x = 0; x < dst_w; x++) + dst[x] = src[x * 2 + 0] | (src[x * 2 + 1] << 4); + if (dst_w < dst_bw) + memset(dst + dst_w, src[w - 1] * 0x11, dst_bw - dst_w); + } + + if (h < bh) { + const uint8_t *const last_row = &dst[-dst_bw]; + for (int y = h; y < bh; y++, dst += dst_bw) + memcpy(dst, last_row, dst_bw); + } +} + +#if HAVE_ASM +#if ARCH_X86 +#include "src/x86/pal.h" +#endif +#endif + +COLD void dav1d_pal_dsp_init(Dav1dPalDSPContext *const c) { + c->pal_idx_finish = pal_idx_finish_c; + +#if HAVE_ASM +#if ARCH_X86 + pal_dsp_init_x86(c); +#endif +#endif +} diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/pal.h b/prog/3rdPartyLibs/codecs/dav1d/src/pal.h new file mode 100644 index 000000000..6a6d729be --- /dev/null +++ b/prog/3rdPartyLibs/codecs/dav1d/src/pal.h @@ -0,0 +1,43 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef DAV1D_SRC_PAL_H +#define DAV1D_SRC_PAL_H + +#include + +#define decl_pal_idx_finish_fn(name) \ +void (name)(uint8_t *dst, const uint8_t *src, int bw, int bh, int w, int h) +typedef decl_pal_idx_finish_fn(*pal_idx_finish_fn); + +typedef struct Dav1dPalDSPContext { + pal_idx_finish_fn pal_idx_finish; +} Dav1dPalDSPContext; + +void dav1d_pal_dsp_init(Dav1dPalDSPContext *dsp); + +#endif /* DAV1D_SRC_PAL_H */ diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/picture.c b/prog/3rdPartyLibs/codecs/dav1d/src/picture.c index 544828fe4..f22f05f0c 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/picture.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/picture.c @@ -89,15 +89,26 @@ void dav1d_default_picture_release(Dav1dPicture *const p, void *const cookie) { struct pic_ctx_context { Dav1dPicAllocator allocator; Dav1dPicture pic; - void *extra_ptr; /* MUST BE AT THE END */ + Dav1dRef ref; + void *extra_data[]; }; static void free_buffer(const uint8_t *const data, void *const user_data) { - struct pic_ctx_context *pic_ctx = user_data; + Dav1dMemPoolBuffer *buf = (Dav1dMemPoolBuffer *)data; + struct pic_ctx_context *pic_ctx = buf->data; pic_ctx->allocator.release_picture_callback(&pic_ctx->pic, pic_ctx->allocator.cookie); - free(pic_ctx); + dav1d_mem_pool_push(user_data, buf); +} + +void dav1d_picture_free_itut_t35(const uint8_t *const data, void *const user_data) { + struct itut_t35_ctx_context *itut_t35_ctx = user_data; + + for (size_t i = 0; i < itut_t35_ctx->n_itut_t35; i++) + dav1d_free(itut_t35_ctx->itut_t35[i].payload); + dav1d_free(itut_t35_ctx->itut_t35); + dav1d_free(itut_t35_ctx); } static int picture_alloc_with_edges(Dav1dContext *const c, @@ -105,13 +116,10 @@ static int picture_alloc_with_edges(Dav1dContext *const c, const int w, const int h, Dav1dSequenceHeader *const seq_hdr, Dav1dRef *const seq_hdr_ref, Dav1dFrameHeader *const frame_hdr, Dav1dRef *const frame_hdr_ref, - Dav1dContentLightLevel *const content_light, Dav1dRef *const content_light_ref, - 
Dav1dMasteringDisplay *const mastering_display, Dav1dRef *const mastering_display_ref, - Dav1dITUTT35 *const itut_t35, Dav1dRef *const itut_t35_ref, const int bpc, const Dav1dDataProps *const props, Dav1dPicAllocator *const p_allocator, - const size_t extra, void **const extra_ptr) + void **const extra_ptr) { if (p->data[0]) { dav1d_log(c, "Picture already allocated!\n"); @@ -119,35 +127,30 @@ static int picture_alloc_with_edges(Dav1dContext *const c, } assert(bpc > 0 && bpc <= 16); - struct pic_ctx_context *pic_ctx = malloc(extra + sizeof(struct pic_ctx_context)); - if (pic_ctx == NULL) + size_t extra = c->n_fc > 1 ? sizeof(atomic_int) * 2 : 0; + Dav1dMemPoolBuffer *buf = dav1d_mem_pool_pop(c->pic_ctx_pool, + extra + sizeof(struct pic_ctx_context)); + if (buf == NULL) return DAV1D_ERR(ENOMEM); + struct pic_ctx_context *pic_ctx = buf->data; + p->p.w = w; p->p.h = h; p->seq_hdr = seq_hdr; p->frame_hdr = frame_hdr; - p->content_light = content_light; - p->mastering_display = mastering_display; - p->itut_t35 = itut_t35; p->p.layout = seq_hdr->layout; p->p.bpc = bpc; dav1d_data_props_set_defaults(&p->m); const int res = p_allocator->alloc_picture_callback(p, p_allocator->cookie); if (res < 0) { - free(pic_ctx); + dav1d_mem_pool_push(c->pic_ctx_pool, buf); return res; } pic_ctx->allocator = *p_allocator; pic_ctx->pic = *p; - - if (!(p->ref = dav1d_ref_wrap(p->data[0], free_buffer, pic_ctx))) { - p_allocator->release_picture_callback(p, p_allocator->cookie); - free(pic_ctx); - dav1d_log(c, "Failed to wrap picture: %s\n", strerror(errno)); - return DAV1D_ERR(ENOMEM); - } + p->ref = dav1d_ref_init(&pic_ctx->ref, buf, free_buffer, c->pic_ctx_pool, 0); p->seq_hdr_ref = seq_hdr_ref; if (seq_hdr_ref) dav1d_ref_inc(seq_hdr_ref); @@ -155,44 +158,59 @@ static int picture_alloc_with_edges(Dav1dContext *const c, p->frame_hdr_ref = frame_hdr_ref; if (frame_hdr_ref) dav1d_ref_inc(frame_hdr_ref); - dav1d_data_props_copy(&p->m, props); - if (extra && extra_ptr) - *extra_ptr = 
&pic_ctx->extra_ptr; + *extra_ptr = &pic_ctx->extra_data; + + return 0; +} +void dav1d_picture_copy_props(Dav1dPicture *const p, + Dav1dContentLightLevel *const content_light, Dav1dRef *const content_light_ref, + Dav1dMasteringDisplay *const mastering_display, Dav1dRef *const mastering_display_ref, + Dav1dITUTT35 *const itut_t35, Dav1dRef *itut_t35_ref, size_t n_itut_t35, + const Dav1dDataProps *const props) +{ + dav1d_data_props_copy(&p->m, props); + + dav1d_ref_dec(&p->content_light_ref); p->content_light_ref = content_light_ref; + p->content_light = content_light; if (content_light_ref) dav1d_ref_inc(content_light_ref); + dav1d_ref_dec(&p->mastering_display_ref); p->mastering_display_ref = mastering_display_ref; + p->mastering_display = mastering_display; if (mastering_display_ref) dav1d_ref_inc(mastering_display_ref); + dav1d_ref_dec(&p->itut_t35_ref); p->itut_t35_ref = itut_t35_ref; + p->itut_t35 = itut_t35; + p->n_itut_t35 = n_itut_t35; if (itut_t35_ref) dav1d_ref_inc(itut_t35_ref); - - return 0; } int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f, const int bpc) { Dav1dThreadPicture *const p = &f->sr_cur; - const int have_frame_mt = c->n_fc > 1; const int res = picture_alloc_with_edges(c, &p->p, f->frame_hdr->width[1], f->frame_hdr->height, f->seq_hdr, f->seq_hdr_ref, f->frame_hdr, f->frame_hdr_ref, - c->content_light, c->content_light_ref, - c->mastering_display, c->mastering_display_ref, - c->itut_t35, c->itut_t35_ref, bpc, &f->tile[0].data.m, &c->allocator, - have_frame_mt ? 
sizeof(atomic_int) * 2 : 0, (void **) &p->progress); if (res) return res; + dav1d_picture_copy_props(&p->p, c->content_light, c->content_light_ref, + c->mastering_display, c->mastering_display_ref, + c->itut_t35, c->itut_t35_ref, c->n_itut_t35, + &f->tile[0].data.m); + // Must be removed from the context after being attached to the frame dav1d_ref_dec(&c->itut_t35_ref); c->itut_t35 = NULL; + c->n_itut_t35 = 0; // Don't clear these flags from c->frame_flags if the frame is not visible. // This way they will be added to the next visible frame too. @@ -203,7 +221,7 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f p->visible = f->frame_hdr->show_frame; p->showable = f->frame_hdr->showable_frame; - if (have_frame_mt) { + if (c->n_fc > 1) { atomic_init(&p->progress[0], 0); atomic_init(&p->progress[1], 0); } @@ -213,43 +231,48 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f int dav1d_picture_alloc_copy(Dav1dContext *const c, Dav1dPicture *const dst, const int w, const Dav1dPicture *const src) { - struct pic_ctx_context *const pic_ctx = src->ref->user_data; + Dav1dMemPoolBuffer *const buf = (Dav1dMemPoolBuffer *)src->ref->const_data; + struct pic_ctx_context *const pic_ctx = buf->data; const int res = picture_alloc_with_edges(c, dst, w, src->p.h, src->seq_hdr, src->seq_hdr_ref, src->frame_hdr, src->frame_hdr_ref, - src->content_light, src->content_light_ref, - src->mastering_display, src->mastering_display_ref, - src->itut_t35, src->itut_t35_ref, src->p.bpc, &src->m, &pic_ctx->allocator, - 0, NULL); - return res; + NULL); + if (res) return res; + + dav1d_picture_copy_props(dst, src->content_light, src->content_light_ref, + src->mastering_display, src->mastering_display_ref, + src->itut_t35, src->itut_t35_ref, src->n_itut_t35, + &src->m); + + return 0; } void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) { - validate_input(dst != NULL); - validate_input(dst->data[0] == NULL); 
- validate_input(src != NULL); + assert(dst != NULL); + assert(dst->data[0] == NULL); + assert(src != NULL); if (src->ref) { - validate_input(src->data[0] != NULL); + assert(src->data[0] != NULL); dav1d_ref_inc(src->ref); - if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref); - if (src->seq_hdr_ref) dav1d_ref_inc(src->seq_hdr_ref); - if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref); - if (src->content_light_ref) dav1d_ref_inc(src->content_light_ref); - if (src->mastering_display_ref) dav1d_ref_inc(src->mastering_display_ref); - if (src->itut_t35_ref) dav1d_ref_inc(src->itut_t35_ref); } + if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref); + if (src->seq_hdr_ref) dav1d_ref_inc(src->seq_hdr_ref); + if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref); + if (src->content_light_ref) dav1d_ref_inc(src->content_light_ref); + if (src->mastering_display_ref) dav1d_ref_inc(src->mastering_display_ref); + if (src->itut_t35_ref) dav1d_ref_inc(src->itut_t35_ref); *dst = *src; } void dav1d_picture_move_ref(Dav1dPicture *const dst, Dav1dPicture *const src) { - validate_input(dst != NULL); - validate_input(dst->data[0] == NULL); - validate_input(src != NULL); + assert(dst != NULL); + assert(dst->data[0] == NULL); + assert(src != NULL); if (src->ref) - validate_input(src->data[0] != NULL); + assert(src->data[0] != NULL); *dst = *src; memset(src, 0, sizeof(*src)); @@ -282,13 +305,13 @@ void dav1d_picture_unref_internal(Dav1dPicture *const p) { if (p->ref) { validate_input(p->data[0] != NULL); dav1d_ref_dec(&p->ref); - dav1d_ref_dec(&p->seq_hdr_ref); - dav1d_ref_dec(&p->frame_hdr_ref); - dav1d_ref_dec(&p->m.user_data.ref); - dav1d_ref_dec(&p->content_light_ref); - dav1d_ref_dec(&p->mastering_display_ref); - dav1d_ref_dec(&p->itut_t35_ref); } + dav1d_ref_dec(&p->seq_hdr_ref); + dav1d_ref_dec(&p->frame_hdr_ref); + dav1d_ref_dec(&p->m.user_data.ref); + dav1d_ref_dec(&p->content_light_ref); + dav1d_ref_dec(&p->mastering_display_ref); + 
dav1d_ref_dec(&p->itut_t35_ref); memset(p, 0, sizeof(*p)); dav1d_data_props_set_defaults(&p->m); } diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/picture.h b/prog/3rdPartyLibs/codecs/dav1d/src/picture.h index 154c85a0c..88aee08f4 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/picture.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/picture.h @@ -101,6 +101,19 @@ int dav1d_default_picture_alloc(Dav1dPicture *p, void *cookie); void dav1d_default_picture_release(Dav1dPicture *p, void *cookie); void dav1d_picture_unref_internal(Dav1dPicture *p); +struct itut_t35_ctx_context { + Dav1dITUTT35 *itut_t35; + size_t n_itut_t35; + Dav1dRef ref; +}; + +void dav1d_picture_free_itut_t35(const uint8_t *data, void *user_data); +void dav1d_picture_copy_props(Dav1dPicture *p, + Dav1dContentLightLevel *content_light, Dav1dRef *content_light_ref, + Dav1dMasteringDisplay *mastering_display, Dav1dRef *mastering_display_ref, + Dav1dITUTT35 *itut_t35, Dav1dRef *itut_t35_ref, size_t n_itut_t35, + const Dav1dDataProps *props); + /** * Get event flags from picture flags. 
*/ diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/ppc/cdef_tmpl.c b/prog/3rdPartyLibs/codecs/dav1d/src/ppc/cdef_tmpl.c index 020e17b77..e2e759810 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/ppc/cdef_tmpl.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/ppc/cdef_tmpl.c @@ -82,7 +82,15 @@ static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride, vec_st(l0, 0, tmp + (h + 0) * 8); vec_st(l1, 0, tmp + (h + 1) * 8); - for (int y = 0; y < h; y++) { + int y_with_left_edge = 0; + if (!(edges & CDEF_HAVE_LEFT)) { + u16x8 l = u8h_to_u16(vec_vsx_ld(0, src)); + vec_vsx_st(l, 0, tmp + 2); + + y_with_left_edge = 1; + } + + for (int y = y_with_left_edge; y < h; y++) { u16x8 l = u8h_to_u16(vec_vsx_ld(0, src - 2 + y * src_stride)); vec_st(l, 0, tmp + y * 8); } @@ -160,7 +168,18 @@ static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride, vec_st(l1h, 0, tmp + (h + 1) * 16); vec_st(l1l, 0, tmp + (h + 1) * 16 + 8); - for (int y = 0; y < h; y++) { + int y_with_left_edge = 0; + if (!(edges & CDEF_HAVE_LEFT)) { + u8x16 l = vec_vsx_ld(0, src); + u16x8 lh = u8h_to_u16(l); + u16x8 ll = u8l_to_u16(l); + vec_vsx_st(lh, 0, tmp + 2); + vec_vsx_st(ll, 0, tmp + 8 + 2); + + y_with_left_edge = 1; + } + + for (int y = y_with_left_edge; y < h; y++) { u8x16 l = vec_vsx_ld(0, src - 2 + y * src_stride); u16x8 lh = u8h_to_u16(l); u16x8 ll = u8l_to_u16(l); @@ -456,7 +475,7 @@ void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \ const int damping, \ const enum CdefEdgeFlags edges) \ { \ - ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride,); \ + ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \ uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \ filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \ sec_strength, dir, damping, edges, tmp_stride, tmp); \ diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/ppc/looprestoration_tmpl.c b/prog/3rdPartyLibs/codecs/dav1d/src/ppc/looprestoration_tmpl.c index f64a96327..c0c64e180 100644 --- 
a/prog/3rdPartyLibs/codecs/dav1d/src/ppc/looprestoration_tmpl.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/ppc/looprestoration_tmpl.c @@ -50,12 +50,12 @@ static void wiener_filter_h_vsx(int32_t *hor_ptr, const int16_t filterh[8], const int w, const int h) { - static const i32x4 zerov = vec_splats(0); - static const i32x4 seven_vec = vec_splats(7); - static const i32x4 bitdepth_added_vec = vec_splats(1 << 14); - static const i32x4 round_bits_vec = vec_splats(3); - static const i32x4 rounding_off_vec = vec_splats(1<<2); - static const i32x4 clip_limit_v = vec_splats((1 << 13) - 1); + const i32x4 zerov = vec_splats(0); + const i32x4 seven_vec = vec_splats(7); + const i32x4 bitdepth_added_vec = vec_splats(1 << 14); + const i32x4 round_bits_vec = vec_splats(3); + const i32x4 rounding_off_vec = vec_splats(1<<2); + const i32x4 clip_limit_v = vec_splats((1 << 13) - 1); i16x8 filterhvall = vec_vsx_ld(0, filterh); i16x8 filterhv0 = vec_splat( filterhvall, 0); @@ -128,8 +128,8 @@ static void wiener_filter_h_vsx(int32_t *hor_ptr, } static inline i16x8 iclip_u8_vec(i16x8 v) { - static const i16x8 zerov = vec_splats((int16_t)0); - static const i16x8 maxv = vec_splats((int16_t)255); + const i16x8 zerov = vec_splats((int16_t)0); + const i16x8 maxv = vec_splats((int16_t)255); v = vec_max(zerov, v); v = vec_min(maxv, v); return v; @@ -175,8 +175,8 @@ static inline void wiener_filter_v_vsx(uint8_t *p, const int16_t filterv[8], const int w, const int h) { - static const i32x4 round_bits_vec = vec_splats(11); - static const i32x4 round_vec = vec_splats((1 << 10) - (1 << 18)); + const i32x4 round_bits_vec = vec_splats(11); + const i32x4 round_vec = vec_splats((1 << 10) - (1 << 18)); i32x4 filterv0 = vec_splats((int32_t) filterv[0]); i32x4 filterv1 = vec_splats((int32_t) filterv[1]); diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/recon.h b/prog/3rdPartyLibs/codecs/dav1d/src/recon.h index e97ac31ff..721924916 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/recon.h +++ 
b/prog/3rdPartyLibs/codecs/dav1d/src/recon.h @@ -57,6 +57,18 @@ typedef decl_backup_ipred_edge_fn(*backup_ipred_edge_fn); void (name)(Dav1dTaskContext *t, enum BlockSize bs, const Av1Block *b) typedef decl_read_coef_blocks_fn(*read_coef_blocks_fn); +#define decl_copy_pal_block_fn(name) \ +void (name)(Dav1dTaskContext *t, int bx4, int by4, int bw4, int bh4) +typedef decl_copy_pal_block_fn(*copy_pal_block_fn); + +#define decl_read_pal_plane_fn(name) \ +void (name)(Dav1dTaskContext *t, Av1Block *b, int pl, int sz_ctx, int bx4, int by4) +typedef decl_read_pal_plane_fn(*read_pal_plane_fn); + +#define decl_read_pal_uv_fn(name) \ +void (name)(Dav1dTaskContext *t, Av1Block *b, int sz_ctx, int bx4, int by4) +typedef decl_read_pal_uv_fn(*read_pal_uv_fn); + decl_recon_b_intra_fn(dav1d_recon_b_intra_8bpc); decl_recon_b_intra_fn(dav1d_recon_b_intra_16bpc); @@ -82,4 +94,13 @@ decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_16bpc); decl_read_coef_blocks_fn(dav1d_read_coef_blocks_8bpc); decl_read_coef_blocks_fn(dav1d_read_coef_blocks_16bpc); +decl_copy_pal_block_fn(dav1d_copy_pal_block_y_8bpc); +decl_copy_pal_block_fn(dav1d_copy_pal_block_y_16bpc); +decl_copy_pal_block_fn(dav1d_copy_pal_block_uv_8bpc); +decl_copy_pal_block_fn(dav1d_copy_pal_block_uv_16bpc); +decl_read_pal_plane_fn(dav1d_read_pal_plane_8bpc); +decl_read_pal_plane_fn(dav1d_read_pal_plane_16bpc); +decl_read_pal_uv_fn(dav1d_read_pal_uv_8bpc); +decl_read_pal_uv_fn(dav1d_read_pal_uv_16bpc); + #endif /* DAV1D_SRC_RECON_H */ diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/recon_tmpl.c b/prog/3rdPartyLibs/codecs/dav1d/src/recon_tmpl.c index 3158ef5b0..9d1a0da6b 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/recon_tmpl.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/recon_tmpl.c @@ -770,14 +770,12 @@ static void read_coef_tree(Dav1dTaskContext *const t, uint8_t cf_ctx; int eob; coef *cf; - struct CodedBlockInfo *cbi; if (t->frame_thread.pass) { const int p = t->frame_thread.pass & 1; assert(ts->frame_thread[p].cf); cf = 
ts->frame_thread[p].cf; ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; - cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx]; } else { cf = bitfn(t->cf); } @@ -800,16 +798,15 @@ static void read_coef_tree(Dav1dTaskContext *const t, rep_macro(type, txtp_map, 0, mul * txtp); \ txtp_map += 32; \ } - uint8_t *txtp_map = &t->txtp_map[by4 * 32 + bx4]; + uint8_t *txtp_map = &t->scratch.txtp_map[by4 * 32 + bx4]; case_set_upto16(txw,,,); #undef set_ctx - if (t->frame_thread.pass == 1) { - cbi->eob[0] = eob; - cbi->txtp[0] = txtp; - } + if (t->frame_thread.pass == 1) + *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp; } else { - eob = cbi->eob[0]; - txtp = cbi->txtp[0]; + const int cbi = *ts->frame_thread[0].cbi++; + eob = cbi >> 5; + txtp = cbi & 0x1f; } if (!(t->frame_thread.pass & 1)) { assert(dst); @@ -874,8 +871,6 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t, for (y = init_y, t->by += init_y; y < sub_h4; y += t_dim->h, t->by += t_dim->h, y_off++) { - struct CodedBlockInfo *const cbi = - &f->frame_thread.cbi[t->by * f->b4_stride]; int x_off = !!init_x; for (x = init_x, t->bx += init_x; x < sub_w4; x += t_dim->w, t->bx += t_dim->w, x_off++) @@ -886,14 +881,14 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t, } else { uint8_t cf_ctx = 0x40; enum TxfmType txtp; - const int eob = cbi[t->bx].eob[0] = + const int eob = decode_coefs(t, &t->a->lcoef[bx4 + x], &t->l.lcoef[by4 + y], b->tx, bs, b, 1, 0, ts->frame_thread[1].cf, &txtp, &cf_ctx); if (DEBUG_BLOCK_INFO) printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n", b->tx, txtp, eob, ts->msac.rng); - cbi[t->bx].txtp[0] = txtp; + *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp; ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; #define set_ctx(type, dir, diridx, off, mul, rep_macro) \ rep_macro(type, t->dir lcoef, off, mul * cf_ctx) @@ -919,17 +914,15 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t, for (y = init_y >> ss_ver, 
t->by += init_y; y < sub_ch4; y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver) { - struct CodedBlockInfo *const cbi = - &f->frame_thread.cbi[t->by * f->b4_stride]; for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4; x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor) { uint8_t cf_ctx = 0x40; enum TxfmType txtp; if (!b->intra) - txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 + - bx4 + (x << ss_hor)]; - const int eob = cbi[t->bx].eob[1 + pl] = + txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 + + bx4 + (x << ss_hor)]; + const int eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x], &t->l.ccoef[pl][cby4 + y], b->uvtx, bs, b, b->intra, 1 + pl, ts->frame_thread[1].cf, @@ -938,7 +931,7 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t, printf("Post-uv-cf-blk[pl=%d,tx=%d," "txtp=%d,eob=%d]: r=%d\n", pl, b->uvtx, txtp, eob, ts->msac.rng); - cbi[t->bx].txtp[1 + pl] = txtp; + *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp; ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16; #define set_ctx(type, dir, diridx, off, mul, rep_macro) \ rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx) @@ -1238,13 +1231,14 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize const int p = t->frame_thread.pass & 1; assert(ts->frame_thread[p].pal_idx); pal_idx = ts->frame_thread[p].pal_idx; - ts->frame_thread[p].pal_idx += bw4 * bh4 * 16; + ts->frame_thread[p].pal_idx += bw4 * bh4 * 8; } else { - pal_idx = t->scratch.pal_idx; + pal_idx = t->scratch.pal_idx_y; } - const uint16_t *const pal = t->frame_thread.pass ? + const pixel *const pal = t->frame_thread.pass ? 
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + - ((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0]; + ((t->bx >> 1) + (t->by & 1))][0] : + bytefn(t->scratch.pal)[0]; f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal, pal_idx, bw4 * 4, bh4 * 4); if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) @@ -1321,12 +1315,11 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize enum TxfmType txtp; if (t->frame_thread.pass) { const int p = t->frame_thread.pass & 1; + const int cbi = *ts->frame_thread[p].cbi++; cf = ts->frame_thread[p].cf; ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16; - const struct CodedBlockInfo *const cbi = - &f->frame_thread.cbi[t->by * f->b4_stride + t->bx]; - eob = cbi->eob[0]; - txtp = cbi->txtp[0]; + eob = cbi >> 5; + txtp = cbi & 0x1f; } else { uint8_t cf_ctx; cf = bitfn(t->cf); @@ -1430,7 +1423,7 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize } else if (b->pal_sz[1]) { const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1])); - const uint16_t (*pal)[8]; + const pixel (*pal)[8]; const uint8_t *pal_idx; if (t->frame_thread.pass) { const int p = t->frame_thread.pass & 1; @@ -1438,10 +1431,10 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + ((t->bx >> 1) + (t->by & 1))]; pal_idx = ts->frame_thread[p].pal_idx; - ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16; + ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8; } else { - pal = t->scratch.pal; - pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16]; + pal = bytefn(t->scratch.pal); + pal_idx = t->scratch.pal_idx_uv; } f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff, @@ -1545,12 +1538,11 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize coef *cf; if (t->frame_thread.pass) { const int p = t->frame_thread.pass 
& 1; + const int cbi = *ts->frame_thread[p].cbi++; cf = ts->frame_thread[p].cf; ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16; - const struct CodedBlockInfo *const cbi = - &f->frame_thread.cbi[t->by * f->b4_stride + t->bx]; - eob = cbi->eob[pl + 1]; - txtp = cbi->txtp[pl + 1]; + eob = cbi >> 5; + txtp = cbi & 0x1f; } else { uint8_t cf_ctx; cf = bitfn(t->cf); @@ -1684,12 +1676,8 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel), tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0 HIGHBD_CALL_SUFFIX); - const uint8_t *const ii_mask = - b->interintra_type == INTER_INTRA_BLEND ? - dav1d_ii_masks[bs][0][b->interintra_mode] : - dav1d_wedge_masks[bs][0][0][b->wedge_idx]; dsp->mc.blend(dst, f->cur.stride[0], tmp, - bw4 * 4, bh4 * 4, ii_mask); + bw4 * 4, bh4 * 4, II_MASK(0, bs, b)); } if (!has_chroma) goto skip_inter_chroma_pred; @@ -1792,10 +1780,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize // FIXME for 8x32 with 4:2:2 subsampling, this probably does // the wrong thing since it will select 4x16, not 4x32, as a // transform size... - const uint8_t *const ii_mask = - b->interintra_type == INTER_INTRA_BLEND ? 
- dav1d_ii_masks[bs][chr_layout_idx][b->interintra_mode] : - dav1d_wedge_masks[bs][chr_layout_idx][0][b->wedge_idx]; + const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b); for (int pl = 0; pl < 2; pl++) { pixel *const tmp = bitfn(t->scratch.interintra); @@ -1873,12 +1858,12 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize mask = seg_mask; break; case COMP_INTER_WEDGE: - mask = dav1d_wedge_masks[bs][0][0][b->wedge_idx]; + mask = WEDGE_MASK(0, bs, 0, b->wedge_idx); dsp->mc.mask(dst, f->cur.stride[0], tmp[b->mask_sign], tmp[!b->mask_sign], bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX); if (has_chroma) - mask = dav1d_wedge_masks[bs][chr_layout_idx][b->mask_sign][b->wedge_idx]; + mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx); break; } @@ -1995,17 +1980,16 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize enum TxfmType txtp; if (t->frame_thread.pass) { const int p = t->frame_thread.pass & 1; + const int cbi = *ts->frame_thread[p].cbi++; cf = ts->frame_thread[p].cf; ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16; - const struct CodedBlockInfo *const cbi = - &f->frame_thread.cbi[t->by * f->b4_stride + t->bx]; - eob = cbi->eob[1 + pl]; - txtp = cbi->txtp[1 + pl]; + eob = cbi >> 5; + txtp = cbi & 0x1f; } else { uint8_t cf_ctx; cf = bitfn(t->cf); - txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 + - bx4 + (x << ss_hor)]; + txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 + + bx4 + (x << ss_hor)]; eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x], &t->l.ccoef[pl][cby4 + y], b->uvtx, bs, b, 0, 1 + pl, @@ -2200,3 +2184,178 @@ void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) { 4 * (ts->tiling.col_end - x_off) >> ss_hor); } } + +void bytefn(dav1d_copy_pal_block_y)(Dav1dTaskContext *const t, + const int bx4, const int by4, + const int bw4, const int bh4) + +{ + const Dav1dFrameContext *const f = t->f; + pixel *const pal = t->frame_thread.pass ? 
+ f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + + ((t->bx >> 1) + (t->by & 1))][0] : + bytefn(t->scratch.pal)[0]; + for (int x = 0; x < bw4; x++) + memcpy(bytefn(t->al_pal)[0][bx4 + x][0], pal, 8 * sizeof(pixel)); + for (int y = 0; y < bh4; y++) + memcpy(bytefn(t->al_pal)[1][by4 + y][0], pal, 8 * sizeof(pixel)); +} + +void bytefn(dav1d_copy_pal_block_uv)(Dav1dTaskContext *const t, + const int bx4, const int by4, + const int bw4, const int bh4) + +{ + const Dav1dFrameContext *const f = t->f; + const pixel (*const pal)[8] = t->frame_thread.pass ? + f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + + ((t->bx >> 1) + (t->by & 1))] : + bytefn(t->scratch.pal); + // see aomedia bug 2183 for why we use luma coordinates here + for (int pl = 1; pl <= 2; pl++) { + for (int x = 0; x < bw4; x++) + memcpy(bytefn(t->al_pal)[0][bx4 + x][pl], pal[pl], 8 * sizeof(pixel)); + for (int y = 0; y < bh4; y++) + memcpy(bytefn(t->al_pal)[1][by4 + y][pl], pal[pl], 8 * sizeof(pixel)); + } +} + +void bytefn(dav1d_read_pal_plane)(Dav1dTaskContext *const t, Av1Block *const b, + const int pl, const int sz_ctx, + const int bx4, const int by4) +{ + Dav1dTileState *const ts = t->ts; + const Dav1dFrameContext *const f = t->f; + const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac, + ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2; + pixel cache[16], used_cache[8]; + int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4]; + int n_cache = 0; + // don't reuse above palette outside SB64 boundaries + int a_cache = by4 & 15 ? pl ? 
t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0; + const pixel *l = bytefn(t->al_pal)[1][by4][pl]; + const pixel *a = bytefn(t->al_pal)[0][bx4][pl]; + + // fill/sort cache + while (l_cache && a_cache) { + if (*l < *a) { + if (!n_cache || cache[n_cache - 1] != *l) + cache[n_cache++] = *l; + l++; + l_cache--; + } else { + if (*a == *l) { + l++; + l_cache--; + } + if (!n_cache || cache[n_cache - 1] != *a) + cache[n_cache++] = *a; + a++; + a_cache--; + } + } + if (l_cache) { + do { + if (!n_cache || cache[n_cache - 1] != *l) + cache[n_cache++] = *l; + l++; + } while (--l_cache > 0); + } else if (a_cache) { + do { + if (!n_cache || cache[n_cache - 1] != *a) + cache[n_cache++] = *a; + a++; + } while (--a_cache > 0); + } + + // find reused cache entries + int i = 0; + for (int n = 0; n < n_cache && i < pal_sz; n++) + if (dav1d_msac_decode_bool_equi(&ts->msac)) + used_cache[i++] = cache[n]; + const int n_used_cache = i; + + // parse new entries + pixel *const pal = t->frame_thread.pass ? + f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + + ((t->bx >> 1) + (t->by & 1))][pl] : + bytefn(t->scratch.pal)[pl]; + if (i < pal_sz) { + const int bpc = BITDEPTH == 8 ? 
8 : f->cur.p.bpc; + int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc); + + if (i < pal_sz) { + int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2); + const int max = (1 << bpc) - 1; + + do { + const int delta = dav1d_msac_decode_bools(&ts->msac, bits); + prev = pal[i++] = imin(prev + delta + !pl, max); + if (prev + !pl >= max) { + for (; i < pal_sz; i++) + pal[i] = max; + break; + } + bits = imin(bits, 1 + ulog2(max - prev - !pl)); + } while (i < pal_sz); + } + + // merge cache+new entries + int n = 0, m = n_used_cache; + for (i = 0; i < pal_sz; i++) { + if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) { + pal[i] = used_cache[n++]; + } else { + assert(m < pal_sz); + pal[i] = pal[m++]; + } + } + } else { + memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache)); + } + + if (DEBUG_BLOCK_INFO) { + printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=", + pl, pal_sz, n_cache, n_used_cache, ts->msac.rng); + for (int n = 0; n < n_cache; n++) + printf("%c%02x", n ? ' ' : '[', cache[n]); + printf("%s, pal=", n_cache ? "]" : "[]"); + for (int n = 0; n < pal_sz; n++) + printf("%c%02x", n ? ' ' : '[', pal[n]); + printf("]\n"); + } +} + +void bytefn(dav1d_read_pal_uv)(Dav1dTaskContext *const t, Av1Block *const b, + const int sz_ctx, const int bx4, const int by4) +{ + bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4); + + // V pal coding + Dav1dTileState *const ts = t->ts; + const Dav1dFrameContext *const f = t->f; + pixel *const pal = t->frame_thread.pass ? + f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + + ((t->bx >> 1) + (t->by & 1))][2] : + bytefn(t->scratch.pal)[2]; + const int bpc = BITDEPTH == 8 ? 
8 : f->cur.p.bpc; + if (dav1d_msac_decode_bool_equi(&ts->msac)) { + const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2); + int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc); + const int max = (1 << bpc) - 1; + for (int i = 1; i < b->pal_sz[1]; i++) { + int delta = dav1d_msac_decode_bools(&ts->msac, bits); + if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta; + prev = pal[i] = (prev + delta) & max; + } + } else { + for (int i = 0; i < b->pal_sz[1]; i++) + pal[i] = dav1d_msac_decode_bools(&ts->msac, bpc); + } + if (DEBUG_BLOCK_INFO) { + printf("Post-pal[pl=2]: r=%d ", ts->msac.rng); + for (int n = 0; n < b->pal_sz[1]; n++) + printf("%c%02x", n ? ' ' : '[', pal[n]); + printf("]\n"); + } +} diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/ref.c b/prog/3rdPartyLibs/codecs/dav1d/src/ref.c index 46462b4c8..5a4d3a245 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/ref.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/ref.c @@ -34,10 +34,10 @@ static void default_free_callback(const uint8_t *const data, void *const user_da dav1d_free_aligned(user_data); } -Dav1dRef *dav1d_ref_create(size_t size) { +Dav1dRef *dav1d_ref_create(const enum AllocationType type, size_t size) { size = (size + sizeof(void*) - 1) & ~(sizeof(void*) - 1); - uint8_t *const data = dav1d_alloc_aligned(size + sizeof(Dav1dRef), 64); + uint8_t *const data = dav1d_alloc_aligned(type, size + sizeof(Dav1dRef), 64); if (!data) return NULL; Dav1dRef *const res = (Dav1dRef*)(data + size); @@ -71,23 +71,6 @@ Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *const pool, size_t size) { return res; } -Dav1dRef *dav1d_ref_wrap(const uint8_t *const ptr, - void (*free_callback)(const uint8_t *data, void *user_data), - void *const user_data) -{ - Dav1dRef *res = malloc(sizeof(Dav1dRef)); - if (!res) return NULL; - - res->data = NULL; - res->const_data = ptr; - atomic_init(&res->ref_cnt, 1); - res->free_ref = 1; - res->free_callback = free_callback; - res->user_data = user_data; - - 
return res; -} - void dav1d_ref_dec(Dav1dRef **const pref) { assert(pref != NULL); @@ -98,10 +81,6 @@ void dav1d_ref_dec(Dav1dRef **const pref) { if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1) { const int free_ref = ref->free_ref; ref->free_callback(ref->const_data, ref->user_data); - if (free_ref) free(ref); + if (free_ref) dav1d_free(ref); } } - -int dav1d_ref_is_writable(Dav1dRef *const ref) { - return atomic_load(&ref->ref_cnt) == 1 && ref->data; -} diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/ref.h b/prog/3rdPartyLibs/codecs/dav1d/src/ref.h index ec070a0a9..f1c96eb91 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/ref.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/ref.h @@ -45,16 +45,33 @@ struct Dav1dRef { void *user_data; }; -Dav1dRef *dav1d_ref_create(size_t size); +#if !TRACK_HEAP_ALLOCATIONS +#define dav1d_ref_create(type, size) dav1d_ref_create(size) +#endif + +Dav1dRef *dav1d_ref_create(enum AllocationType type, size_t size); Dav1dRef *dav1d_ref_create_using_pool(Dav1dMemPool *pool, size_t size); -Dav1dRef *dav1d_ref_wrap(const uint8_t *ptr, - void (*free_callback)(const uint8_t *data, void *user_data), - void *user_data); void dav1d_ref_dec(Dav1dRef **ref); -int dav1d_ref_is_writable(Dav1dRef *ref); + +static inline Dav1dRef *dav1d_ref_init(Dav1dRef *const ref, const void *const ptr, + void (*const free_callback)(const uint8_t *data, void *user_data), + void *const user_data, const int free_ref) +{ + ref->data = NULL; + ref->const_data = ptr; + atomic_init(&ref->ref_cnt, 1); + ref->free_ref = free_ref; + ref->free_callback = free_callback; + ref->user_data = user_data; + return ref; +} static inline void dav1d_ref_inc(Dav1dRef *const ref) { atomic_fetch_add_explicit(&ref->ref_cnt, 1, memory_order_relaxed); } +static inline int dav1d_ref_is_writable(Dav1dRef *const ref) { + return atomic_load(&ref->ref_cnt) == 1 && ref->data; +} + #endif /* DAV1D_SRC_REF_H */ diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/refmvs.c 
b/prog/3rdPartyLibs/codecs/dav1d/src/refmvs.c index c7ed9db8c..0b5ccd304 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/refmvs.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/refmvs.c @@ -687,9 +687,9 @@ void dav1d_refmvs_tile_sbrow_init(refmvs_tile *const rt, const refmvs_frame *con rt->tile_col.end = imin(tile_col_end4, rf->iw4); } -void dav1d_refmvs_load_tmvs(const refmvs_frame *const rf, int tile_row_idx, - const int col_start8, const int col_end8, - const int row_start8, int row_end8) +static void load_tmvs_c(const refmvs_frame *const rf, int tile_row_idx, + const int col_start8, const int col_end8, + const int row_start8, int row_end8) { if (rf->n_tile_threads == 1) tile_row_idx = 0; assert(row_start8 >= 0); @@ -760,22 +760,14 @@ void dav1d_refmvs_load_tmvs(const refmvs_frame *const rf, int tile_row_idx, } } -void dav1d_refmvs_save_tmvs(const refmvs_tile *const rt, - const int col_start8, int col_end8, - const int row_start8, int row_end8) +static void save_tmvs_c(refmvs_temporal_block *rp, const ptrdiff_t stride, + refmvs_block *const *const rr, + const uint8_t *const ref_sign, + const int col_end8, const int row_end8, + const int col_start8, const int row_start8) { - const refmvs_frame *const rf = rt->rf; - - assert(row_start8 >= 0); - assert((unsigned) (row_end8 - row_start8) <= 16U); - row_end8 = imin(row_end8, rf->ih8); - col_end8 = imin(col_end8, rf->iw8); - - const ptrdiff_t stride = rf->rp_stride; - const uint8_t *const ref_sign = rf->mfmv_sign; - refmvs_temporal_block *rp = &rf->rp[row_start8 * stride]; for (int y = row_start8; y < row_end8; y++) { - const refmvs_block *const b = rt->r[6 + (y & 15) * 2]; + const refmvs_block *const b = rr[(y & 15) * 2]; for (int x = col_start8; x < col_end8;) { const refmvs_block *const cand_b = &b[x * 2 + 1]; @@ -794,8 +786,10 @@ void dav1d_refmvs_save_tmvs(const refmvs_tile *const rt, rp[x] = (refmvs_temporal_block) { .mv = cand_b->mv.mv[0], .ref = cand_b->ref.ref[0] }; } else { - for (int n = 0; n < bw8; n++, x++) + 
for (int n = 0; n < bw8; n++, x++) { + rp[x].mv.n = 0; rp[x].ref = 0; // "invalid" + } } } rp += stride; @@ -823,7 +817,7 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf, if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) { if (rf->r) dav1d_freep_aligned(&rf->r); const int uses_2pass = n_tile_threads > 1 && n_frame_threads > 1; - rf->r = dav1d_alloc_aligned(sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass), 64); + rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass), 64); if (!rf->r) return DAV1D_ERR(ENOMEM); rf->r_stride = r_stride; } @@ -831,7 +825,7 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf, const ptrdiff_t rp_stride = r_stride >> 1; if (rp_stride != rf->rp_stride || n_tile_rows != rf->n_tile_rows) { if (rf->rp_proj) dav1d_freep_aligned(&rf->rp_proj); - rf->rp_proj = dav1d_alloc_aligned(sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows, 64); + rf->rp_proj = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows, 64); if (!rf->rp_proj) return DAV1D_ERR(ENOMEM); rf->rp_stride = rp_stride; } @@ -932,6 +926,8 @@ static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv, COLD void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *const c) { + c->load_tmvs = load_tmvs_c; + c->save_tmvs = save_tmvs_c; c->splat_mv = splat_mv_c; #if HAVE_ASM diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/refmvs.h b/prog/3rdPartyLibs/codecs/dav1d/src/refmvs.h index 948c35aef..70dc9678d 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/refmvs.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/refmvs.h @@ -39,10 +39,10 @@ #define INVALID_MV 0x80008000 -typedef struct refmvs_temporal_block { +PACKED(typedef struct refmvs_temporal_block { mv mv; int8_t ref; -} refmvs_temporal_block; +}) refmvs_temporal_block; typedef union refmvs_refpair { int8_t ref[2]; // [0] = 0: intra=1, [1] = -1: comp=0 @@ -96,11 +96,28 @@ typedef struct refmvs_candidate { int weight; } 
refmvs_candidate; +// initialize temporal MVs; this can be done in any configuration, e.g. one +// tile/sbrow at a time, where col_{start,end}8 are the tile boundaries; or +// it can just be for the whole frame's sbrow, where col_{start,end}8 are the +// frame boundaries. row_{start,end}8 are the superblock row boundaries. +#define decl_load_tmvs_fn(name) \ +void (name)(const refmvs_frame *rf, int tile_row_idx, \ + int col_start8, int col_end8, int row_start8, int row_end8) +typedef decl_load_tmvs_fn(*load_tmvs_fn); + +#define decl_save_tmvs_fn(name) \ +void (name)(refmvs_temporal_block *rp, const ptrdiff_t stride, \ + refmvs_block *const *const rr, const uint8_t *const ref_sign, \ + int col_end8, int row_end8, int col_start8, int row_start8) +typedef decl_save_tmvs_fn(*save_tmvs_fn); + #define decl_splat_mv_fn(name) \ void (name)(refmvs_block **rr, const refmvs_block *rmv, int bx4, int bw4, int bh4) typedef decl_splat_mv_fn(*splat_mv_fn); typedef struct Dav1dRefmvsDSPContext { + load_tmvs_fn load_tmvs; + save_tmvs_fn save_tmvs; splat_mv_fn splat_mv; } Dav1dRefmvsDSPContext; @@ -118,19 +135,27 @@ int dav1d_refmvs_init_frame(refmvs_frame *rf, /*const*/ refmvs_temporal_block *const rp_ref[7], int n_tile_threads, int n_frame_threads); -// initialize temporal MVs; this can be done in any configuration, e.g. one -// tile/sbrow at a time, where col_{start,end}8 are the tile boundaries; or -// it can just be for the whole frame's sbrow, where col_{start,end}8 are the -// frame boundaries. row_{start,end}8 are the superblock row boundaries. 
-void dav1d_refmvs_load_tmvs(const refmvs_frame *rf, int tile_row_idx, - int col_start8, int col_end8, - int row_start8, int row_end8); - // cache the current tile/sbrow (or frame/sbrow)'s projectable motion vectors // into buffers for use in future frame's temporal MV prediction -void dav1d_refmvs_save_tmvs(const refmvs_tile *rt, - int col_start8, int col_end8, - int row_start8, int row_end8); +static inline void dav1d_refmvs_save_tmvs(const Dav1dRefmvsDSPContext *const dsp, + const refmvs_tile *const rt, + const int col_start8, int col_end8, + const int row_start8, int row_end8) +{ + const refmvs_frame *const rf = rt->rf; + + assert(row_start8 >= 0); + assert((unsigned) (row_end8 - row_start8) <= 16U); + row_end8 = imin(row_end8, rf->ih8); + col_end8 = imin(col_end8, rf->iw8); + + const ptrdiff_t stride = rf->rp_stride; + const uint8_t *const ref_sign = rf->mfmv_sign; + refmvs_temporal_block *rp = &rf->rp[row_start8 * stride]; + + dsp->save_tmvs(rp, stride, rt->r + 6, ref_sign, + col_end8, row_end8, col_start8, row_start8); +} // initialize tile boundaries and refmvs_block pointers for one tile/sbrow void dav1d_refmvs_tile_sbrow_init(refmvs_tile *rt, const refmvs_frame *rf, diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/thread.h b/prog/3rdPartyLibs/codecs/dav1d/src/thread.h index 9bb0799bc..f446366d2 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/thread.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/thread.h @@ -33,6 +33,7 @@ #include #include +#define PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT typedef struct { diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/thread_task.c b/prog/3rdPartyLibs/codecs/dav1d/src/thread_task.c index ab2376c30..1698ab0ef 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/thread_task.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/thread_task.c @@ -224,7 +224,7 @@ static int create_filter_sbrow(Dav1dFrameContext *const f, int num_tasks = f->sbh * (1 + uses_2pass); if (num_tasks > 
f->task_thread.num_tasks) { const size_t size = sizeof(Dav1dTask) * num_tasks; - tasks = realloc(f->task_thread.tasks, size); + tasks = dav1d_realloc(ALLOC_COMMON_CTX, f->task_thread.tasks, size); if (!tasks) return -1; memset(tasks, 0, size); f->task_thread.tasks = tasks; @@ -237,8 +237,8 @@ static int create_filter_sbrow(Dav1dFrameContext *const f, } else { const int prog_sz = ((f->sbh + 31) & ~31) >> 5; if (prog_sz > f->frame_thread.prog_sz) { - atomic_uint *const prog = realloc(f->frame_thread.frame_progress, - 2 * prog_sz * sizeof(*prog)); + atomic_uint *const prog = dav1d_realloc(ALLOC_COMMON_CTX, f->frame_thread.frame_progress, + 2 * prog_sz * sizeof(*prog)); if (!prog) return -1; f->frame_thread.frame_progress = prog; f->frame_thread.copy_lpf_progress = prog + prog_sz; @@ -275,7 +275,7 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass, int alloc_num_tasks = num_tasks * (1 + uses_2pass); if (alloc_num_tasks > f->task_thread.num_tile_tasks) { const size_t size = sizeof(Dav1dTask) * alloc_num_tasks; - tasks = realloc(f->task_thread.tile_tasks[0], size); + tasks = dav1d_realloc(ALLOC_COMMON_CTX, f->task_thread.tile_tasks[0], size); if (!tasks) return -1; memset(tasks, 0, size); f->task_thread.tile_tasks[0] = tasks; @@ -327,6 +327,7 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass, f->task_thread.pending_tasks.tail->next = &tasks[0]; f->task_thread.pending_tasks.tail = prev_t; atomic_store(&f->task_thread.pending_tasks.merge, 1); + atomic_store(&f->task_thread.init_done, 1); pthread_mutex_unlock(&f->task_thread.pending_tasks.lock); return 0; @@ -499,7 +500,7 @@ static inline void delayed_fg_task(const Dav1dContext *const c, case DAV1D_TASK_TYPE_FG_APPLY:; int row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1); pthread_mutex_unlock(&ttd->lock); - int progmax = (out->p.h + 31) >> 5; + int progmax = (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE; fg_apply_loop: if (row + 1 < progmax) 
pthread_cond_signal(&ttd->cond); @@ -730,14 +731,11 @@ void *dav1d_worker_task(void *data) { dav1d_decode_frame_exit(f, DAV1D_ERR(ENOMEM)); f->n_tile_data = 0; pthread_cond_signal(&f->task_thread.cond); - atomic_store(&f->task_thread.init_done, 1); - continue; } else { pthread_mutex_unlock(&ttd->lock); } } } - atomic_store(&f->task_thread.init_done, 1); pthread_mutex_lock(&ttd->lock); } else { pthread_mutex_lock(&ttd->lock); @@ -795,6 +793,7 @@ void *dav1d_worker_task(void *data) { atomic_load(&f->task_thread.done[0]) && (!uses_2pass || atomic_load(&f->task_thread.done[1]))) { + error = atomic_load(&f->task_thread.error); dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) : error ? DAV1D_ERR(ENOMEM) : 0); f->n_tile_data = 0; @@ -891,6 +890,7 @@ void *dav1d_worker_task(void *data) { if (!num_tasks && atomic_load(&f->task_thread.done[0]) && atomic_load(&f->task_thread.done[1])) { + error = atomic_load(&f->task_thread.error); dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) : error ? DAV1D_ERR(ENOMEM) : 0); f->n_tile_data = 0; @@ -920,6 +920,7 @@ void *dav1d_worker_task(void *data) { if (!num_tasks && atomic_load(&f->task_thread.done[0]) && (!uses_2pass || atomic_load(&f->task_thread.done[1]))) { + error = atomic_load(&f->task_thread.error); dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) : error ? 
DAV1D_ERR(ENOMEM) : 0); f->n_tile_data = 0; diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/wedge.c b/prog/3rdPartyLibs/codecs/dav1d/src/wedge.c index 6466068f3..2bea1393a 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/wedge.c +++ b/prog/3rdPartyLibs/codecs/dav1d/src/wedge.c @@ -83,37 +83,7 @@ static const wedge_code_type wedge_codebook_16_heqw[16] = { { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, }; -static uint8_t ALIGN(wedge_masks_444_32x32[2 * 16 * 32 * 32], 64); -static uint8_t ALIGN(wedge_masks_444_32x16[2 * 16 * 32 * 16], 64); -static uint8_t ALIGN(wedge_masks_444_32x8 [2 * 16 * 32 * 8], 64); -static uint8_t ALIGN(wedge_masks_444_16x32[2 * 16 * 16 * 32], 64); -static uint8_t ALIGN(wedge_masks_444_16x16[2 * 16 * 16 * 16], 64); -static uint8_t ALIGN(wedge_masks_444_16x8 [2 * 16 * 16 * 8], 64); -static uint8_t ALIGN(wedge_masks_444_8x32 [2 * 16 * 8 * 32], 64); -static uint8_t ALIGN(wedge_masks_444_8x16 [2 * 16 * 8 * 16], 64); -static uint8_t ALIGN(wedge_masks_444_8x8 [2 * 16 * 8 * 8], 64); - -static uint8_t ALIGN(wedge_masks_422_16x32[2 * 16 * 16 * 32], 64); -static uint8_t ALIGN(wedge_masks_422_16x16[2 * 16 * 16 * 16], 64); -static uint8_t ALIGN(wedge_masks_422_16x8 [2 * 16 * 16 * 8], 64); -static uint8_t ALIGN(wedge_masks_422_8x32 [2 * 16 * 8 * 32], 64); -static uint8_t ALIGN(wedge_masks_422_8x16 [2 * 16 * 8 * 16], 64); -static uint8_t ALIGN(wedge_masks_422_8x8 [2 * 16 * 8 * 8], 64); -static uint8_t ALIGN(wedge_masks_422_4x32 [2 * 16 * 4 * 32], 64); -static uint8_t ALIGN(wedge_masks_422_4x16 [2 * 16 * 4 * 16], 64); -static uint8_t ALIGN(wedge_masks_422_4x8 [2 * 16 * 4 * 8], 32); - -static uint8_t ALIGN(wedge_masks_420_16x16[2 * 16 * 16 * 16], 64); -static uint8_t ALIGN(wedge_masks_420_16x8 [2 * 16 * 16 * 8], 64); -static uint8_t ALIGN(wedge_masks_420_16x4 [2 * 16 * 16 * 4], 64); -static uint8_t ALIGN(wedge_masks_420_8x16 [2 * 16 * 8 * 16], 64); -static uint8_t ALIGN(wedge_masks_420_8x8 [2 * 16 * 8 * 8], 64); -static uint8_t 
ALIGN(wedge_masks_420_8x4 [2 * 16 * 8 * 4], 64); -static uint8_t ALIGN(wedge_masks_420_4x16 [2 * 16 * 4 * 16], 64); -static uint8_t ALIGN(wedge_masks_420_4x8 [2 * 16 * 4 * 8], 32); -static uint8_t ALIGN(wedge_masks_420_4x4 [2 * 16 * 4 * 4], 16); - -const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3][2][16]; +Dav1dMasks dav1d_masks; static void insert_border(uint8_t *const dst, const uint8_t *const src, const int ctr) @@ -136,29 +106,33 @@ static void hflip(uint8_t *const dst, const uint8_t *const src) { dst[y_off + 64 - 1 - x] = src[y_off + x]; } -static void invert(uint8_t *const dst, const uint8_t *const src, - const int w, const int h) -{ - for (int y = 0, y_off = 0; y < h; y++, y_off += w) - for (int x = 0; x < w; x++) - dst[y_off + x] = 64 - src[y_off + x]; -} - -static void copy2d(uint8_t *dst, const uint8_t *src, +static void copy2d(uint8_t *dst, const uint8_t *src, int sign, const int w, const int h, const int x_off, const int y_off) { src += y_off * 64 + x_off; - for (int y = 0; y < h; y++) { - memcpy(dst, src, w); - src += 64; - dst += w; + if (sign) { + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) + dst[x] = 64 - src[x]; + src += 64; + dst += w; + } + } else { + for (int y = 0; y < h; y++) { + memcpy(dst, src, w); + src += 64; + dst += w; + } } } -static COLD void init_chroma(uint8_t *chroma, const uint8_t *luma, - const int sign, const int w, const int h, - const int ss_ver) +#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3)) + +static COLD uint16_t init_chroma(uint8_t *chroma, const uint8_t *luma, + const int sign, const int w, const int h, + const int ss_ver) { + const uint16_t offset = MASK_OFFSET(chroma); for (int y = 0; y < h; y += 1 + ss_ver) { for (int x = 0; x < w; x += 2) { int sum = luma[x] + luma[x + 1] + 1; @@ -168,62 +142,69 @@ static COLD void init_chroma(uint8_t *chroma, const uint8_t *luma, luma += w << ss_ver; chroma += w >> 1; } + return offset; } -static COLD void fill2d_16x2(uint8_t *dst, 
const int w, const int h, - const enum BlockSize bs, +static COLD void fill2d_16x2(const int w, const int h, const enum BlockSize bs, const uint8_t (*const master)[64 * 64], const wedge_code_type *const cb, uint8_t *masks_444, uint8_t *masks_422, - uint8_t *masks_420, const unsigned signs) + uint8_t *masks_420, unsigned signs) { - uint8_t *ptr = dst; - for (int n = 0; n < 16; n++) { - copy2d(ptr, master[cb[n].direction], w, h, - 32 - (w * cb[n].x_offset >> 3), 32 - (h * cb[n].y_offset >> 3)); - ptr += w * h; - } - for (int n = 0, off = 0; n < 16; n++, off += w * h) - invert(ptr + off, dst + off, w, h); - const int n_stride_444 = (w * h); const int n_stride_422 = n_stride_444 >> 1; const int n_stride_420 = n_stride_444 >> 2; - const int sign_stride_444 = 16 * n_stride_444; const int sign_stride_422 = 16 * n_stride_422; const int sign_stride_420 = 16 * n_stride_420; - // assign pointers in externally visible array + + // assign pointer offsets in lookup table for (int n = 0; n < 16; n++) { - const int sign = (signs >> n) & 1; - dav1d_wedge_masks[bs][0][0][n] = &masks_444[ sign * sign_stride_444]; + const int sign = signs & 1; + + copy2d(masks_444, master[cb[n].direction], sign, w, h, + 32 - (w * cb[n].x_offset >> 3), 32 - (h * cb[n].y_offset >> 3)); + // not using !sign is intentional here, since 444 does not require // any rounding since no chroma subsampling is applied. 
- dav1d_wedge_masks[bs][0][1][n] = &masks_444[ sign * sign_stride_444]; - dav1d_wedge_masks[bs][1][0][n] = &masks_422[ sign * sign_stride_422]; - dav1d_wedge_masks[bs][1][1][n] = &masks_422[!sign * sign_stride_422]; - dav1d_wedge_masks[bs][2][0][n] = &masks_420[ sign * sign_stride_420]; - dav1d_wedge_masks[bs][2][1][n] = &masks_420[!sign * sign_stride_420]; + dav1d_masks.offsets[0][bs].wedge[0][n] = + dav1d_masks.offsets[0][bs].wedge[1][n] = MASK_OFFSET(masks_444); + + dav1d_masks.offsets[1][bs].wedge[0][n] = + init_chroma(&masks_422[ sign * sign_stride_422], masks_444, 0, w, h, 0); + dav1d_masks.offsets[1][bs].wedge[1][n] = + init_chroma(&masks_422[!sign * sign_stride_422], masks_444, 1, w, h, 0); + dav1d_masks.offsets[2][bs].wedge[0][n] = + init_chroma(&masks_420[ sign * sign_stride_420], masks_444, 0, w, h, 1); + dav1d_masks.offsets[2][bs].wedge[1][n] = + init_chroma(&masks_420[!sign * sign_stride_420], masks_444, 1, w, h, 1); + + signs >>= 1; masks_444 += n_stride_444; masks_422 += n_stride_422; masks_420 += n_stride_420; + } +} - // since the pointers come from inside, we know that - // violation of the const is OK here. 
Any other approach - // means we would have to duplicate the sign correction - // logic in two places, which isn't very nice, or mark - // the table faced externally as non-const, which also sucks - init_chroma((uint8_t *)dav1d_wedge_masks[bs][1][0][n], - dav1d_wedge_masks[bs][0][0][n], 0, w, h, 0); - init_chroma((uint8_t *)dav1d_wedge_masks[bs][1][1][n], - dav1d_wedge_masks[bs][0][0][n], 1, w, h, 0); - init_chroma((uint8_t *)dav1d_wedge_masks[bs][2][0][n], - dav1d_wedge_masks[bs][0][0][n], 0, w, h, 1); - init_chroma((uint8_t *)dav1d_wedge_masks[bs][2][1][n], - dav1d_wedge_masks[bs][0][0][n], 1, w, h, 1); +static COLD void build_nondc_ii_masks(uint8_t *const mask_v, const int w, + const int h, const int step) +{ + static const uint8_t ii_weights_1d[32] = { + 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, + 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, + }; + + uint8_t *const mask_h = &mask_v[w * h]; + uint8_t *const mask_sm = &mask_h[w * h]; + for (int y = 0, off = 0; y < h; y++, off += w) { + memset(&mask_v[off], ii_weights_1d[y * step], w); + for (int x = 0; x < w; x++) { + mask_sm[off + x] = ii_weights_1d[imin(x, y) * step]; + mask_h[off + x] = ii_weights_1d[x * step]; + } } } -COLD void dav1d_init_wedge_masks(void) { +COLD void dav1d_init_ii_wedge_masks(void) { // This function is guaranteed to be called only once enum WedgeMasterLineType { @@ -257,9 +238,11 @@ COLD void dav1d_init_wedge_masks(void) { hflip(master[WEDGE_OBLIQUE153], master[WEDGE_OBLIQUE27]); #define fill(w, h, sz_422, sz_420, hvsw, signs) \ - fill2d_16x2((uint8_t *) wedge_masks_444_##w##x##h, w, h, BS_##w##x##h, \ - master, wedge_codebook_16_##hvsw, wedge_masks_444_##w##x##h, \ - wedge_masks_422_##sz_422, wedge_masks_420_##sz_420, signs) + fill2d_16x2(w, h, BS_##w##x##h - BS_32x32, \ + master, wedge_codebook_16_##hvsw, \ + dav1d_masks.wedge_444_##w##x##h, \ + dav1d_masks.wedge_422_##sz_422, \ + dav1d_masks.wedge_420_##sz_420, signs) fill(32, 32, 16x32, 16x16, heqw, 0x7bfb); 
fill(32, 16, 16x16, 16x8, hltw, 0x7beb); @@ -271,72 +254,46 @@ COLD void dav1d_init_wedge_masks(void) { fill( 8, 16, 4x16, 4x8, hgtw, 0x7beb); fill( 8, 8, 4x8, 4x4, heqw, 0x7bfb); #undef fill -} - -#define N_II_PRED_MODES (N_INTER_INTRA_PRED_MODES - 1) -static uint8_t ALIGN(ii_dc_mask[32 * 32], 64); -static uint8_t ALIGN(ii_nondc_mask_32x32[N_II_PRED_MODES][32 * 32], 64); -static uint8_t ALIGN(ii_nondc_mask_16x32[N_II_PRED_MODES][16 * 32], 64); -static uint8_t ALIGN(ii_nondc_mask_16x16[N_II_PRED_MODES][16 * 16], 64); -static uint8_t ALIGN(ii_nondc_mask_8x32 [N_II_PRED_MODES][ 8 * 32], 64); -static uint8_t ALIGN(ii_nondc_mask_8x16 [N_II_PRED_MODES][ 8 * 16], 64); -static uint8_t ALIGN(ii_nondc_mask_8x8 [N_II_PRED_MODES][ 8 * 8], 64); -static uint8_t ALIGN(ii_nondc_mask_4x16 [N_II_PRED_MODES][ 4 * 16], 64); -static uint8_t ALIGN(ii_nondc_mask_4x8 [N_II_PRED_MODES][ 4 * 8], 32); -static uint8_t ALIGN(ii_nondc_mask_4x4 [N_II_PRED_MODES][ 4 * 4], 16); -#undef N_II_PRED_MODES - -#define set1(sz) \ - [II_DC_PRED] = ii_dc_mask, \ - [II_VERT_PRED] = ii_nondc_mask_##sz[II_VERT_PRED - 1], \ - [II_HOR_PRED] = ii_nondc_mask_##sz[II_HOR_PRED - 1], \ - [II_SMOOTH_PRED] = ii_nondc_mask_##sz[II_SMOOTH_PRED - 1] -#define set(sz_444, sz_422, sz_420) \ - { { set1(sz_444) }, { set1(sz_422) }, { set1(sz_420) } } -const uint8_t *dav1d_ii_masks[N_BS_SIZES][3][N_INTER_INTRA_PRED_MODES] = { - [BS_8x8] = set( 8x8, 4x8, 4x4), - [BS_8x16] = set( 8x16, 4x16, 4x8), - [BS_16x8] = set(16x16, 8x8, 8x8), - [BS_16x16] = set(16x16, 8x16, 8x8), - [BS_16x32] = set(16x32, 8x32, 8x16), - [BS_32x16] = set(32x32, 16x16, 16x16), - [BS_32x32] = set(32x32, 16x32, 16x16), -}; -#undef set -#undef set1 - -static COLD void build_nondc_ii_masks(uint8_t *const mask_v, - uint8_t *const mask_h, - uint8_t *const mask_sm, - const int w, const int h, const int step) -{ - static const uint8_t ii_weights_1d[] = { - 60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7, - 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 
1, - }; - for (int y = 0, off = 0; y < h; y++, off += w) { - memset(&mask_v[off], ii_weights_1d[y * step], w); - for (int x = 0; x < w; x++) { - mask_sm[off + x] = ii_weights_1d[imin(x, y) * step]; - mask_h[off + x] = ii_weights_1d[x * step]; - } + memset(dav1d_masks.ii_dc, 32, 32 * 32); + for (int c = 0; c < 3; c++) { + dav1d_masks.offsets[c][BS_32x32-BS_32x32].ii[II_DC_PRED] = + dav1d_masks.offsets[c][BS_32x16-BS_32x32].ii[II_DC_PRED] = + dav1d_masks.offsets[c][BS_16x32-BS_32x32].ii[II_DC_PRED] = + dav1d_masks.offsets[c][BS_16x16-BS_32x32].ii[II_DC_PRED] = + dav1d_masks.offsets[c][BS_16x8 -BS_32x32].ii[II_DC_PRED] = + dav1d_masks.offsets[c][BS_8x16 -BS_32x32].ii[II_DC_PRED] = + dav1d_masks.offsets[c][BS_8x8 -BS_32x32].ii[II_DC_PRED] = + MASK_OFFSET(dav1d_masks.ii_dc); } -} -COLD void dav1d_init_interintra_masks(void) { - // This function is guaranteed to be called only once - - memset(ii_dc_mask, 32, 32 * 32); -#define set(a) a[II_VERT_PRED - 1], a[II_HOR_PRED - 1], a[II_SMOOTH_PRED - 1] - build_nondc_ii_masks(set(ii_nondc_mask_32x32), 32, 32, 1); - build_nondc_ii_masks(set(ii_nondc_mask_16x32), 16, 32, 1); - build_nondc_ii_masks(set(ii_nondc_mask_16x16), 16, 16, 2); - build_nondc_ii_masks(set(ii_nondc_mask_8x32), 8, 32, 1); - build_nondc_ii_masks(set(ii_nondc_mask_8x16), 8, 16, 2); - build_nondc_ii_masks(set(ii_nondc_mask_8x8), 8, 8, 4); - build_nondc_ii_masks(set(ii_nondc_mask_4x16), 4, 16, 2); - build_nondc_ii_masks(set(ii_nondc_mask_4x8), 4, 8, 4); - build_nondc_ii_masks(set(ii_nondc_mask_4x4), 4, 4, 8); -#undef set +#define BUILD_NONDC_II_MASKS(w, h, step) \ + build_nondc_ii_masks(dav1d_masks.ii_nondc_##w##x##h, w, h, step) + +#define ASSIGN_NONDC_II_OFFSET(bs, w444, h444, w422, h422, w420, h420) \ + dav1d_masks.offsets[0][bs-BS_32x32].ii[p + 1] = \ + MASK_OFFSET(&dav1d_masks.ii_nondc_##w444##x##h444[p*w444*h444]); \ + dav1d_masks.offsets[1][bs-BS_32x32].ii[p + 1] = \ + MASK_OFFSET(&dav1d_masks.ii_nondc_##w422##x##h422[p*w422*h422]); \ + 
dav1d_masks.offsets[2][bs-BS_32x32].ii[p + 1] = \ + MASK_OFFSET(&dav1d_masks.ii_nondc_##w420##x##h420[p*w420*h420]) + + BUILD_NONDC_II_MASKS(32, 32, 1); + BUILD_NONDC_II_MASKS(16, 32, 1); + BUILD_NONDC_II_MASKS(16, 16, 2); + BUILD_NONDC_II_MASKS( 8, 32, 1); + BUILD_NONDC_II_MASKS( 8, 16, 2); + BUILD_NONDC_II_MASKS( 8, 8, 4); + BUILD_NONDC_II_MASKS( 4, 16, 2); + BUILD_NONDC_II_MASKS( 4, 8, 4); + BUILD_NONDC_II_MASKS( 4, 4, 8); + for (int p = 0; p < 3; p++) { + ASSIGN_NONDC_II_OFFSET(BS_32x32, 32, 32, 16, 32, 16, 16); + ASSIGN_NONDC_II_OFFSET(BS_32x16, 32, 32, 16, 16, 16, 16); + ASSIGN_NONDC_II_OFFSET(BS_16x32, 16, 32, 8, 32, 8, 16); + ASSIGN_NONDC_II_OFFSET(BS_16x16, 16, 16, 8, 16, 8, 8); + ASSIGN_NONDC_II_OFFSET(BS_16x8, 16, 16, 8, 8, 8, 8); + ASSIGN_NONDC_II_OFFSET(BS_8x16, 8, 16, 4, 16, 4, 8); + ASSIGN_NONDC_II_OFFSET(BS_8x8, 8, 8, 4, 8, 4, 4); + } } diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/wedge.h b/prog/3rdPartyLibs/codecs/dav1d/src/wedge.h index 586be98c4..244e04ad2 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/wedge.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/wedge.h @@ -30,12 +30,67 @@ #include "src/levels.h" -void dav1d_init_wedge_masks(void); -EXTERN const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */] - [2 /* sign */][16 /* wedge_idx */]; +typedef struct { + /* Offsets, in units of 8 bytes, relative to the start of the struct. 
*/ + struct { + uint16_t wedge[2 /* sign */][16 /* wedge_idx */]; + uint16_t ii[N_INTER_INTRA_PRED_MODES]; + } offsets[3 /* 444, 422, 420 */][BS_8x8 - BS_32x32 + 1]; -void dav1d_init_interintra_masks(void); -EXTERN const uint8_t *dav1d_ii_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */] - [N_INTER_INTRA_PRED_MODES]; + uint8_t ALIGN(wedge_444_32x32[ 16 * 32 * 32], 64); + uint8_t ALIGN(wedge_444_32x16[ 16 * 32 * 16], 64); + uint8_t ALIGN(wedge_444_32x8 [ 16 * 32 * 8], 64); + uint8_t ALIGN(wedge_444_16x32[ 16 * 16 * 32], 64); + uint8_t ALIGN(wedge_444_16x16[ 16 * 16 * 16], 64); + uint8_t ALIGN(wedge_444_16x8 [ 16 * 16 * 8], 64); + uint8_t ALIGN(wedge_444_8x32 [ 16 * 8 * 32], 64); + uint8_t ALIGN(wedge_444_8x16 [ 16 * 8 * 16], 64); + uint8_t ALIGN(wedge_444_8x8 [ 16 * 8 * 8], 64); + + uint8_t ALIGN(wedge_422_16x32[2 * 16 * 16 * 32], 64); + uint8_t ALIGN(wedge_422_16x16[2 * 16 * 16 * 16], 64); + uint8_t ALIGN(wedge_422_16x8 [2 * 16 * 16 * 8], 64); + uint8_t ALIGN(wedge_422_8x32 [2 * 16 * 8 * 32], 64); + uint8_t ALIGN(wedge_422_8x16 [2 * 16 * 8 * 16], 64); + uint8_t ALIGN(wedge_422_8x8 [2 * 16 * 8 * 8], 64); + uint8_t ALIGN(wedge_422_4x32 [2 * 16 * 4 * 32], 64); + uint8_t ALIGN(wedge_422_4x16 [2 * 16 * 4 * 16], 64); + uint8_t ALIGN(wedge_422_4x8 [2 * 16 * 4 * 8], 64); + + uint8_t ALIGN(wedge_420_16x16[2 * 16 * 16 * 16], 64); + uint8_t ALIGN(wedge_420_16x8 [2 * 16 * 16 * 8], 64); + uint8_t ALIGN(wedge_420_16x4 [2 * 16 * 16 * 4], 64); + uint8_t ALIGN(wedge_420_8x16 [2 * 16 * 8 * 16], 64); + uint8_t ALIGN(wedge_420_8x8 [2 * 16 * 8 * 8], 64); + uint8_t ALIGN(wedge_420_8x4 [2 * 16 * 8 * 4], 64); + uint8_t ALIGN(wedge_420_4x16 [2 * 16 * 4 * 16], 64); + uint8_t ALIGN(wedge_420_4x8 [2 * 16 * 4 * 8], 64); + uint8_t ALIGN(wedge_420_4x4 [2 * 16 * 4 * 4], 64); + + uint8_t ALIGN(ii_dc [ 32 * 32], 64); + uint8_t ALIGN(ii_nondc_32x32[3 * 32 * 32], 64); + uint8_t ALIGN(ii_nondc_16x32[3 * 16 * 32], 64); + uint8_t ALIGN(ii_nondc_16x16[3 * 16 * 16], 64); + uint8_t ALIGN(ii_nondc_8x32 [3 * 8 * 
32], 64); + uint8_t ALIGN(ii_nondc_8x16 [3 * 8 * 16], 64); + uint8_t ALIGN(ii_nondc_8x8 [3 * 8 * 8], 64); + uint8_t ALIGN(ii_nondc_4x16 [3 * 4 * 16], 64); + uint8_t ALIGN(ii_nondc_4x8 [3 * 4 * 8], 32); + uint8_t ALIGN(ii_nondc_4x4 [3 * 4 * 4], 16); +} Dav1dMasks; + +#define II_MASK(c, bs, b) \ + ((const uint8_t*)((uintptr_t)&dav1d_masks + \ + (size_t)((b)->interintra_type == INTER_INTRA_BLEND ? \ + dav1d_masks.offsets[c][(bs)-BS_32x32].ii[(b)->interintra_mode] : \ + dav1d_masks.offsets[c][(bs)-BS_32x32].wedge[0][(b)->wedge_idx]) * 8)) + +#define WEDGE_MASK(c, bs, sign, idx) \ + ((const uint8_t*)((uintptr_t)&dav1d_masks + \ + (size_t)dav1d_masks.offsets[c][(bs)-BS_32x32].wedge[sign][idx] * 8)) + +EXTERN Dav1dMasks dav1d_masks; + +void dav1d_init_ii_wedge_masks(void); #endif /* DAV1D_SRC_WEDGE_H */ diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/x86/cdef_avx2.asm b/prog/3rdPartyLibs/codecs/dav1d/src/x86/cdef_avx2.asm index c5c66c759..1f30f8a3b 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/x86/cdef_avx2.asm +++ b/prog/3rdPartyLibs/codecs/dav1d/src/x86/cdef_avx2.asm @@ -396,21 +396,17 @@ SECTION .text %macro CDEF_FILTER 2 ; w, h INIT_YMM avx2 -cglobal cdef_filter_%1x%2_8bpc, 5, 10, 0, dst, stride, left, top, bot, \ +cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \ pri, sec, dir, damping, edge %assign stack_offset_entry stack_offset mov edged, edgem cmp edged, 0xf jne .border_block - PUSH r10 PUSH r11 + PUSH r12 %if %2 == 4 - %assign regs_used 12 - %if STACK_ALIGNMENT < 32 - PUSH r%+regs_used - %assign regs_used regs_used+1 - %endif +%assign regs_used 13 ALLOC_STACK 0x60, 16 pmovzxbw xm0, [leftq+1] vpermq m0, m0, q0110 @@ -420,23 +416,15 @@ cglobal cdef_filter_%1x%2_8bpc, 5, 10, 0, dst, stride, left, top, bot, \ movu [rsp+0x28], m1 movu [rsp+0x40], m2 %elif %1 == 4 - PUSH r12 - %assign regs_used 13 - %if STACK_ALIGNMENT < 32 - PUSH r%+regs_used - %assign regs_used regs_used+1 - %endif +%assign regs_used 14 + PUSH r13 ALLOC_STACK 8*2+%1*%2*1, 16 
pmovzxwd m0, [leftq] mova [rsp+0x10], m0 %else - PUSH r12 +%assign regs_used 15 PUSH r13 - %assign regs_used 14 - %if STACK_ALIGNMENT < 32 - PUSH r%+regs_used - %assign regs_used regs_used+1 - %endif + PUSH r14 ALLOC_STACK 8*4+%1*%2*2+32, 16 lea r11, [strideq*3] movu xm4, [dstq+strideq*2] @@ -1209,11 +1197,7 @@ cglobal cdef_filter_%1x%2_8bpc, 5, 10, 0, dst, stride, left, top, bot, \ DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge %define rstk rsp %assign stack_offset stack_offset_entry -%assign regs_used 10 -%if STACK_ALIGNMENT < 32 - PUSH r%+regs_used - %assign regs_used regs_used+1 -%endif +%assign regs_used 11 ALLOC_STACK 2*16+(%2+4)*32, 16 %define px rsp+2*16+2*32 diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred.h b/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred.h index 7df563fee..29e1d9605 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred.h @@ -83,6 +83,9 @@ static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *cons init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, ssse3); init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, ssse3); init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, ssse3); + init_angular_ipred_fn(Z1_PRED, ipred_z1, ssse3); + init_angular_ipred_fn(Z2_PRED, ipred_z2, ssse3); + init_angular_ipred_fn(Z3_PRED, ipred_z3, ssse3); init_angular_ipred_fn(FILTER_PRED, ipred_filter, ssse3); init_cfl_pred_fn(DC_PRED, ipred_cfl, ssse3); diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred16_avx2.asm b/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred16_avx2.asm index 72300c2a4..f4931e977 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred16_avx2.asm +++ b/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred16_avx2.asm @@ -1936,11 +1936,7 @@ ALIGN function_align .upsample_left: ; h4/h8 mova xm0, [tlq-16] ; 8 7 6 5 4 3 2 1 movu xm1, [tlq-14] ; 7 6 5 4 3 2 1 0 -%if STACK_ALIGNMENT < 32 vpbroadcastw xm4, r8m ; pixel_max -%else - vpbroadcastw xm4, r9m 
; r8m -> r9m due to call -%endif cmp hd, 8 je .upsample_left_h8 pshufhw xm2, xm0, q2100 ; _ _ _ _ 4 4 3 2 @@ -4889,24 +4885,26 @@ cglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h jg .w32_wpad jmp .w32_hpad -cglobal pal_pred_16bpc, 4, 6, 5, dst, stride, pal, idx, w, h - vbroadcasti128 m3, [palq] +cglobal pal_pred_16bpc, 4, 6, 6, dst, stride, pal, idx, w, h + vbroadcasti128 m4, [palq] lea r2, [pal_pred_16bpc_avx2_table] tzcnt wd, wm - vbroadcasti128 m4, [pal_pred_shuf] + vbroadcasti128 m5, [pal_pred_shuf] movifnidn hd, hm movsxd wq, [r2+wq*4] - pshufb m3, m4 - punpckhqdq m4, m3, m3 + pshufb m4, m5 + punpckhqdq m5, m4, m4 add wq, r2 DEFINE_ARGS dst, stride, stride3, idx, w, h lea stride3q, [strideq*3] jmp wq .w4: - mova xm2, [idxq] - add idxq, 16 - pshufb xm1, xm3, xm2 - pshufb xm2, xm4, xm2 + movq xm0, [idxq] + add idxq, 8 + psrlw xm1, xm0, 4 + punpcklbw xm0, xm1 + pshufb xm1, xm4, xm0 + pshufb xm2, xm5, xm0 punpcklbw xm0, xm1, xm2 punpckhbw xm1, xm2 movq [dstq+strideq*0], xm0 @@ -4918,10 +4916,12 @@ DEFINE_ARGS dst, stride, stride3, idx, w, h jg .w4 RET .w8: - movu m2, [idxq] ; only 16-byte alignment - add idxq, 32 - pshufb m1, m3, m2 - pshufb m2, m4, m2 + pmovzxbw m2, [idxq] + add idxq, 16 + psllw m1, m2, 4 + por m2, m1 + pshufb m1, m4, m2 + pshufb m2, m5, m2 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+strideq*0], xm0 @@ -4933,19 +4933,22 @@ DEFINE_ARGS dst, stride, stride3, idx, w, h jg .w8 RET .w16: - vpermq m2, [idxq+ 0], q3120 - vpermq m5, [idxq+32], q3120 - add idxq, 64 - pshufb m1, m3, m2 - pshufb m2, m4, m2 + pshufd m3, [idxq], q3120 + add idxq, 32 + vpermq m3, m3, q3120 + psrlw m1, m3, 4 + punpcklbw m2, m3, m1 + punpckhbw m3, m1 + pshufb m1, m4, m2 + pshufb m2, m5, m2 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 - pshufb m1, m3, m5 - pshufb m2, m4, m5 - punpcklbw m0, m1, m2 - punpckhbw m1, m2 + pshufb m1, m4, m3 + pshufb m3, m5, m3 + punpcklbw m0, m1, m3 + punpckhbw m1, m3 mova 
[dstq+strideq*2], m0 mova [dstq+stride3q ], m1 lea dstq, [dstq+strideq*4] @@ -4953,41 +4956,47 @@ DEFINE_ARGS dst, stride, stride3, idx, w, h jg .w16 RET .w32: - vpermq m2, [idxq+ 0], q3120 - vpermq m5, [idxq+32], q3120 - add idxq, 64 - pshufb m1, m3, m2 - pshufb m2, m4, m2 - punpcklbw m0, m1, m2 - punpckhbw m1, m2 - mova [dstq+strideq*0+ 0], m0 - mova [dstq+strideq*0+32], m1 - pshufb m1, m3, m5 - pshufb m2, m4, m5 + pshufd m3, [idxq], q3120 + add idxq, 32 + vpermq m3, m3, q3120 + psrlw m1, m3, 4 + punpcklbw m2, m3, m1 + punpckhbw m3, m1 + pshufb m1, m4, m2 + pshufb m2, m5, m2 punpcklbw m0, m1, m2 punpckhbw m1, m2 - mova [dstq+strideq*1+ 0], m0 - mova [dstq+strideq*1+32], m1 + mova [dstq+ 0], m0 + mova [dstq+32], m1 + pshufb m1, m4, m3 + pshufb m3, m5, m3 + punpcklbw m0, m1, m3 + punpckhbw m1, m3 + mova [dstq+strideq+ 0], m0 + mova [dstq+strideq+32], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32 RET .w64: - vpermq m2, [idxq+ 0], q3120 - vpermq m5, [idxq+32], q3120 - add idxq, 64 - pshufb m1, m3, m2 - pshufb m2, m4, m2 - punpcklbw m0, m1, m2 - punpckhbw m1, m2 - mova [dstq+ 0], m0 - mova [dstq+32], m1 - pshufb m1, m3, m5 - pshufb m2, m4, m5 + pshufd m3, [idxq], q3120 + add idxq, 32 + vpermq m3, m3, q3120 + psrlw m1, m3, 4 + punpcklbw m2, m3, m1 + punpckhbw m3, m1 + pshufb m1, m4, m2 + pshufb m2, m5, m2 punpcklbw m0, m1, m2 punpckhbw m1, m2 - mova [dstq+64], m0 - mova [dstq+96], m1 + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + pshufb m1, m4, m3 + pshufb m3, m5, m3 + punpcklbw m0, m1, m3 + punpckhbw m1, m3 + mova [dstq+32*2], m0 + mova [dstq+32*3], m1 add dstq, strideq dec hd jg .w64 diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred16_avx512.asm b/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred16_avx512.asm index 1a307adc9..60f08d71c 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred16_avx512.asm +++ b/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred16_avx512.asm @@ -38,10 +38,10 @@ smooth_perm: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 
db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62 db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94 db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126 -pal_pred_perm: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39 - db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 - db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 - db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 +pal_pred_perm: db 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51 + db 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55 + db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59 + db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63 filter_permA: times 4 db 6, 7, 8, 9, 14, 15, 4, 5 times 4 db 10, 11, 12, 13, 2, 3, -1, -1 filter_permB: times 4 db 22, 23, 24, 25, 30, 31, 6, 7 @@ -57,6 +57,8 @@ filter_shift: times 2 dw 6 dd 0 times 2 dw 4 dd 9 +pal_unpack: db 0, 8, 4, 12, 32, 40, 36, 44 + db 16, 24, 20, 28, 48, 56, 52, 60 %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) @@ -610,20 +612,23 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3 jg .w64_loop RET -cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3 +cglobal pal_pred_16bpc, 4, 7, 7, dst, stride, pal, idx, w, h, stride3 lea r6, [pal_pred_16bpc_avx512icl_table] tzcnt wd, wm - mova m2, [pal_pred_perm] - movsxd wq, [r6+wq*4] - mova xm3, [palq] + mova m3, [pal_pred_perm] movifnidn hd, hm + movsxd wq, [r6+wq*4] + vpbroadcastq m4, [pal_unpack+0] + vpbroadcastq m5, [pal_unpack+8] add wq, r6 + vbroadcasti32x4 m6, [palq] lea stride3q, [strideq*3] jmp wq .w4: - pmovzxbw ym0, [idxq] - add idxq, 16 - vpermw ym0, ym0, ym3 + pmovzxbd ym0, [idxq] + add idxq, 8 + vpmultishiftqb ym0, ym4, ym0 + vpermw ym0, ym0, ym6 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 @@ -634,9 +639,10 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, 
pal, idx, w, h, stride3 jg .w4 RET .w8: - pmovzxbw m0, [idxq] - add idxq, 32 - vpermw m0, m0, m3 + pmovzxbd m0, [idxq] + add idxq, 16 + vpmultishiftqb m0, m4, m0 + vpermw m0, m0, m6 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 @@ -646,11 +652,13 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3 jg .w8 RET .w16: - vpermb m1, m2, [idxq] - add idxq, 64 - vpermw m0, m1, m3 + movu ym1, [idxq] + add idxq, 32 + vpermb m1, m3, m1 + vpmultishiftqb m1, m4, m1 + vpermw m0, m1, m6 psrlw m1, 8 - vpermw m1, m1, m3 + vpermw m1, m1, m6 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 @@ -660,27 +668,41 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3 jg .w16 RET .w32: - vpermb m1, m2, [idxq] + vpermb m2, m3, [idxq] add idxq, 64 - vpermw m0, m1, m3 + vpmultishiftqb m1, m4, m2 + vpmultishiftqb m2, m5, m2 + vpermw m0, m1, m6 psrlw m1, 8 - vpermw m1, m1, m3 + vpermw m1, m1, m6 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 - lea dstq, [dstq+strideq*2] - sub hd, 2 + vpermw m0, m2, m6 + psrlw m2, 8 + vpermw m1, m2, m6 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m1 + lea dstq, [dstq+strideq*4] + sub hd, 4 jg .w32 RET .w64: - vpermb m1, m2, [idxq] + vpermb m2, m3, [idxq] add idxq, 64 - vpermw m0, m1, m3 + vpmultishiftqb m1, m4, m2 + vpmultishiftqb m2, m5, m2 + vpermw m0, m1, m6 psrlw m1, 8 - vpermw m1, m1, m3 - mova [dstq+64*0], m0 - mova [dstq+64*1], m1 - add dstq, strideq - dec hd + vpermw m1, m1, m6 + mova [dstq+ 0], m0 + mova [dstq+64], m1 + vpermw m0, m2, m6 + psrlw m2, 8 + vpermw m1, m2, m6 + mova [dstq+strideq+ 0], m0 + mova [dstq+strideq+64], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 jg .w64 RET diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred16_sse.asm b/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred16_sse.asm index 07ea9567e..5a311b144 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred16_sse.asm +++ 
b/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred16_sse.asm @@ -30,14 +30,46 @@ SECTION_RODATA filter_shuf: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 +z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 +z_base_inc_z2: dw 7*64, 6*64, 5*64, 4*64, 3*64, 2*64, 1*64, 0*64 +z_upsample: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 +z2_upsample_l: db -1, -1, -2, -1, -3, -1, -4, -1, 8, 9, 8, 9, 10, 11, 12, 13 + db 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 +z2_top_shufA: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 +z2_top_shufB: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 +z2_left_shufA: db 14, 15, 12, 13, 10, 11, 8, 9, 12, 13, 10, 11, 8, 9, 6, 7 +z2_left_shufB: db 14, 15, 10, 11, 6, 7, 2, 3, 12, 13, 8, 9, 4, 5, 0, 1 +z_filt_wh16: db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1 +z_filt_t_w48: db 55,127, 7,127, 15, 31, 39, 31,127, 39,127, 39, 7, 15, 31, 15 + db 39, 63, 3, 63, 3, 3, 19, 3, 47, 19, 47, 19, 3, 3, 3, 3 +z_filt_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, 0 +z_filt_wh4: db 7, 7, 19, 7, +z_filt_wh8: db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39 +ALIGN 8 +pb_2_3: times 4 db 2, 3 +z2_dy_offset: dw 96*64, 96*64, 95*64, 95*64 +z_filt_k: times 4 dw 8 + times 4 dw 6 + times 4 dw 4 + times 4 dw 5 +pw_m3584: times 4 dw -3584 +pw_m3072: times 4 dw -3072 +pw_m2560: times 4 dw -2560 +pw_m2048: times 4 dw -2048 +pw_m1536: times 4 dw -1536 +pw_m1024: times 4 dw -1024 +pw_m512: times 4 dw -512 +pw_1: times 4 dw 1 +pw_2: times 4 dw 2 +pw_3: times 4 dw 3 +pw_62: times 4 dw 62 +pw_256: times 4 dw 256 +pw_512: times 4 dw 512 +pw_2048: times 4 dw 2048 -pb_0_1: times 4 db 0, 1 -pb_2_3: times 4 db 2, 3 -pw_1: times 4 dw 1 -pw_2: times 4 dw 2 -pw_4: times 4 dw 4 -pw_512: times 4 dw 512 -pw_2048: times 4 dw 2048 +%define pw_4 (z_filt_k+8*2) +%define pw_8 (z_filt_k+8*0) +%define pw_m1to4 z2_upsample_l 
%macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) @@ -58,6 +90,9 @@ JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32 s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \ s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4 JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z1_16bpc, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z2_16bpc, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z3_16bpc, ssse3, h4, h8, h16, h32, h64 JMP_TABLE ipred_cfl_16bpc, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 JMP_TABLE ipred_cfl_left_16bpc, ssse3, h4, h8, h16, h32 @@ -66,6 +101,7 @@ JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64 cextern smooth_weights_1d_16bpc cextern smooth_weights_2d_16bpc +cextern dr_intra_derivative cextern filter_intra_taps SECTION .text @@ -443,7 +479,7 @@ cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 LEA r5, ipred_h_16bpc_ssse3_table movifnidn hd, hm movsxd wq, [r5+wq*4] - movddup m2, [base+pb_0_1] + movddup m2, [base+pw_256] movddup m3, [base+pb_2_3] add wq, r5 lea stride3q, [strideq*3] @@ -577,7 +613,7 @@ cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left %assign regs_used 8 %endif %if ARCH_X86_64 - movddup m8, [pb_0_1] + movddup m8, [pw_256] %endif lea tlq, [tlq+wq*2+2] neg wq @@ -881,6 +917,2123 @@ cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \ %endif RET +%if ARCH_X86_64 +cglobal ipred_z1_16bpc, 3, 8, 8, 16*18, dst, stride, tl, w, h, angle, dx + %define base r7-$$ + %define bdmaxm r8m + lea r7, [$$] +%else +cglobal ipred_z1_16bpc, 3, 7, 8, -16*18, dst, stride, tl, w, h, angle, dx + %define base r1-$$ + %define stridemp [rsp+4*0] + %define bdmaxm [rsp+4*1] + mov r3, r8m + mov stridemp, r1 + mov bdmaxm, r3 + LEA r1, $$ +%endif + tzcnt wd, wm + movifnidn angled, anglem + movifnidn hd, hm + add tlq, 2 + movsxd wq, [base+ipred_z1_16bpc_ssse3_table+wq*4] + mov dxd, angled + movddup m0, [base+pw_256] + and dxd, 0x7e + movddup m7, 
[base+pw_62] + add angled, 165 ; ~90 + lea wq, [base+wq+ipred_z1_16bpc_ssse3_table] + movzx dxd, word [base+dr_intra_derivative+dxq] + xor angled, 0x4ff ; d = 90 - angle + jmp wq +.w4: + lea r3d, [angleq+88] + test r3d, 0x480 + jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40 + sar r3d, 9 + add r3d, hd + cmp r3d, 8 + jg .w4_no_upsample ; h > 8 || (w == h && is_sm) + movd m3, [tlq+14] + movu m2, [tlq+ 0] ; 1 2 3 4 5 6 7 8 + movd m1, bdmaxm + pshufb m3, m0 + palignr m4, m3, m2, 4 ; 3 4 5 6 7 8 8 8 + paddw m4, [tlq- 2] ; 0 1 2 3 4 5 6 7 + add dxd, dxd + mova [rsp+32], m3 + palignr m3, m2, 2 ; 2 3 4 5 6 7 8 8 + pshufb m1, m0 + paddw m3, m2 ; -1 * a + 9 * b + 9 * c + -1 * d + psubw m5, m3, m4 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4 + movd m4, dxd + psraw m5, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1 + paddw m3, m5 + pxor m5, m5 + pmaxsw m3, m5 + mov r3d, dxd + pavgw m3, m5 + pshufb m4, m0 + pminsw m3, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + mova m3, [base+z_upsample] + movifnidn strideq, stridemp + mova [rsp+ 0], m1 + paddw m5, m4, m4 + mova [rsp+16], m2 + punpcklqdq m4, m5 ; xpos0 xpos1 +.w4_upsample_loop: + lea r2d, [r3+dxq] + shr r3d, 6 ; base0 + movu m1, [rsp+r3*2] + lea r3d, [r2+dxq] + shr r2d, 6 ; base1 + movu m2, [rsp+r2*2] + pshufb m1, m3 + pshufb m2, m3 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + pand m2, m7, m4 ; frac + psllw m2, 9 ; (a * (64 - frac) + b * frac + 32) >> 6 + psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6) + pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15) + paddw m4, m5 ; xpos += dx + paddw m0, m1 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w4_upsample_loop + RET +.w4_no_upsample: + mov r3d, 7 ; max_base + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w4_main + lea r3d, [hq+3] + movd m1, r3d + movd m3, angled + shr angled, 8 ; is_sm << 1 + pxor m2, m2 + pshufb m1, m2 + pshufb m3, m2 + pcmpeqb m1, [base+z_filt_wh4] + pand m1, m3 + 
pcmpgtb m1, [base+z_filt_t_w48+angleq*8] + pmovmskb r5d, m1 + mov r3d, 7 + test r5d, r5d + jz .w4_main ; filter_strength == 0 + pshuflw m1, [tlq-2], q0000 + movu m2, [tlq+16*0] + imul r5d, 0x55555555 + movd m3, [tlq+r3*2] + shr r5d, 30 ; filter_strength + movd [rsp+12], m1 + pshuflw m3, m3, q0000 + mova [rsp+16*1], m2 + lea r2d, [r3+2] + movq [rsp+r3*2+18], m3 + cmp hd, 8 + cmovae r3d, r2d + lea tlq, [rsp+16*1] + call .filter_edge +.w4_main: + lea tlq, [tlq+r3*2] + movd m4, dxd + movddup m1, [base+z_base_inc] ; base_inc << 6 + movd m6, [tlq] ; top[max_base_x] + shl r3d, 6 + movd m3, r3d + pshufb m4, m0 + mov r5d, dxd ; xpos + pshufb m6, m0 + sub r5, r3 + pshufb m3, m0 + paddw m5, m4, m4 + psubw m3, m1 ; max_base_x + punpcklqdq m4, m5 ; xpos0 xpos1 + movifnidn strideq, stridemp +.w4_loop: + lea r3, [r5+dxq] + sar r5, 6 ; base0 + movq m0, [tlq+r5*2+0] + movq m1, [tlq+r5*2+2] + lea r5, [r3+dxq] + sar r3, 6 ; base1 + movhps m0, [tlq+r3*2+0] + movhps m1, [tlq+r3*2+2] + pand m2, m7, m4 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + pcmpgtw m2, m3, m4 ; xpos < max_base_x + paddw m4, m5 ; xpos += dx + paddw m0, m1 + pand m0, m2 + pandn m2, m6 + por m0, m2 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + sub hd, 2 + jz .w4_end + lea dstq, [dstq+strideq*2] + test r5d, r5d + jl .w4_loop +.w4_end_loop: + movq [dstq+strideq*0], m6 + movq [dstq+strideq*1], m6 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w4_end_loop +.w4_end: + RET +.w8: + lea r3d, [angleq+88] + and r3d, ~0x7f + or r3d, hd + cmp r3d, 8 + ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 + movu m1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 + movu m5, [tlq+ 2] ; 2 3 4 5 6 7 8 9 + movu m3, [tlq+ 4] ; 3 4 5 6 7 8 9 a + paddw m5, m1 + paddw m3, [tlq- 2] ; 0 1 2 3 4 5 6 7 + psubw m2, m5, m3 + movu m6, [tlq+18] ; a b c d e f g _ + psraw m2, 3 + movu m3, [tlq+20] ; b c d e f g _ _ + paddw m5, m2 + movu m2, [tlq+16] ; 9 a b c d e f g + paddw m6, m2 + add dxd, dxd + cmp hd, 4 + jne .w8_upsample_h8 
; awkward single-pixel edge case + pshuflw m3, m3, q1110 ; b c c _ _ _ _ _ +.w8_upsample_h8: + paddw m3, [tlq+14] ; 8 9 a b c d e f + psubw m4, m6, m3 + movd m3, bdmaxm + psraw m4, 3 + mov r3d, dxd + paddw m6, m4 + pxor m4, m4 + pmaxsw m5, m4 + pmaxsw m6, m4 + pshufb m3, m0 + pavgw m5, m4 + pavgw m6, m4 + movd m4, dxd + pminsw m5, m3 + pminsw m6, m3 + mova m3, [base+z_upsample] + pshufb m4, m0 + movifnidn strideq, stridemp + punpcklwd m0, m1, m5 + mova [rsp+ 0], m0 + punpckhwd m1, m5 + mova [rsp+16], m1 + punpcklwd m0, m2, m6 + mova [rsp+32], m0 + punpckhwd m2, m6 + mova [rsp+48], m2 + mova m5, m4 +.w8_upsample_loop: + mov r2d, r3d + shr r2d, 6 + movu m1, [rsp+r2*2+ 0] + movu m2, [rsp+r2*2+16] + add r3d, dxd + pshufb m1, m3 + pshufb m2, m3 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + pand m2, m7, m4 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m4, m5 + paddw m0, m1 + mova [dstq], m0 + add dstq, strideq + dec hd + jg .w8_upsample_loop + RET +.w8_no_upsample: + lea r3d, [hq+7] + movd m1, r3d + and r3d, 7 + or r3d, 8 ; imin(h+7, 15) + test angled, 0x400 + jnz .w8_main + movd m3, angled + shr angled, 8 ; is_sm << 1 + pxor m2, m2 + pshufb m1, m2 + pshufb m3, m2 + movu m2, [base+z_filt_wh8] + psrldq m4, [base+z_filt_t_w48+angleq*8], 4 + pcmpeqb m2, m1 + pand m2, m3 + pcmpgtb m2, m4 + pmovmskb r5d, m2 + test r5d, r5d + jz .w8_main ; filter_strength == 0 + pshuflw m1, [tlq-2], q0000 + movu m2, [tlq+16*0] + imul r5d, 0x55555555 + movu m3, [tlq+16*1] + movd m4, [tlq+r3*2] + shr r5d, 30 ; filter_strength + movd [rsp+12], m1 + mova [rsp+16*1], m2 + pshuflw m4, m4, q0000 + mova [rsp+16*2], m3 + lea r2d, [r3+2] + movq [rsp+r3*2+18], m4 + cmp hd, 16 + cmovae r3d, r2d + lea tlq, [rsp+16*1] + call .filter_edge +.w8_main: + lea tlq, [tlq+r3*2] + movd m5, dxd + mova m4, [base+z_base_inc] + shl r3d, 6 + movd m6, [tlq] ; top[max_base_x] + movd m1, r3d + pshufb m5, m0 + mov r5d, dxd ; xpos + pshufb m1, m0 + sub r5, r3 + psubw m4, m1 ; max_base_x + pshufb m6, m0 + paddw m4, m5 
+ movifnidn strideq, stridemp +.w8_loop: + mov r3, r5 + sar r3, 6 + movu m0, [tlq+r3*2+0] + movu m1, [tlq+r3*2+2] + pand m2, m7, m4 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + psraw m2, m4, 15 ; xpos < max_base_x + paddw m4, m5 ; xpos += dx + paddw m0, m1 + pand m0, m2 + pandn m2, m6 + por m0, m2 + mova [dstq], m0 + dec hd + jz .w8_end + add dstq, strideq + add r5, dxq + jl .w8_loop +.w8_end_loop: + mova [dstq], m6 + add dstq, strideq + dec hd + jg .w8_end_loop +.w8_end: + RET +.w16: +%if ARCH_X86_32 + %define strideq r3 +%endif + lea r3d, [hq+15] + movd m1, r3d + and r3d, 15 + or r3d, 16 ; imin(h+15, 31) + test angled, 0x400 + jnz .w16_main + movd m3, angled + shr angled, 8 ; is_sm << 1 + pxor m2, m2 + pshufb m1, m2 + pshufb m3, m2 + movq m4, [base+z_filt_t_w16+angleq*4] + pcmpeqb m1, [base+z_filt_wh16] + pand m1, m3 + pcmpgtb m1, m4 + pmovmskb r5d, m1 + test r5d, r5d + jz .w16_main ; filter_strength == 0 + pshuflw m1, [tlq-2], q0000 + movu m2, [tlq+16*0] + imul r5d, 0x24924924 + movu m3, [tlq+16*1] + movu m4, [tlq+16*2] + shr r5d, 30 + movu m5, [tlq+16*3] + movd m6, [tlq+r3*2] + adc r5d, -1 ; filter_strength + movd [rsp+12], m1 + mova [rsp+16*1], m2 + mova [rsp+16*2], m3 + pshuflw m6, m6, q0000 + mova [rsp+16*3], m4 + mova [rsp+16*4], m5 + lea r2d, [r3+2] + movq [rsp+r3*2+18], m6 + cmp hd, 32 + cmovae r3d, r2d + lea tlq, [rsp+16*1] + call .filter_edge +.w16_main: + lea tlq, [tlq+r3*2] + movd m5, dxd + mova m4, [base+z_base_inc] + shl r3d, 6 + movd m6, [tlq] ; top[max_base_x] + movd m1, r3d + pshufb m5, m0 + mov r5d, dxd ; xpos + pshufb m1, m0 + sub r5, r3 + psubw m4, m1 ; max_base_x + pshufb m6, m0 + paddw m4, m5 +.w16_loop: + mov r3, r5 + sar r3, 6 + movu m0, [tlq+r3*2+ 0] + movu m2, [tlq+r3*2+ 2] + pand m3, m7, m4 + psllw m3, 9 + psubw m2, m0 + pmulhrsw m2, m3 + movu m1, [tlq+r3*2+16] + paddw m0, m2 + movu m2, [tlq+r3*2+18] + psubw m2, m1 + pmulhrsw m2, m3 + movddup m3, [base+pw_m512] + paddw m1, m2 + psraw m2, m4, 15 + pcmpgtw m3, m4 + paddw m4, m5 + pand 
m0, m2 + pandn m2, m6 + pand m1, m3 + pandn m3, m6 + por m0, m2 + mova [dstq+16*0], m0 + por m1, m3 + mova [dstq+16*1], m1 + dec hd + jz .w16_end + movifnidn strideq, stridemp + add dstq, strideq + add r5, dxq + jl .w16_loop +.w16_end_loop: + mova [dstq+16*0], m6 + mova [dstq+16*1], m6 + add dstq, strideq + dec hd + jg .w16_end_loop +.w16_end: + RET +.w32: + lea r3d, [hq+31] + and r3d, 31 + or r3d, 32 ; imin(h+31, 63) + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w32_main + call .filter_copy + lea r5d, [r3+2] + cmp hd, 64 + cmove r3d, r5d + call .filter_edge_s3 +.w32_main: + lea tlq, [tlq+r3*2] + movd m5, dxd + mova m4, [base+z_base_inc] + shl r3d, 6 + movd m6, [tlq] ; top[max_base_x] + movd m1, r3d + pshufb m5, m0 + mov r5d, dxd ; xpos + pshufb m1, m0 + sub r5, r3 + psubw m4, m1 ; max_base_x + pshufb m6, m0 + paddw m4, m5 +.w32_loop: + mov r3, r5 + sar r3, 6 + movu m0, [tlq+r3*2+ 0] + movu m2, [tlq+r3*2+ 2] + pand m3, m7, m4 + psllw m3, 9 + psubw m2, m0 + pmulhrsw m2, m3 + movu m1, [tlq+r3*2+16] + paddw m0, m2 + movu m2, [tlq+r3*2+18] + psubw m2, m1 + pmulhrsw m2, m3 + paddw m1, m2 + psraw m2, m4, 15 + pand m0, m2 + pandn m2, m6 + por m0, m2 + movddup m2, [base+pw_m512] + pcmpgtw m2, m4 + pand m1, m2 + pandn m2, m6 + mova [dstq+16*0], m0 + por m1, m2 + mova [dstq+16*1], m1 + movu m0, [tlq+r3*2+32] + movu m2, [tlq+r3*2+34] + psubw m2, m0 + pmulhrsw m2, m3 + movu m1, [tlq+r3*2+48] + paddw m0, m2 + movu m2, [tlq+r3*2+50] + psubw m2, m1 + pmulhrsw m2, m3 + paddw m1, m2 + movddup m2, [base+pw_m1024] + movddup m3, [base+pw_m1536] + pcmpgtw m2, m4 + pcmpgtw m3, m4 + paddw m4, m5 + pand m0, m2 + pandn m2, m6 + pand m1, m3 + pandn m3, m6 + por m0, m2 + mova [dstq+16*2], m0 + por m1, m3 + mova [dstq+16*3], m1 + dec hd + jz .w32_end + movifnidn strideq, stridemp + add dstq, strideq + add r5, dxq + jl .w32_loop +.w32_end_loop: + REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3 + add dstq, strideq + dec hd + jg .w32_end_loop +.w32_end: + RET +.w64: + lea r3d, [hq+63] + test 
angled, 0x400 ; !enable_intra_edge_filter + jnz .w64_main + call .filter_copy + call .filter_edge_s3 +.w64_main: + lea tlq, [tlq+r3*2] + movd m5, dxd + mova m4, [base+z_base_inc] + shl r3d, 6 + movd m6, [tlq] ; top[max_base_x] + movd m1, r3d + pshufb m5, m0 + mov r5d, dxd ; xpos + pshufb m1, m0 + sub r5, r3 + psubw m4, m1 ; max_base_x + pshufb m6, m0 + paddw m4, m5 +.w64_loop: + mov r3, r5 + sar r3, 6 + movu m0, [tlq+r3*2+ 0] + movu m2, [tlq+r3*2+ 2] + pand m3, m7, m4 + psllw m3, 9 + psubw m2, m0 + pmulhrsw m2, m3 + movu m1, [tlq+r3*2+16] + paddw m0, m2 + movu m2, [tlq+r3*2+18] + psubw m2, m1 + pmulhrsw m2, m3 + paddw m1, m2 + psraw m2, m4, 15 + pand m0, m2 + pandn m2, m6 + por m0, m2 + movddup m2, [base+pw_m512] + pcmpgtw m2, m4 + pand m1, m2 + pandn m2, m6 + mova [dstq+16*0], m0 + por m1, m2 + mova [dstq+16*1], m1 + movu m0, [tlq+r3*2+32] + movu m2, [tlq+r3*2+34] + psubw m2, m0 + pmulhrsw m2, m3 + movu m1, [tlq+r3*2+48] + paddw m0, m2 + movu m2, [tlq+r3*2+50] + psubw m2, m1 + pmulhrsw m2, m3 + paddw m1, m2 + movddup m2, [base+pw_m1024] + pcmpgtw m2, m4 + pand m0, m2 + pandn m2, m6 + por m0, m2 + movddup m2, [base+pw_m1536] + pcmpgtw m2, m4 + pand m1, m2 + pandn m2, m6 + mova [dstq+16*2], m0 + por m1, m2 + mova [dstq+16*3], m1 + movu m0, [tlq+r3*2+64] + movu m2, [tlq+r3*2+66] + psubw m2, m0 + pmulhrsw m2, m3 + movu m1, [tlq+r3*2+80] + paddw m0, m2 + movu m2, [tlq+r3*2+82] + psubw m2, m1 + pmulhrsw m2, m3 + paddw m1, m2 + movddup m2, [base+pw_m2048] + pcmpgtw m2, m4 + pand m0, m2 + pandn m2, m6 + por m0, m2 + movddup m2, [base+pw_m2560] + pcmpgtw m2, m4 + pand m1, m2 + pandn m2, m6 + mova [dstq+16*4], m0 + por m1, m2 + mova [dstq+16*5], m1 + movu m0, [tlq+r3*2+96] + movu m2, [tlq+r3*2+98] + psubw m2, m0 + pmulhrsw m2, m3 + movu m1, [tlq+r3*2+112] + paddw m0, m2 + movu m2, [tlq+r3*2+114] + psubw m2, m1 + pmulhrsw m2, m3 + paddw m1, m2 + movddup m2, [base+pw_m3072] + movddup m3, [base+pw_m3584] + pcmpgtw m2, m4 + pcmpgtw m3, m4 + paddw m4, m5 + pand m0, m2 + pandn 
m2, m6 + pand m1, m3 + pandn m3, m6 + por m0, m2 + mova [dstq+16*6], m0 + por m1, m3 + mova [dstq+16*7], m1 + dec hd + jz .w64_end + movifnidn strideq, stridemp + add dstq, strideq + add r5, dxq + jl .w64_loop +.w64_end_loop: + REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 + add dstq, strideq + dec hd + jg .w64_end_loop +.w64_end: + RET +ALIGN function_align +.filter_copy: + pshuflw m2, [tlq-2], q0000 + pshuflw m3, [tlq+r3*2], q0000 + xor r5d, r5d + movd [rsp+gprsize+12], m2 +.filter_copy_loop: + movu m1, [tlq+r5*2+16*0] + movu m2, [tlq+r5*2+16*1] + add r5d, 16 + mova [rsp+r5*2+gprsize-16*1], m1 + mova [rsp+r5*2+gprsize-16*0], m2 + cmp r5d, r3d + jle .filter_copy_loop + lea tlq, [rsp+gprsize+16*1] + movq [tlq+r3*2+2], m3 + ret +.filter_edge: + cmp r5d, 3 + je .filter_edge_s3 + movddup m4, [base+z_filt_k+r5*8-8] + movddup m5, [base+z_filt_k+r5*8+8] + xor r5d, r5d + movddup m6, [base+pw_8] + movu m2, [tlq-2] + jmp .filter_edge_start +.filter_edge_loop: + movu m2, [tlq+r5*2-2] + mova [tlq+r5*2-16], m1 +.filter_edge_start: + pmullw m1, m4, [tlq+r5*2] + movu m3, [tlq+r5*2+2] + paddw m2, m3 + pmullw m2, m5 + add r5d, 8 + paddw m1, m6 + paddw m1, m2 + psrlw m1, 4 + cmp r5d, r3d + jl .filter_edge_loop + mova [tlq+r5*2-16], m1 + ret +.filter_edge_s3: + movddup m5, [base+pw_3] + xor r5d, r5d + movu m2, [tlq-2] + movu m3, [tlq-4] + jmp .filter_edge_s3_start +.filter_edge_s3_loop: + movu m2, [tlq+r5*2-2] + movu m3, [tlq+r5*2-4] + mova [tlq+r5*2-16], m1 +.filter_edge_s3_start: + paddw m2, [tlq+r5*2+0] + paddw m3, m5 + movu m1, [tlq+r5*2+2] + movu m4, [tlq+r5*2+4] + add r5d, 8 + paddw m1, m2 + pavgw m3, m4 + paddw m1, m3 + psrlw m1, 2 + cmp r5d, r3d + jl .filter_edge_s3_loop + mova [tlq+r5*2-16], m1 + ret + +%if ARCH_X86_64 +cglobal ipred_z2_16bpc, 4, 12, 11, 16*24, dst, stride, tl, w, h, angle, dx, _, dy + %define base r7-$$ + %define maxwm r6m + %define maxhm r7m + %define bdmaxm r8m + lea r7, [$$] + mov hd, hm + movddup m8, [base+pw_62] + lea r9d, [wq-4] + shl r9d, 6 + 
mova m9, [base+z2_top_shufA] + or r9d, hd + mova m10, [base+z2_left_shufA] +%else +cglobal ipred_z2_16bpc, 4, 7, 8, -16*27, dst, _, tl, w, h, angle, dx + %define base r1-$$ + %define r9b byte [rsp+16*26+4*0] + %define r9d dword [rsp+16*26+4*0] + %define r10d dword [rsp+16*26+4*1] + %define r11d dword [rsp+16*26+4*2] + %define maxwm [rsp+16*2+4*0] + %define maxhm [rsp+16*2+4*1] + %define bdmaxm [rsp+16*2+4*2] + %define stridemp [rsp+16*26+4*3] + %define strideq r3 + %define dyd r4 + %define dyq r4 + mov stridemp, r1 + mov r1d, r6m + mov r4d, r7m + mov r5d, r8m + mov maxwm, r1d + mov maxhm, r4d + mov bdmaxm, r5d + LEA r1, $$ + lea hd, [wq-4] + mova m0, [base+z2_top_shufA] + shl hd, 6 + mova m1, [base+z2_left_shufA] + or hd, hm + mova [rsp+16*24], m0 + mov r9d, hd + mova [rsp+16*25], m1 +%endif + tzcnt wd, wd + movifnidn angled, anglem + mova m0, [tlq-16*8] + mova m1, [tlq-16*7] + mova m2, [tlq-16*6] + mova m3, [tlq-16*5] + movsxd wq, [base+ipred_z2_16bpc_ssse3_table+wq*4] +%if ARCH_X86_64 + movzx dxd, angleb +%else + movzx dxd, byte anglem +%endif + mova m4, [tlq-16*4] + mova m5, [tlq-16*3] + mova m6, [tlq-16*2] + mova m7, [tlq-16*1] + mova [rsp+16* 5], m0 + xor angled, 0x400 + mova [rsp+16* 6], m1 + mov dyd, dxd + mova [rsp+16* 7], m2 + neg dxq + mova [rsp+16* 8], m3 + and dyd, ~1 + mova [rsp+16* 9], m4 + and dxq, ~1 + mova [rsp+16*10], m5 + lea wq, [base+ipred_z2_16bpc_ssse3_table+wq] + mova [rsp+16*11], m6 + pxor m3, m3 + mova [rsp+16*12], m7 + movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90 + movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle + movddup m0, [base+pw_256] ; 4<<6 + movd m4, [tlq] + movu m5, [tlq+16*0+2] + movu m6, [tlq+16*1+2] + movsldup m1, [base+z2_dy_offset] + pshufb m4, m0 + movq m7, [base+z_base_inc+2] + mov r11d, (112-4)<<6 + mova [rsp+16*13], m4 + neg dxd + mova [rsp+16*14], m5 + or dyd, 4<<16 + mova [rsp+16*15], m6 +%if ARCH_X86_64 + lea r10d, [dxq+(112<<6)] ; xpos +%else + mov [rsp+8*3], dyd + lea r4d, 
[dxq+(112<<6)] + mov r10d, r4d + movzx hd, r9b +%endif + movq [rsp+8*0], m1 + movq [rsp+8*1], m0 + movq [rsp+8*2], m7 + jmp wq +.w4: + test angled, 0x400 + jnz .w4_main + lea r3d, [hq+2] + add angled, 1022 + pshuflw m1, m5, q3333 + shl r3d, 6 + movq [rsp+16*14+8], m1 + test r3d, angled + jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) + call .upsample_above + sub angled, 1075 ; angle - 53 + lea r3d, [hq+3] + xor angled, 0x7f ; 180 - angle + movd m2, r3d + movd m7, angled + shr angled, 8 ; is_sm << 1 + pshufb m2, m3 + pshufb m7, m3 + pcmpeqb m2, [base+z_filt_wh4] + pand m7, m2 + pcmpgtb m7, [base+z_filt_t_w48+angleq*8] + jmp .w8_filter_left +.upsample_above: ; w4/w8 + paddw m2, m5, [tlq] + movu m1, [rsp+gprsize+16*14+2] + movu m4, [rsp+gprsize+16*14-4] +%if ARCH_X86_64 + movd m6, r9m ; bdmax, offset due to call +%else + movd m6, [rsp+gprsize+16*2+4*2] +%endif + paddw m4, m1 + psubw m1, m2, m4 + pshufb m6, m0 + psraw m1, 3 + paddw m2, m1 + add dxd, dxd + pmaxsw m2, m3 + paddw m7, m7 + pavgw m2, m3 + pminsw m2, m6 +%if ARCH_X86_64 + mova m9, [base+z2_top_shufB] + lea r10d, [dxq+(113<<6)] + mov r11d, (112-7)<<6 +%else + mova m1, [base+z2_top_shufB] + lea r3d, [dxq+(113<<6)] + mov dword [rsp+gprsize+16*26+4*2], (112-7)<<6 + mov [rsp+gprsize+16*26+4*1], r3d + mova [rsp+gprsize+16*24], m1 +%endif + punpcklwd m1, m2, m5 + punpckhwd m2, m5 + movq [rsp+gprsize+8*2], m7 + mova [rsp+gprsize+16*14], m1 + mova [rsp+gprsize+16*15], m2 + ret +.w4_no_upsample_above: + lea r3d, [hq+3] + mov [rsp+16*4], angled + sub angled, 1112 ; angle - 90 + movd m2, r3d + mov r3d, 90 + movd m1, angled + sub r3d, angled ; 180 - angle + shr angled, 8 ; is_sm << 1 + mova m4, [base+z_filt_wh4] + movd m7, r3d + mova m5, [base+z_filt_t_w48+angleq*8] + mov r3d, 4 + call .w8_filter_top + mov angled, [rsp+16*4] + lea r3d, [hq+2] + sub angled, 139 + shl r3d, 6 + test r3d, angled + jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8) +.upsample_left: ; w4/w8 + mova m2, 
[tlq-16] + lea r3d, [hq-4] + movu m3, [tlq-14] + movu m4, [rsp+16*12+4] + pshufb m1, m2, [base+z2_upsample_l+r3*4] + movd m6, bdmaxm + pxor m5, m5 + paddw m3, m2 + paddw m4, m1 + psubw m1, m3, m4 + movshdup m4, [base+z2_dy_offset] + psraw m1, 3 + pshufb m6, m0 + paddw m3, m1 + pmaxsw m3, m5 + pavgw m3, m5 + pminsw m3, m6 +%if ARCH_X86_64 + mova m10, [base+z2_left_shufB] + add dyd, dyd +%else + mova m1, [base+z2_left_shufB] + shl dword [rsp+8*3], 1 + mova [rsp+16*25], m1 +%endif + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + movq [rsp+8*0], m4 + mova [rsp+16*12], m1 + mova [rsp+16*11], m2 +.w4_main: + movd m6, dxd +%if ARCH_X86_64 + movd m3, dyd +%else + movd m3, [rsp+8*3] +%endif + pshufb m6, m0 + movddup m0, [rsp+8*2] + paddw m7, m6, m6 + movq m5, [base+pw_m1to4] + pshuflw m4, m3, q0000 + punpcklqdq m6, m7 + pmullw m4, m5 + pshuflw m3, m3, q1111 + paddw m6, m0 + mov r2d, r10d + pshuflw m0, m4, q3333 + psubw m4, [rsp+8*0] + movq [rsp+8*3], m3 + movq [rsp+8*5], m0 ; dy*4 + mov r5, dstq +.w4_loop0: + mova [rsp+16*4], m6 + movq [rsp+8*4], m4 +%if ARCH_X86_64 + pand m0, m8, m4 +%else + movq m0, [base+pw_62] + pand m0, m4 +%endif + psraw m4, 6 + psllw m0, 9 ; frac_y << 9 + movq [rsp+8*7], m0 + pabsw m4, m4 + movq [rsp+8*6], m4 + movzx hd, r9b +.w4_loop: + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x0 + movu m2, [rsp+r2*2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x1 + movu m1, [rsp+r3*2] + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x2 + movu m3, [rsp+r2*2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x3 + movu m4, [rsp+r3*2] +%if ARCH_X86_64 + REPX {pshufb x, m9}, m2, m1, m3, m4 +%else + mova m0, [rsp+16*24] + REPX {pshufb x, m0}, m2, m1, m3, m4 +%endif + punpcklqdq m0, m2, m1 + punpckhqdq m2, m1 + punpcklqdq m1, m3, m4 + punpckhqdq m3, m4 +%if ARCH_X86_64 + pand m5, m8, m6 +%else + movddup m5, [base+pw_62] + pand m5, m6 +%endif + psllw m5, 9 + psubw m2, m0 + pmulhrsw m2, m5 + paddw m5, m6, m7 + psubw m3, m1 + paddw m0, m2 +%if ARCH_X86_64 + pand m2, m8, m5 +%else + movddup m2, 
[base+pw_62] + pand m2, m5 +%endif + psllw m2, 9 + pmulhrsw m3, m2 + paddw m1, m3 + cmp r3d, 111 ; topleft + jge .w4_toponly + mova [rsp+16*22], m0 + mova [rsp+16*23], m1 + movzx r3d, byte [rsp+8*6+0] ; base_y0 + movu m3, [rsp+r3*2] + movzx r3d, byte [rsp+8*6+2] ; base_y1 + movu m2, [rsp+r3*2] + movzx r3d, byte [rsp+8*6+4] ; base_y2 + movu m4, [rsp+r3*2] + movzx r3d, byte [rsp+8*6+6] ; base_y3 + movu m0, [rsp+r3*2] +%if ARCH_X86_64 + REPX {pshufb x, m10}, m3, m2, m4, m0 +%else + mova m1, [rsp+16*25] + REPX {pshufb x, m1}, m3, m2, m4, m0 +%endif + punpcklwd m1, m3, m2 + punpckhwd m3, m2 ; 01 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 ; 23 + punpckldq m0, m1, m2 ; y0 d1 + punpckhdq m1, m2 ; y2 y3 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + movddup m4, [rsp+8*7] + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + psraw m6, 15 ; base_x < topleft + psraw m4, m5, 15 + paddw m0, m2 + paddw m1, m3 + pand m0, m6 + pandn m6, [rsp+16*22] + pand m1, m4 + pandn m4, [rsp+16*23] + por m0, m6 + por m1, m4 +.w4_toponly: + movifnidn strideq, stridemp + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + sub hd, 4 + jz .w4_end + movq m4, [rsp+8*6] + paddsw m6, m5, m7 ; xpos += dx + movq m5, [rsp+8*3] + psubw m4, m5 + lea dstq, [dstq+strideq*2] + movq [rsp+8*6], m4 + cmp r2d, r11d + jge .w4_loop +.w4_leftonly_loop: + movzx r2d, byte [rsp+8*6+0] ; base_y0 + movu m3, [rsp+r2*2] + movzx r2d, byte [rsp+8*6+2] ; base_y1 + movu m2, [rsp+r2*2] + movzx r2d, byte [rsp+8*6+4] ; base_y2 + movu m6, [rsp+r2*2] + movzx r2d, byte [rsp+8*6+6] ; base_y3 + movu m0, [rsp+r2*2] + psubw m4, m5 +%if ARCH_X86_64 + REPX {pshufb x, m10}, m3, m2, m6, m0 +%else + mova m1, [rsp+16*25] + REPX {pshufb x, m1}, m3, m2, m6, m0 +%endif + movq [rsp+8*6], m4 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m6, m0 + punpckhwd m6, m0 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m6 + 
punpckhdq m3, m6 + movddup m6, [rsp+8*7] + psubw m2, m0 + psubw m3, m1 + pmulhrsw m2, m6 + pmulhrsw m3, m6 + paddw m0, m2 + paddw m1, m3 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 4 + jg .w4_leftonly_loop +.w4_end: + sub r9d, 1<<8 + jl .w4_ret + movq m4, [rsp+8*5] + add r5, 8 + mov dstq, r5 + paddw m4, [rsp+8*4] ; base_y += 4*dy + movzx r2d, word [rsp+8*1] + movddup m6, [rsp+8*1] + paddw m6, [rsp+16*4] ; base_x += (4 << upsample_above) + add r2d, r10d + mov r10d, r2d + jmp .w4_loop0 +.w4_ret: + RET +.w8: + test angled, 0x400 + jnz .w4_main + lea r3d, [angleq+126] + pshufhw m1, m5, q3333 +%if ARCH_X86_64 + mov r3b, hb +%else + xor r3b, r3b + or r3d, hd +%endif + movhps [rsp+16*15], m1 + cmp r3d, 8 + ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm + call .upsample_above + sub angled, 53 + lea r3d, [hq+7] + xor angled, 0x7f ; 180 - angle + movu m1, [base+z_filt_wh8] + movd m2, r3d + movd m7, angled + shr angled, 8 ; is_sm << 1 + psrldq m4, [base+z_filt_t_w48+angleq*8], 4 + pshufb m2, m3 + pshufb m7, m3 + pcmpeqb m2, m1 + movq m1, [base+pw_512] + pand m7, m2 + pcmpgtb m7, m4 + movq [rsp+8*1], m1 ; 8<<6 + jmp .w8_filter_left +.w8_no_upsample_above: + lea r3d, [hq+7] + mov [rsp+16*4], angled + sub angled, 90 + movd m2, r3d + mov r3d, 90 + movd m1, angled + sub r3d, angled ; 180 - angle + shr angled, 8 ; is_sm << 1 + movu m4, [base+z_filt_wh8] + movd m7, r3d + psrldq m5, [base+z_filt_t_w48+angleq*8], 4 + mov r3d, 8 + call .w8_filter_top + mov r3d, [rsp+16*4] + sub r3d, 141 +%if ARCH_X86_64 + mov r3b, hb +%else + xor r3b, r3b + or r3d, hd +%endif + cmp r3d, 8 + jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm +.w8_filter_left: + pmovmskb r5d, m7 + test r5d, r5d + jz .w4_main + imul r5d, 0x55555555 + neg hq + mov r3, tlq + movd m1, [tlq+hq*2] + shr r5d, 30 ; filter_strength + lea tlq, [rsp+16*13-2] + pshuflw m1, m1, q0000 
+ movq [tlq+hq*2-6], m1 + call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge + jmp .filter_left_end +.w8_filter_top: + REPX {pshufb x, m3}, m2, m1, m7 + pcmpeqb m2, m4 + pand m1, m2 + pand m7, m2 + pcmpgtb m1, m5 + pcmpgtb m7, m5 + pmovmskb r5d, m1 + test r5d, r5d + jz .w8_filter_top_end ; filter_strength == 0 + imul r5d, 0x55555555 + mov [dstq], tlq + lea tlq, [rsp+16*14+gprsize] + shr r5d, 30 ; filter_strength + call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge +%if ARCH_X86_64 + mov r3d, r7m ; maxw, offset due to call +%else + mov r3d, [rsp+16*2+4*1] +%endif + mov tlq, [dstq] + cmp r3d, 8 + jge .w8_filter_top_end + movu m1, [tlq+r3*2+16*0+2] + movu m2, [tlq+r3*2+16*1+2] + movu [rsp+r3*2+16*14+gprsize], m1 + movu [rsp+r3*2+16*15+gprsize], m2 +.w8_filter_top_end: + ret +.w16: + test angled, 0x400 + jnz .w4_main + lea r3d, [hq+15] + sub angled, 90 + movd m2, r3d + mov r3d, 90 + movd m1, angled + sub r3d, angled ; 180 - angle + shr angled, 8 ; is_sm << 1 + movd m7, r3d + REPX {pshufb x, m3}, m2, m1, m7 + movq m4, [base+z_filt_t_w16+angleq*4] + pcmpeqb m2, [base+z_filt_wh16] + pand m1, m2 + pand m7, m2 + pcmpgtb m1, m4 + pcmpgtb m7, m4 + pmovmskb r5d, m1 + test r5d, r5d + jz .w16_filter_left ; filter_strength == 0 + imul r5d, 0x24924924 + pshufhw m6, m6, q3333 + mov [dstq], tlq + lea tlq, [rsp+16*14] + shr r5d, 30 + movhps [tlq+16*2], m6 + adc r5d, -1 ; filter_strength + mov r3d, 16 + call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge + mov r3d, maxwm + mov tlq, [dstq] + cmp r3d, 16 + jge .w16_filter_left + movu m1, [tlq+r3*2+16*0+2] + movu m2, [tlq+r3*2+16*1+2] + movu [rsp+r3*2+16*14], m1 + movu [rsp+r3*2+16*15], m2 +.w16_filter_left: + pmovmskb r5d, m7 + test r5d, r5d + jz .w4_main + imul r5d, 0x24924924 + neg hq + mov r3, tlq + movd m1, [tlq+hq*2] + shr r5d, 30 + lea tlq, [rsp+16*13-2] + pshuflw m1, m1, q0000 + adc r5d, -1 ; filter_strength + movq [tlq+hq*2-6], m1 + call mangle(private_prefix %+ 
_ipred_z3_16bpc_ssse3).filter_edge + jmp .filter_left_end +.w32: + movu m1, [tlq+16*2+2] + movu m2, [tlq+16*3+2] + mova [rsp+16*16], m1 + mova [rsp+16*17], m2 + test angled, 0x400 + jnz .w4_main + mov [dstq], tlq + lea tlq, [rsp+16*14] + pshufhw m2, m2, q3333 + mov r3d, 32 + movhps [tlq+16*4], m2 + call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3 + mov r3d, maxwm + mov tlq, [dstq] + cmp r3d, 32 + jge .filter_left + movu m1, [tlq+r3*2+16*0+2] + movu m2, [tlq+r3*2+16*1+2] + movu [rsp+r3*2+16*14], m1 + movu [rsp+r3*2+16*15], m2 + cmp r3d, 16 + jge .filter_left + movu m1, [tlq+r3*2+16*2+2] + movu m2, [tlq+r3*2+16*3+2] + movu [rsp+r3*2+16*16], m1 + movu [rsp+r3*2+16*17], m2 +.filter_left: + neg hq + mov r3, tlq + pshuflw m1, [tlq+hq*2], q0000 + lea tlq, [rsp+16*13-2] + movq [tlq+hq*2-6], m1 + call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge_s3 +.filter_left_end: + mov r2d, maxhm + cmp r2d, hd + jge .w4_main + neg r2 + movu m1, [r3+r2*2-16*1] + movu m2, [r3+r2*2-16*2] + movu [rsp+r2*2+16*12], m1 + movu [rsp+r2*2+16*11], m2 + cmp r2d, -48 + jle .w4_main + movu m1, [r3+r2*2-16*3] + movu m2, [r3+r2*2-16*4] + movu [rsp+r2*2+16*10], m1 + movu [rsp+r2*2+16* 9], m2 + cmp r2d, -32 + jle .w4_main + movu m1, [r3+r2*2-16*5] + movu m2, [r3+r2*2-16*6] + movu [rsp+r2*2+16* 8], m1 + movu [rsp+r2*2+16* 7], m2 + cmp r2d, -16 + jle .w4_main + movu m1, [r3+r2*2-16*7] + movu m2, [r3+r2*2-16*8] + movu [rsp+r2*2+16* 6], m1 + movu [rsp+r2*2+16* 5], m2 + jmp .w4_main +.w64: + movu m1, [tlq+16*2+2] + movu m2, [tlq+16*3+2] + movu m3, [tlq+16*4+2] + movu m4, [tlq+16*5+2] + movu m5, [tlq+16*6+2] + movu m6, [tlq+16*7+2] + mov [dstq], tlq + lea tlq, [rsp+16*14] + mova [tlq+16*2], m1 + mova [tlq+16*3], m2 + mova [tlq+16*4], m3 + mova [tlq+16*5], m4 + mova [tlq+16*6], m5 + mova [tlq+16*7], m6 + test angled, 0x400 + jnz .w4_main + pshufhw m6, m6, q3333 + mov r3d, 64 + movhps [tlq+16*8], m6 + call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3 + mov r3d, 
maxwm + mov tlq, [dstq] + cmp r3d, 64 + jge .filter_left + movu m1, [tlq+r3*2+16*0+2] + movu m2, [tlq+r3*2+16*1+2] + movu [rsp+r3*2+16*14], m1 + movu [rsp+r3*2+16*15], m2 + cmp r3d, 48 + jge .filter_left + movu m1, [tlq+r3*2+16*2+2] + movu m2, [tlq+r3*2+16*3+2] + movu [rsp+r3*2+16*16], m1 + movu [rsp+r3*2+16*17], m2 + cmp r3d, 32 + jge .filter_left + movu m1, [tlq+r3*2+16*4+2] + movu m2, [tlq+r3*2+16*5+2] + movu [rsp+r3*2+16*18], m1 + movu [rsp+r3*2+16*19], m2 + cmp r3d, 16 + jge .filter_left + movu m1, [tlq+r3*2+16*6+2] + movu m2, [tlq+r3*2+16*7+2] + movu [rsp+r3*2+16*20], m1 + movu [rsp+r3*2+16*21], m2 + jmp .filter_left + +%if ARCH_X86_64 +cglobal ipred_z3_16bpc, 4, 9, 8, 16*18, dst, stride, tl, w, h, angle, dy, _, org_w + %define base r7-$$ + lea r7, [$$] + mov org_wd, wd +%else +cglobal ipred_z3_16bpc, 4, 7, 8, -16*18, dst, stride, tl, w, h, angle, dy + %define base r1-$$ + %define org_wd r5 + %define org_wq r5 + movd m6, r8m ; pixel_max + mov [dstq+4*0], strideq + LEA r1, $$ + mov [dstq+4*1], wd +%endif + tzcnt hd, hm + movifnidn angled, anglem + sub tlq, 2 + movsxd hq, [base+ipred_z3_16bpc_ssse3_table+hq*4] + sub angled, 180 + movddup m0, [base+pw_256] + mov dyd, angled + neg dyd + xor angled, 0x400 + movddup m7, [base+pw_62] + or dyq, ~0x7e + lea hq, [base+ipred_z3_16bpc_ssse3_table+hq] + movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq] + jmp hq +.h4: + lea r4d, [angleq+88] + test r4d, 0x480 + jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40 + sar r4d, 9 + add r4d, wd + cmp r4d, 8 + jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm) + mova m2, [tlq-14] ; 7 6 5 4 3 2 1 0 + movu m3, [tlq-12] ; 8 7 6 5 4 3 2 1 +%if ARCH_X86_64 + movd m6, r8m +%endif + pshufb m4, m2, m0 + mov tlq, rsp + palignr m1, m2, m4, 14 ; 8 8 7 6 5 4 3 2 + add dyd, dyd + palignr m5, m2, m4, 12 ; 8 8 8 7 6 5 4 3 + paddw m1, m2 + paddw m3, m5 + psubw m5, m1, m3 + mova m3, [base+z_upsample] + mova [tlq+ 0], m4 + movd m4, dyd + psraw m5, 3 + neg dyd + paddw m1, m5 + pxor m5, 
m5 + lea r5d, [dyq+(16<<6)+63] ; ypos + pmaxsw m1, m5 + pshufb m6, m0 + shl wd, 3 + pavgw m1, m5 + pshufb m4, m0 + pminsw m1, m6 + sub rsp, wq + punpckhwd m0, m1, m2 + paddw m5, m4, m4 + punpcklwd m1, m2 + mova [tlq+32], m0 + movsd m4, m5 + mova [tlq+16], m1 +.h4_upsample_loop: + lea r4d, [r5+dyq] + sar r5d, 6 + movu m2, [tlq+r5*2] + lea r5d, [r4+dyq] + sar r4d, 6 + movu m1, [tlq+r4*2] + pshufb m2, m3 + pshufb m1, m3 + punpckhqdq m0, m1, m2 + punpcklqdq m1, m2 + pand m2, m7, m4 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m4, m5 + paddw m0, m1 + mova [rsp+wq-16], m0 + sub wd, 16 + jg .h4_upsample_loop + or r3d, 4*2 + jmp .end_transpose +.h4_no_upsample: + mov r4d, 7 + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h4_main + lea r4d, [wq+3] + movd m1, r4d + movd m3, angled + shr angled, 8 ; is_sm << 1 + pxor m2, m2 + pshufb m1, m2 + pshufb m3, m2 + pcmpeqb m1, [base+z_filt_wh4] + pand m1, m3 + pcmpgtb m1, [base+z_filt_t_w48+angleq*8] + pmovmskb r5d, m1 + mov r4d, 7 + test r5d, r5d + jz .h4_main ; filter_strength == 0 + pshuflw m1, [tlq+2], q0000 + imul r5d, 0x55555555 + mova m2, [tlq-14] + neg r4 + movd m3, [tlq+r4*2] + shr r5d, 30 + movd [rsp+16*17], m1 + pshuflw m3, m3, q0000 + mova [rsp+16*16], m2 + lea r2, [r4-2] + movq [rsp+16*17+r4*2-10], m3 + cmp wd, 8 + cmovae r4, r2 + lea tlq, [rsp+16*17-2] + call .filter_edge +.h4_main: + movd m4, dyd + sub tlq, r4 + movddup m1, [base+z_base_inc_z2+8] ; base_inc << 6 + sub tlq, r4 + shl r4d, 6 + movd m6, [tlq] + movd m3, r4d + pshufb m4, m0 + neg dyq + pshufb m6, m0 + lea r5, [dyq+r4+63] ; ypos + pshufb m3, m0 + shl wd, 3 + paddw m5, m4, m4 + sub rsp, wq + psubw m3, m1 ; max_base_y + movsd m4, m5 ; ypos1 ypos0 +.h4_loop: + lea r4, [r5+dyq] + sar r5, 6 + movddup m0, [tlq+r5*2-6] + movddup m1, [tlq+r5*2-8] + lea r5, [r4+dyq] + sar r4, 6 + movlps m0, [tlq+r4*2-6] + movlps m1, [tlq+r4*2-8] + pand m2, m7, m4 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + pcmpgtw m2, m3, m4 + paddw m4, m5 + paddw m0, m1 + pand 
m0, m2 + pandn m2, m6 + por m0, m2 + mova [rsp+wq-16], m0 + sub wd, 16 + jz .h4_transpose + test r5d, r5d + jg .h4_loop +.h4_end_loop: + mova [rsp+wq-16], m6 + sub wd, 16 + jg .h4_end_loop +.h4_transpose: + or r3d, 4*2 + jmp .end_transpose +.h8: + lea r4d, [angleq+88] + and r4d, ~0x7f + or r4d, wd + cmp r4d, 8 + ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 + mova m2, [tlq-30] ; g f e d c b a 9 + movu m1, [tlq-32] ; _ g f e d c b a + movu m3, [tlq-16] ; 9 8 7 6 5 4 3 2 + paddw m3, [tlq-14] ; 8 7 6 5 4 3 2 1 + pshufd m4, m2, q2100 ; _ _ g f e d c b + paddw m1, m2 + movu m5, [tlq-28] ; f e d c b a 9 8 + add dyd, dyd + cmp wd, 8 + je .h8_upsample_w8 + pshufhw m4, m2, q1000 ; _ _ _ _ c c c b +.h8_upsample_w8: + paddw m4, m5 + psubw m5, m1, m4 + movu m4, [tlq-18] ; a 9 8 7 6 5 4 3 + psraw m5, 3 + paddw m1, m5 + movu m5, [tlq-12] ; 7 6 5 4 3 2 1 0 +%if ARCH_X86_64 + movd m6, r8m ; pixel_max +%endif + paddw m4, m5 + shl wd, 4 + psubw m5, m3, m4 + movd m4, dyd + psraw m5, 3 + neg dyd + paddw m3, m5 + pshufb m6, m0 + mova m5, [tlq-14] + pshufb m4, m0 + pxor m0, m0 + pmaxsw m1, m0 + pmaxsw m3, m0 + mov tlq, rsp + pavgw m1, m0 + pavgw m3, m0 + sub rsp, wq + pminsw m1, m6 + pminsw m6, m3 + mova m3, [base+z_upsample] + lea r5d, [dyq+(16<<6)+63] ; ypos + punpcklwd m0, m1, m2 + mova [tlq+16*0], m0 + punpckhwd m1, m2 + mova [tlq+16*1], m1 + punpcklwd m0, m6, m5 + mova [tlq+16*2], m0 + punpckhwd m6, m5 + mova [tlq+16*3], m6 + mova m5, m4 +.h8_upsample_loop: + mov r4d, r5d + sar r4d, 6 + movu m1, [tlq+r4*2+16*0] + movu m2, [tlq+r4*2+16*1] + add r5d, dyd + pshufb m2, m3 + pshufb m1, m3 + punpckhqdq m0, m1, m2 + punpcklqdq m1, m2 + pand m2, m7, m4 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + paddw m4, m5 + paddw m0, m1 + mova [rsp+wq-16], m0 + sub wd, 16 + jg .h8_upsample_loop + or r3d, 8*2 + jmp .end_transpose +.h8_no_upsample: + lea r4d, [wq+7] + movd m1, r4d + and r4d, 7 + or r4d, 8 ; imin(w+7, 15) + test angled, 0x400 + jnz .h8_main + movd m3, 
angled + shr angled, 8 ; is_sm << 1 + pxor m2, m2 + pshufb m1, m2 + pshufb m3, m2 + movu m2, [base+z_filt_wh8] + psrldq m4, [base+z_filt_t_w48+angleq*8], 4 + pcmpeqb m2, m1 + pand m2, m3 + pcmpgtb m2, m4 + pmovmskb r5d, m2 + test r5d, r5d + jz .h8_main ; filter_strength == 0 + pshuflw m1, [tlq+2], q0000 + imul r5d, 0x55555555 + mova m2, [tlq-16*1+2] + neg r4 + mova m3, [tlq-16*2+2] + shr r5d, 30 + movd m4, [tlq+r4*2] + movd [rsp+16*17], m1 + mova [rsp+16*16], m2 + pshuflw m4, m4, q0000 + mova [rsp+16*15], m3 + lea r2, [r4-2] + movq [rsp+16*17+r4*2-10], m4 + cmp wd, 16 + cmovae r4, r2 + lea tlq, [rsp+16*17-2] + call .filter_edge +.h8_main: + sub tlq, r4 + movd m4, dyd + sub tlq, r4 + shl r4d, 6 + movd m6, [tlq] + movd m3, r4d + pshufb m4, m0 + neg dyq + pshufb m6, m0 + lea r5, [dyq+r4+63] + pshufb m3, m0 + shl wd, 4 + mova m5, m4 + sub rsp, wq + psubw m3, [base+z_base_inc_z2] +.h8_loop: + mov r4, r5 + sar r4, 6 + movu m0, [tlq+r4*2-14] + movu m1, [tlq+r4*2-16] + pand m2, m7, m4 + psllw m2, 9 + psubw m1, m0 + pmulhrsw m1, m2 + pcmpgtw m2, m3, m4 + paddw m4, m5 + paddw m0, m1 + pand m0, m2 + pandn m2, m6 + por m0, m2 + mova [rsp+wq-16], m0 + sub wd, 8*2 + jz .h8_transpose + add r5, dyq + jg .h8_loop +.h8_end_loop: + mova [rsp+wq-16], m6 + sub wd, 8*2 + jg .h8_end_loop +.h8_transpose: + or r3d, 8*2 + jmp .end_transpose +.h16: + lea r4d, [wq+15] + movd m1, r4d + and r4d, 15 + or r4d, 16 ; imin(w+15, 31) + test angled, 0x400 + jnz .h16_main + movd m3, angled + shr angled, 8 ; is_sm << 1 + pxor m2, m2 + pshufb m1, m2 + pshufb m3, m2 + movq m4, [base+z_filt_t_w16+angleq*4] + pcmpeqb m1, [base+z_filt_wh16] + pand m1, m3 + pcmpgtb m1, m4 + pmovmskb r5d, m1 + test r5d, r5d + jz .h16_main ; filter_strength == 0 + pshuflw m1, [tlq+2], q0000 + mova m2, [tlq-16*1+2] + imul r5d, 0x24924924 + mova m3, [tlq-16*2+2] + neg r4 + mova m4, [tlq-16*3+2] + shr r5d, 30 + mova m5, [tlq-16*4+2] + movd m6, [tlq+r4*2] + adc r5d, -1 ; filter_strength + movd [rsp+16*17], m1 + mova [rsp+16*16], m2 
+ mova [rsp+16*15], m3 + pshuflw m6, m6, q0000 + mova [rsp+16*14], m4 + mova [rsp+16*13], m5 + lea r2, [r4-2] + movq [rsp+16*17+r4*2-10], m6 + cmp wd, 32 + cmovae r4, r2 + lea tlq, [rsp+16*17-2] + call .filter_edge +.h16_main: + sub tlq, r4 + movd m5, dyd + sub tlq, r4 + shl r4d, 6 + movd m6, [tlq] + movd m3, r4d + pshufb m5, m0 + neg dyq + pshufb m6, m0 + lea r5, [dyq+r4+63] + pshufb m3, m0 + shl wd, 5 + paddw m4, m5, [base+z_base_inc_z2] + sub rsp, wq + psubw m4, m3 +.h16_loop: + mov r4, r5 + sar r4, 6 + movu m0, [tlq+r4*2-14] + movu m2, [tlq+r4*2-16] + pand m3, m7, m4 + psllw m3, 9 + psubw m2, m0 + pmulhrsw m2, m3 + movu m1, [tlq+r4*2-30] + paddw m0, m2 + movu m2, [tlq+r4*2-32] + psubw m2, m1 + pmulhrsw m2, m3 + movddup m3, [base+pw_m512] + paddw m1, m2 + psraw m2, m4, 15 + pcmpgtw m3, m4 + paddw m4, m5 + pand m0, m2 + pandn m2, m6 + pand m1, m3 + pandn m3, m6 + por m0, m2 + mova [rsp+wq-16*1], m0 + por m1, m3 + mova [rsp+wq-16*2], m1 + sub wd, 16*2 + jz .h16_transpose + add r5, dyq + jg .h16_loop +.h16_end_loop: + mova [rsp+wq-16*1], m6 + mova [rsp+wq-16*2], m6 + sub wd, 16*2 + jg .h16_end_loop +.h16_transpose: + or r3d, 16*2 + jmp .end_transpose +.h32: + lea r4d, [wq+31] + and r4d, 31 + or r4d, 32 ; imin(w+31, 63) + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h32_main + call .filter_copy + lea r5, [r4-2] + cmp wd, 64 + cmove r4, r5 + call .filter_edge_s3 +.h32_main: + sub tlq, r4 + movd m5, dyd + sub tlq, r4 + shl r4d, 6 + movd m6, [tlq] + movd m3, r4d + pshufb m5, m0 + neg dyq + pshufb m6, m0 + lea r5, [dyq+r4+63] + pshufb m3, m0 + paddw m4, m5, [base+z_base_inc_z2] + psubw m4, m3 +.h32_loop: + mov r4, r5 + sar r4, 6 + movu m0, [tlq+r4*2-14] + movu m3, [tlq+r4*2-16] + pand m2, m7, m4 + psllw m2, 9 + psubw m3, m0 + pmulhrsw m3, m2 + movu m1, [tlq+r4*2-30] + paddw m0, m3 + movu m3, [tlq+r4*2-32] + psubw m3, m1 + pmulhrsw m3, m2 + sub rsp, 16*4 + paddw m1, m3 + psraw m3, m4, 15 + pand m0, m3 + pandn m3, m6 + por m0, m3 + movddup m3, [base+pw_m512] + 
pcmpgtw m3, m4 + pand m1, m3 + pandn m3, m6 + mova [rsp+16*3], m0 + por m1, m3 + mova [rsp+16*2], m1 + movu m0, [tlq+r4*2-46] + movu m3, [tlq+r4*2-48] + psubw m3, m0 + pmulhrsw m3, m2 + movu m1, [tlq+r4*2-62] + paddw m0, m3 + movu m3, [tlq+r4*2-64] + psubw m3, m1 + pmulhrsw m3, m2 + movddup m2, [base+pw_m1024] + paddw m1, m3 + movddup m3, [base+pw_m1536] + pcmpgtw m2, m4 + pcmpgtw m3, m4 + paddw m4, m5 + pand m0, m2 + pandn m2, m6 + pand m1, m3 + pandn m3, m6 + por m0, m2 + mova [rsp+16*1], m0 + por m1, m3 + mova [rsp+16*0], m1 + dec wd + jz .h32_transpose + add r5, dyq + jg .h32_loop +.h32_end_loop: + sub rsp, 16*4 + REPX {mova [rsp+16*x], m6}, 3, 2, 1, 0 + dec wd + jg .h32_end_loop +.h32_transpose: + or r3d, 32*2 + jmp .end_transpose +.h64: + lea r4d, [wq+63] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h64_main + call .filter_copy + call .filter_edge_s3 +.h64_main: + sub tlq, r4 + movd m5, dyd + sub tlq, r4 + shl r4d, 6 + movd m6, [tlq] + movd m3, r4d + pshufb m5, m0 + neg dyq + pshufb m6, m0 + lea r5, [dyq+r4+63] + pshufb m3, m0 + paddw m4, m5, [base+z_base_inc_z2] + psubw m4, m3 +.h64_loop: + mov r4, r5 + sar r4, 6 + movu m0, [tlq+r4*2- 14] + movu m3, [tlq+r4*2- 16] + pand m2, m7, m4 + psllw m2, 9 + psubw m3, m0 + pmulhrsw m3, m2 + movu m1, [tlq+r4*2- 30] + paddw m0, m3 + movu m3, [tlq+r4*2- 32] + psubw m3, m1 + pmulhrsw m3, m2 + sub rsp, 16*8 + paddw m1, m3 + psraw m3, m4, 15 + pand m0, m3 + pandn m3, m6 + por m0, m3 + movddup m3, [base+pw_m512] + pcmpgtw m3, m4 + pand m1, m3 + pandn m3, m6 + mova [rsp+16*7], m0 + por m1, m3 + mova [rsp+16*6], m1 + movu m0, [tlq+r4*2- 46] + movu m3, [tlq+r4*2- 48] + psubw m3, m0 + pmulhrsw m3, m2 + movu m1, [tlq+r4*2- 62] + paddw m0, m3 + movu m3, [tlq+r4*2- 64] + psubw m3, m1 + pmulhrsw m3, m2 + paddw m1, m3 + movddup m3, [base+pw_m1024] + pcmpgtw m3, m4 + pand m0, m3 + pandn m3, m6 + por m0, m3 + movddup m3, [base+pw_m1536] + pcmpgtw m3, m4 + pand m1, m3 + pandn m3, m6 + mova [rsp+16*5], m0 + por m1, m3 + mova 
[rsp+16*4], m1 + movu m0, [tlq+r4*2- 78] + movu m3, [tlq+r4*2- 80] + psubw m3, m0 + pmulhrsw m3, m2 + movu m1, [tlq+r4*2- 94] + paddw m0, m3 + movu m3, [tlq+r4*2- 96] + psubw m3, m1 + pmulhrsw m3, m2 + paddw m1, m3 + movddup m3, [base+pw_m2048] + pcmpgtw m3, m4 + pand m0, m3 + pandn m3, m6 + por m0, m3 + movddup m3, [base+pw_m2560] + pcmpgtw m3, m4 + pand m1, m3 + pandn m3, m6 + mova [rsp+16*3], m0 + por m1, m3 + mova [rsp+16*2], m1 + movu m0, [tlq+r4*2-110] + movu m3, [tlq+r4*2-112] + psubw m3, m0 + pmulhrsw m3, m2 + movu m1, [tlq+r4*2-126] + paddw m0, m3 + movu m3, [tlq+r4*2-128] + psubw m3, m1 + pmulhrsw m3, m2 + movddup m2, [base+pw_m3072] + paddw m1, m3 + movddup m3, [base+pw_m3584] + pcmpgtw m2, m4 + pcmpgtw m3, m4 + paddw m4, m5 + pand m0, m2 + pandn m2, m6 + pand m1, m3 + pandn m3, m6 + por m0, m2 + mova [rsp+16*1], m0 + por m1, m3 + mova [rsp+16*0], m1 + dec wd + jz .h64_transpose + add r5, dyq + jg .h64_loop +.h64_end_loop: + sub rsp, 16*8 + REPX {mova [rsp+16*x], m6}, 7, 6, 5, 4, 3, 2, 1, 0 + dec wd + jg .h64_end_loop +.h64_transpose: + add r3d, 64*2 +.end_transpose: +%if ARCH_X86_64 + lea r7, [strideq*3] +%else + mov strideq, [dstq+4*0] + mov org_wd, [dstq+4*1] +%endif + lea r4d, [r3*3] +.end_transpose_loop: + lea r2, [rsp+r3-8] + lea r6, [dstq+org_wq*2-8] +.end_transpose_loop_y: + movq m0, [r2+r4 ] + movq m1, [r2+r3*2] + movq m2, [r2+r3*1] + movq m3, [r2+r3*0] + sub r2, 8 + punpcklwd m0, m1 + punpcklwd m2, m3 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + movhps [r6+strideq*0], m1 + movq [r6+strideq*1], m1 +%if ARCH_X86_64 + movhps [r6+strideq*2], m0 + movq [r6+r7 ], m0 + lea r6, [r6+strideq*4] +%else + lea r6, [r6+strideq*2] + movhps [r6+strideq*0], m0 + movq [r6+strideq*1], m0 + lea r6, [r6+strideq*2] +%endif + cmp r2, rsp + jae .end_transpose_loop_y + lea rsp, [rsp+r3*4] + sub org_wd, 4 + jg .end_transpose_loop + RET +.filter_copy: + neg r4 + pshuflw m2, [tlq+2], q0000 + xor r5d, r5d + pshuflw m3, [tlq+r4*2], q0000 + movq [rsp+gprsize+16*17], m2 
+.filter_copy_loop: + mova m1, [tlq+r5*2-16*1+2] + mova m2, [tlq+r5*2-16*2+2] + sub r5, 16 + mova [rsp+r5*2+gprsize+16*18], m1 + mova [rsp+r5*2+gprsize+16*17], m2 + cmp r5d, r4d + jg .filter_copy_loop + lea tlq, [rsp+gprsize+16*17-2] + movq [tlq+r4*2-8], m3 + ret +.filter_edge: + cmp r5d, 3 + je .filter_edge_s3 + movddup m4, [base+z_filt_k+r5*8-8] + movddup m5, [base+z_filt_k+r5*8+8] + xor r5d, r5d + movddup m6, [base+pw_8] + movu m2, [tlq-12] + jmp .filter_edge_start +.filter_edge_loop: + movu m2, [tlq+r5*2-12] + mova [tlq+r5*2+2], m1 +.filter_edge_start: + pmullw m1, m4, [tlq+r5*2-14] + movu m3, [tlq+r5*2-16] + sub r5, 8 + paddw m2, m3 + pmullw m2, m5 + paddw m1, m6 + paddw m1, m2 + psrlw m1, 4 + cmp r5d, r4d + jg .filter_edge_loop + mova [tlq+r5*2+2], m1 + neg r4d + ret +.filter_edge_s3: + movddup m5, [base+pw_3] + xor r5d, r5d + movu m2, [tlq-12] + movu m3, [tlq-10] + jmp .filter_edge_s3_start +.filter_edge_s3_loop: + movu m2, [tlq+r5*2-12] + movu m3, [tlq+r5*2-10] + mova [tlq+r5*2+2], m1 +.filter_edge_s3_start: + paddw m2, [tlq+r5*2-14] + paddw m3, m5 + movu m1, [tlq+r5*2-16] + movu m4, [tlq+r5*2-18] + sub r5, 8 + paddw m1, m2 + pavgw m3, m4 + paddw m1, m3 + psrlw m1, 2 + cmp r5d, r4d + jg .filter_edge_s3_loop + mova [tlq+r5*2+2], m1 + neg r4d + ret + %if ARCH_X86_64 cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter %else @@ -1811,25 +3964,27 @@ cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h jg .w32_hpad_loop jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc -cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h +cglobal pal_pred_16bpc, 4, 5, 6, dst, stride, pal, idx, w, h %define base r2-pal_pred_16bpc_ssse3_table %if ARCH_X86_32 %define hd r2d %endif - mova m3, [palq] + mova m4, [palq] LEA r2, pal_pred_16bpc_ssse3_table tzcnt wd, wm - pshufb m3, [base+pal_pred_shuf] + pshufb m4, [base+pal_pred_shuf] movsxd wq, [r2+wq*4] - pshufd m4, m3, q1032 + pshufd m5, m4, q1032 add wq, r2 movifnidn hd, hm 
jmp wq .w4: - mova m0, [idxq] - add idxq, 16 - pshufb m1, m3, m0 - pshufb m2, m4, m0 + movq m0, [idxq] + add idxq, 8 + psrlw m1, m0, 4 + punpcklbw m0, m1 + pshufb m1, m4, m0 + pshufb m2, m5, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 movq [dstq+strideq*0], m0 @@ -1842,77 +3997,102 @@ cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h jg .w4 RET .w8: - mova m0, [idxq] + movu m3, [idxq] add idxq, 16 - pshufb m1, m3, m0 - pshufb m2, m4, m0 + psrlw m1, m3, 4 + punpcklbw m0, m3, m1 + punpckhbw m3, m1 + pshufb m1, m4, m0 + pshufb m2, m5, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] - sub hd, 2 + pshufb m1, m4, m3 + pshufb m2, m5, m3 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 4 jg .w8 RET .w16: - mova m0, [idxq] + movu m3, [idxq] add idxq, 16 - pshufb m1, m3, m0 - pshufb m2, m4, m0 + psrlw m1, m3, 4 + punpcklbw m0, m3, m1 + punpckhbw m3, m1 + pshufb m1, m4, m0 + pshufb m2, m5, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 - mova [dstq+16*0], m0 - mova [dstq+16*1], m1 - add dstq, strideq - dec hd + mova [dstq+ 0], m0 + mova [dstq+16], m1 + pshufb m1, m4, m3 + pshufb m2, m5, m3 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq+ 0], m0 + mova [dstq+strideq+16], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 jg .w16 RET .w32: - mova m0, [idxq+16*0] - pshufb m1, m3, m0 - pshufb m2, m4, m0 + movu m3, [idxq] + add idxq, 16 + psrlw m1, m3, 4 + punpcklbw m0, m3, m1 + punpckhbw m3, m1 + pshufb m1, m4, m0 + pshufb m2, m5, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 - mova m2, [idxq+16*1] - add idxq, 16*2 mova [dstq+16*0], m0 - pshufb m0, m3, m2 mova [dstq+16*1], m1 - pshufb m1, m4, m2 - punpcklbw m2, m0, m1 - punpckhbw m0, m1 - mova [dstq+16*2], m2 - mova [dstq+16*3], m0 + pshufb m1, m4, m3 + pshufb m2, m5, m3 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 add dstq, 
strideq dec hd jg .w32 RET .w64: - mova m0, [idxq+16*0] - pshufb m1, m3, m0 - pshufb m2, m4, m0 + movu m3, [idxq+16*0] + psrlw m1, m3, 4 + punpcklbw m0, m3, m1 + punpckhbw m3, m1 + pshufb m1, m4, m0 + pshufb m2, m5, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 - mova m2, [idxq+16*1] mova [dstq+16*0], m0 - pshufb m0, m3, m2 mova [dstq+16*1], m1 - pshufb m1, m4, m2 - punpcklbw m2, m0, m1 - punpckhbw m0, m1 - mova m1, [idxq+16*2] - mova [dstq+16*2], m2 - pshufb m2, m3, m1 - mova [dstq+16*3], m0 - pshufb m0, m4, m1 - punpcklbw m1, m2, m0 - punpckhbw m2, m0 - mova m0, [idxq+16*3] - add idxq, 16*4 - mova [dstq+16*4], m1 - pshufb m1, m3, m0 - mova [dstq+16*5], m2 - pshufb m2, m4, m0 + pshufb m1, m4, m3 + pshufb m2, m5, m3 + movu m3, [idxq+16*1] + add idxq, 32 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 + psrlw m1, m3, 4 + punpcklbw m0, m3, m1 + punpckhbw m3, m1 + pshufb m1, m4, m0 + pshufb m2, m5, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+16*4], m0 + mova [dstq+16*5], m1 + pshufb m1, m4, m3 + pshufb m2, m5, m3 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+16*6], m0 diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred_avx2.asm b/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred_avx2.asm index dd188a7f3..95802c7d7 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred_avx2.asm +++ b/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred_avx2.asm @@ -5307,18 +5307,20 @@ cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_ RET cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h - vbroadcasti128 m4, [palq] + vpbroadcastq m4, [palq] lea r2, [pal_pred_avx2_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r2+wq*4] - packuswb m4, m4 add wq, r2 lea r2, [strideq*3] jmp wq .w4: - pshufb xm0, xm4, [idxq] - add idxq, 16 + movq xm0, [idxq] + add idxq, 8 + psrlw xm1, xm0, 4 + punpcklbw xm0, xm1 + pshufb xm0, xm4, xm0 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 pextrd [dstq+strideq*2], 
xm0, 2 @@ -5327,11 +5329,14 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h sub hd, 4 jg .w4 RET -ALIGN function_align .w8: - pshufb xm0, xm4, [idxq+16*0] - pshufb xm1, xm4, [idxq+16*1] - add idxq, 16*2 + movu xm2, [idxq] + add idxq, 16 + pshufb xm1, xm4, xm2 + psrlw xm2, 4 + pshufb xm2, xm4, xm2 + punpcklbw xm0, xm1, xm2 + punpckhbw xm1, xm2 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 @@ -5340,47 +5345,48 @@ ALIGN function_align sub hd, 4 jg .w8 RET -ALIGN function_align .w16: - pshufb m0, m4, [idxq+32*0] - pshufb m1, m4, [idxq+32*1] - add idxq, 32*2 + movu m2, [idxq] + add idxq, 32 + pshufb m1, m4, m2 + psrlw m2, 4 + pshufb m2, m4, m2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 mova [dstq+strideq*0], xm0 - vextracti128 [dstq+strideq*1], m0, 1 - mova [dstq+strideq*2], xm1 + mova [dstq+strideq*1], xm1 + vextracti128 [dstq+strideq*2], m0, 1 vextracti128 [dstq+r2 ], m1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16 RET -ALIGN function_align .w32: - pshufb m0, m4, [idxq+32*0] - pshufb m1, m4, [idxq+32*1] - pshufb m2, m4, [idxq+32*2] - pshufb m3, m4, [idxq+32*3] - add idxq, 32*4 + vpermq m2, [idxq], q3120 + add idxq, 32 + pshufb m1, m4, m2 + psrlw m2, 4 + pshufb m2, m4, m2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 - mova [dstq+strideq*2], m2 - mova [dstq+r2 ], m3 - lea dstq, [dstq+strideq*4] - sub hd, 4 + lea dstq, [dstq+strideq*2] + sub hd, 2 jg .w32 RET -ALIGN function_align .w64: - pshufb m0, m4, [idxq+32*0] - pshufb m1, m4, [idxq+32*1] - pshufb m2, m4, [idxq+32*2] - pshufb m3, m4, [idxq+32*3] - add idxq, 32*4 - mova [dstq+strideq*0+32*0], m0 - mova [dstq+strideq*0+32*1], m1 - mova [dstq+strideq*1+32*0], m2 - mova [dstq+strideq*1+32*1], m3 - lea dstq, [dstq+strideq*2] - sub hd, 2 + vpermq m2, [idxq], q3120 + add idxq, 32 + pshufb m1, m4, m2 + psrlw m2, 4 + pshufb m2, m4, m2 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+32*0], m0 + mova [dstq+32*1], m1 + 
add dstq, strideq + dec hd jg .w64 RET diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred_avx512.asm b/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred_avx512.asm index 38c86b54f..4aeb14e74 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred_avx512.asm +++ b/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred_avx512.asm @@ -95,6 +95,8 @@ smooth_endB: db 1, 3, 5, 7, 9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79 db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127 ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4 db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 +pal_unpack: db 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 +pal_perm: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 pb_127_m127: times 2 db 127, -127 pb_128: times 4 db 128 @@ -126,7 +128,6 @@ JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64 -JMP_TABLE pal_pred_8bpc, avx512icl, w4, w8, w16, w32, w64 SECTION .text @@ -1111,19 +1112,20 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3 jg .w64_loop RET -cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3 - lea r6, [pal_pred_8bpc_avx512icl_table] - tzcnt wd, wm - vbroadcasti32x4 m4, [palq] +cglobal pal_pred_8bpc, 4, 7, 6, dst, stride, pal, idx, w, h, stride3 + movifnidn wd, wm movifnidn hd, hm - movsxd wq, [r6+wq*4] - packuswb m4, m4 - add wq, r6 lea stride3q, [strideq*3] - jmp wq + cmp wd, 8 + jg .w32 + movq xmm3, [palq] + je .w8 .w4: - pshufb xmm0, xm4, [idxq] - add idxq, 16 + movq xmm0, [idxq] + add idxq, 8 + psrlw xmm1, xmm0, 4 + punpcklbw xmm0, xmm1 + pshufb xmm0, xmm3, xmm0 movd [dstq+strideq*0], xmm0 pextrd [dstq+strideq*1], xmm0, 1 pextrd [dstq+strideq*2], xmm0, 2 @@ -1133,9 +1135,13 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, 
stride3 jg .w4 RET .w8: - pshufb xmm0, xm4, [idxq+16*0] - pshufb xmm1, xm4, [idxq+16*1] - add idxq, 16*2 + movu xmm2, [idxq] + add idxq, 16 + pshufb xmm1, xmm3, xmm2 + psrlw xmm2, 4 + pshufb xmm2, xmm3, xmm2 + punpcklbw xmm0, xmm1, xmm2 + punpckhbw xmm1, xmm2 movq [dstq+strideq*0], xmm0 movhps [dstq+strideq*1], xmm0 movq [dstq+strideq*2], xmm1 @@ -1145,8 +1151,10 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3 jg .w8 RET .w16: - pshufb m0, m4, [idxq] - add idxq, 64 + pmovzxdq m0, [idxq] + add idxq, 32 + vpmultishiftqb m0, m3, m0 + pshufb m0, m5, m0 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 @@ -1156,29 +1164,39 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3 jg .w16 RET .w32: - pshufb m0, m4, [idxq+64*0] - pshufb m1, m4, [idxq+64*1] - add idxq, 64*2 + vpbroadcastq m3, [pal_unpack+0] + vpbroadcastq m5, [palq] + cmp wd, 32 + jl .w16 + pmovzxbd m2, [pal_perm] + vpbroadcastq m4, [pal_unpack+8] + jg .w64 +.w32_loop: + vpermd m1, m2, [idxq] + add idxq, 64 + vpmultishiftqb m0, m3, m1 + vpmultishiftqb m1, m4, m1 + pshufb m0, m5, m0 + pshufb m1, m5, m1 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 - jg .w32 + jg .w32_loop RET .w64: - pshufb m0, m4, [idxq+64*0] - pshufb m1, m4, [idxq+64*1] - pshufb m2, m4, [idxq+64*2] - pshufb m3, m4, [idxq+64*3] - add idxq, 64*4 + vpermd m1, m2, [idxq] + add idxq, 64 + vpmultishiftqb m0, m3, m1 + vpmultishiftqb m1, m4, m1 + pshufb m0, m5, m0 + pshufb m1, m5, m1 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 - mova [dstq+strideq*2], m2 - mova [dstq+stride3q ], m3 - lea dstq, [dstq+strideq*4] - sub hd, 4 + lea dstq, [dstq+strideq*2] + sub hd, 2 jg .w64 RET diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred_sse.asm b/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred_sse.asm index 9f548aadb..976f33a24 100644 --- 
a/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred_sse.asm +++ b/prog/3rdPartyLibs/codecs/dav1d/src/x86/ipred_sse.asm @@ -55,22 +55,61 @@ smooth_weights: SMOOTH_WEIGHT_TABLE \ 18, 16, 15, 13, 12, 10, 9, 8, \ 7, 6, 6, 5, 5, 4, 4, 4 -ipred_v_shuf : db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 -ipred_h_shuf : db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 -ipred_paeth_shuf : db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 -filter_shuf1 : db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1 -filter_shuf2 : db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1 - -pw_8 : times 8 dw 8 -pb_3 : times 16 db 3 -pb_128 : times 8 db 128 -pw_128 : times 4 dw 128 -pw_255 : times 4 dw 255 -pb_2 : times 8 db 2 -pb_4 : times 8 db 4 -pb_127_m127 : times 4 db 127, -127 -pd_32768 : times 1 dd 32768 - +ipred_v_shuf: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 +ipred_h_shuf: db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 +ipred_paeth_shuf: db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 +z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 +z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8 +z_transpose4: db 8, 12, 0, 4, 9, 13, 1, 5, 10, 14, 2, 6, 11, 15, 3, 7 +z3_shuf: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 +z3_shuf_h4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8 +filter_shuf1: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1 +filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1 +z_filter_wh4: db 7, 7, 19, 7, +z_filter_wh8: db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39 +pd_32768: dd 32768 +z3_filter_k_tail: db 64, 0, 64, 0, 64, 0, 56, 8 +z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 +pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 +z3_base_inc: dw 7*64, 6*64, 5*64, 4*64, 3*64, 2*64, 1*64, 0*64 +z_filter_wh16: db 19, 19, 
19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1 +z_filter_t_w48: db 55,127, 7,127, 15, 31, 39, 31,127, 39,127, 39, 7, 15, 31, 15 + db 39, 63, 3, 63, 3, 3, 19, 3, 47, 19, 47, 19, 3, 3, 3, 3 +z_filter_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, 0 +z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 + db 7, 8, 8, 9, 9, 10, 10, 11 +z_filter_k_tail: db 0, 64, 0, 64, 8, 56, 0, 64 +z2_h_shuf: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11 +z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8 +z2_dy_offset: dw 88*64, 88*64, 87*64, 87*64 +pw_m1to4: dw -1, -2, -3, -4 +z_filter_k: times 4 db 0, 16 + times 4 db 0, 20 + times 4 db 8, 16 + times 4 db 32, 16 + times 4 db 24, 20 + times 4 db 16, 16 + times 4 db 0, 0 + times 4 db 0, 0 +pw_8: times 8 db 8, 0 +pb_3: times 16 db 3 +pb_16: times 16 db 16 +pw_62: times 8 dw 62 +pw_64: times 8 dw 64 +pw_256: times 8 dw 256 +pw_512: times 8 dw 512 +pw_m256: times 8 dw -256 +pb_2: times 8 db 2 +pb_4: times 8 db 4 +pb_8: times 8 db 8 +pb_128: times 8 db 128 +pb_m16: times 8 db -16 +pw_128: times 4 dw 128 +pw_255: times 4 dw 255 +pb_36_m4: times 4 db 36, -4 +pb_127_m127: times 4 db 127, -127 %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) @@ -93,15 +132,18 @@ JMP_TABLE ipred_smooth, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_paeth, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z1, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z2, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z3, ssse3, h4, h8, h16, h32, h64 JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 JMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32 JMP_TABLE ipred_filter, ssse3, w4, w8, w16, w32 +cextern dr_intra_derivative cextern filter_intra_taps - SECTION .text 
;--------------------------------------------------------------------------------------- @@ -1190,26 +1232,2275 @@ ALIGN function_align jg .w64_loop RET -;--------------------------------------------------------------------------------------- -;int dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal, -; const uint8_t *idx, const int w, const int h); -;--------------------------------------------------------------------------------------- +%if ARCH_X86_64 +cglobal ipred_z1_8bpc, 3, 8, 11, 16*12, dst, stride, tl, w, h, angle, dx + %define base r7-$$ + lea r7, [$$] + mova m8, [base+pw_62] + mova m9, [base+pw_64] + mova m10, [base+pw_512] +%else +cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w, h, angle, dx + %define base r1-$$ + %define m8 [base+pw_62] + %define m9 [base+pw_64] + %define m10 [base+pw_512] + %define strideq r3 + %define stridemp dword [rsp+16*12] + mov stridemp, r1 + LEA r1, $$ +%endif + tzcnt wd, wm + movifnidn angled, anglem + movifnidn hd, hm + inc tlq + movsxd wq, [base+ipred_z1_ssse3_table+wq*4] + mov dxd, angled + and dxd, 0x7e + add angled, 165 ; ~90 + lea wq, [base+wq+ipred_z1_ssse3_table] + movzx dxd, word [base+dr_intra_derivative+dxq] + xor angled, 0x4ff ; d = 90 - angle + jmp wq +.w4: + lea r3d, [angleq+88] + test r3d, 0x480 + jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40 + sar r3d, 9 + add r3d, hd + cmp r3d, 8 + jg .w4_no_upsample ; h > 8 || (w == h && is_sm) + mova m1, [tlq-1] + pshufb m0, m1, [base+z_upsample1] + pshufb m1, [base+z_upsample2] + movddup m2, [base+pb_36_m4] + add dxd, dxd + pmaddubsw m0, m2 + pshufd m7, m1, q3333 + movd [rsp+16], m7 ; top[max_base_x] + pmaddubsw m1, m2 + movd m6, dxd + mov r5d, dxd ; xpos + pshufb m6, [base+pw_256] + paddw m1, m0 + movq m0, [tlq] + pmulhrsw m1, m10 + paddw m7, m6, m6 + punpcklqdq m6, m7 ; xpos0 xpos1 + packuswb m1, m1 + punpcklbw m0, m1 + movifnidn strideq, stridemp + mova [rsp], m0 +.w4_upsample_loop: + lea r2d, [r5+dxq] + shr r5d, 6 ; 
base0 + movq m0, [rsp+r5] + lea r5d, [r2+dxq] + shr r2d, 6 ; base1 + movhps m0, [rsp+r2] + pand m2, m8, m6 ; frac + psubw m1, m9, m2 ; 64-frac + psllw m2, 8 + por m1, m2 ; 64-frac, frac + pmaddubsw m0, m1 + paddw m6, m7 ; xpos += dx + pmulhrsw m0, m10 + packuswb m0, m0 + movd [dstq+strideq*0], m0 + pshuflw m0, m0, q1032 + movd [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w4_upsample_loop + RET +.w4_no_upsample: + mov r3d, 7 ; max_base + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w4_main + lea r3d, [hq+3] + movd m0, r3d + movd m2, angled + shr angled, 8 ; is_sm << 1 + pxor m1, m1 + pshufb m0, m1 + pshufb m2, m1 + pcmpeqb m1, m0, [base+z_filter_wh4] + pand m1, m2 + pcmpgtb m1, [base+z_filter_t_w48+angleq*8] + pmovmskb r5d, m1 + mov r3d, 7 + test r5d, r5d + jz .w4_main ; filter_strength == 0 + mova m3, [tlq-1] + imul r5d, 0x55555555 + movu m7, [base+z_filter_s+8] + shr r5d, 30 ; filter_strength + movddup m0, [base+pb_8] + pminub m7, m0 + pshufb m0, m3, [base+z_filter_s] + movddup m4, [base+z_filter_k-8+r5*8+24*0] + pshufb m3, m7 + movddup m5, [base+z_filter_k-8+r5*8+24*1] + shufps m2, m0, m3, q2121 + movddup m6, [base+z_filter_k-8+r5*8+24*2] + pmaddubsw m0, m4 + pmaddubsw m1, m2, m4 + pmaddubsw m2, m5 + paddd m5, m6 + pmaddubsw m4, m3, m5 + pmaddubsw m3, m6 + paddw m0, m2 + paddw m1, m4 + paddw m0, m3 + pshufd m1, m1, q3333 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + mov r5d, 9 + mov tlq, rsp + cmp hd, 4 + cmovne r3d, r5d + packuswb m0, m1 + mova [tlq], m0 +.w4_main: + add tlq, r3 + movd m5, dxd + movddup m0, [base+z_base_inc] ; base_inc << 6 + movd m7, [tlq] ; top[max_base_x] + shl r3d, 6 + movd m4, r3d + pshufb m5, [base+pw_256] + mov r5d, dxd ; xpos + pshufb m7, [base+pw_m256] + sub r5, r3 + pshufb m4, [base+pw_256] + mova m3, [base+z1_shuf_w4] + paddw m6, m5, m5 + psubw m4, m0 ; max_base_x + punpcklqdq m5, m6 ; xpos0 xpos1 +.w4_loop: + lea r3, [r5+dxq] + sar r5, 6 ; base0 + movq m0, [tlq+r5] + lea r5, [r3+dxq] + sar r3, 6 ; base1 + 
movhps m0, [tlq+r3] + pand m2, m8, m5 ; frac + psubw m1, m9, m2 ; 64-frac + psllw m2, 8 + pshufb m0, m3 + por m1, m2 ; 64-frac, frac + pmaddubsw m0, m1 + movifnidn strideq, stridemp + pcmpgtw m1, m4, m5 ; base < max_base_x + pmulhrsw m0, m10 + paddw m5, m6 ; xpos += dx + pand m0, m1 + pandn m1, m7 + por m0, m1 + packuswb m0, m0 + movd [dstq+strideq*0], m0 + pshuflw m0, m0, q1032 + movd [dstq+strideq*1], m0 + sub hd, 2 + jz .w4_end + lea dstq, [dstq+strideq*2] + test r5d, r5d + jl .w4_loop + packuswb m7, m7 +.w4_end_loop: + movd [dstq+strideq*0], m7 + movd [dstq+strideq*1], m7 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w4_end_loop +.w4_end: + RET +.w8: + lea r3d, [angleq+88] + and r3d, ~0x7f + or r3d, hd + cmp r3d, 8 + ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 + mova m5, [base+z_upsample1] + movu m3, [base+z_filter_s+6] + movd m4, hd + mova m0, [tlq-1] + movu m1, [tlq+7] + pxor m7, m7 + pshufb m4, m7 + movddup m7, [base+pb_36_m4] + pminub m4, m3 + add dxd, dxd + pshufb m2, m0, m5 + pmaddubsw m2, m7 + pshufb m0, m3 + pmaddubsw m0, m7 + movd m6, dxd + pshufb m3, m1, m5 + pmaddubsw m3, m7 + pshufb m1, m4 + pmaddubsw m1, m7 + pshufb m6, [base+pw_256] + mov r5d, dxd + paddw m2, m0 + paddw m7, m6, m6 + paddw m3, m1 + punpcklqdq m6, m7 ; xpos0 xpos1 + movu m1, [tlq] + pmulhrsw m2, m10 + pmulhrsw m3, m10 + packuswb m2, m3 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + movifnidn strideq, stridemp + mova [rsp+16*0], m0 + mova [rsp+16*1], m1 +.w8_upsample_loop: + lea r2d, [r5+dxq] + shr r5d, 6 ; base0 + movu m0, [rsp+r5] + lea r5d, [r2+dxq] + shr r2d, 6 ; base1 + movu m1, [rsp+r2] + pand m2, m8, m6 + psubw m3, m9, m2 + psllw m2, 8 + por m3, m2 + punpcklqdq m2, m3, m3 ; frac0 + pmaddubsw m0, m2 + punpckhqdq m3, m3 ; frac1 + pmaddubsw m1, m3 + paddw m6, m7 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packuswb m0, m1 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 2 + jg .w8_upsample_loop + RET 
+.w8_no_upsample: + lea r3d, [hq+7] + movd m0, r3d + and r3d, 7 + or r3d, 8 ; imin(h+7, 15) + test angled, 0x400 + jnz .w8_main + movd m2, angled + shr angled, 8 ; is_sm << 1 + pxor m1, m1 + pshufb m0, m1 + pshufb m2, m1 + movu m1, [base+z_filter_wh8] + psrldq m3, [base+z_filter_t_w48+angleq*8], 4 + pcmpeqb m1, m0 + pand m1, m2 + pcmpgtb m1, m3 + pmovmskb r5d, m1 + test r5d, r5d + jz .w8_main ; filter_strength == 0 + movd m3, [tlq-1] + movu m0, [tlq+16*0] + imul r5d, 0x55555555 + movu m1, [tlq+16*1] + shr r5d, 30 ; filter_strength + movd m2, [tlq+r3] + lea tlq, [rsp+16*4] + sub r5, 3 + mova [tlq-16*1], m0 + pxor m7, m7 + mova [tlq+16*0], m1 + pshufb m3, m7 + pshufb m2, m7 + mova [tlq-16*2], m3 + movq [tlq+r3-15], m2 + call .filter_edge + sar r5d, 1 + add r5d, 17 + cmp hd, 8 + cmova r3d, r5d +.w8_main: + add tlq, r3 + movd m5, dxd + movd m7, [tlq] + shl r3d, 6 + movu m3, [base+z_filter_s+2] + movd m4, r3d + pshufb m5, [base+pw_256] + mov r5d, dxd + pshufb m7, [base+pw_m256] + sub r5, r3 + pshufb m4, [base+pw_256] + psubw m4, [base+z_base_inc] + mova m6, m5 +.w8_loop: + mov r3, r5 + sar r3, 6 + movu m0, [tlq+r3] + pand m1, m8, m5 + psubw m2, m9, m1 + psllw m1, 8 + pshufb m0, m3 + por m1, m2 + pmaddubsw m0, m1 + pcmpgtw m1, m4, m5 + paddw m5, m6 + pmulhrsw m0, m10 + pand m0, m1 + pandn m1, m7 + por m0, m1 + packuswb m0, m0 + movq [dstq], m0 + dec hd + jz .w8_end + movifnidn strideq, stridemp + add dstq, strideq + add r5, dxq + jl .w8_loop + packuswb m7, m7 +.w8_end_loop: + movq [dstq], m7 + add dstq, strideq + dec hd + jg .w8_end_loop +.w8_end: + RET +.w16: + lea r3d, [hq+15] + movd m0, r3d + and r3d, 15 + or r3d, 16 ; imin(h+15, 31) + test angled, 0x400 + jnz .w16_main + movd m2, angled + shr angled, 8 ; is_sm << 1 + pxor m1, m1 + pshufb m0, m1 + pshufb m2, m1 + movq m3, [base+z_filter_t_w16+angleq*4] + pcmpeqb m0, [base+z_filter_wh16] + pand m0, m2 + pcmpgtb m0, m3 + pmovmskb r5d, m0 + test r5d, r5d + jz .w16_main ; filter_strength == 0 + movd m4, [tlq-1] + movu m0, 
[tlq+16*0] + imul r5d, 0x24924924 + movu m1, [tlq+16*1] + shr r5d, 30 + movd m2, [tlq+30] + adc r5, -4 ; filter_strength-3 + movd m3, [tlq+r3] + lea tlq, [rsp+16*4] + mova [tlq-16*1], m0 + pxor m7, m7 + mova [tlq+16*0], m1 + pshufb m4, m7 + movd [rsp], m2 + pshufb m3, m7 + mova [tlq-16*2], m4 + movd [tlq+r3-16], m3 + call .filter_edge + cmp hd, 16 + jle .w16_main + pshuflw m0, [rsp], q0000 + sar r5, 1 + movd m1, [base+z_filter_k_tail+4+r5*4] + lea r3d, [r5+33] + pmaddubsw m0, m1 +%if ARCH_X86_64 + pmulhrsw m0, m10 +%else + pmulhrsw m0, m4 +%endif + packuswb m0, m0 + movd [tlq+32], m0 +.w16_main: + add tlq, r3 + movd m5, dxd + movd m7, [tlq] + movd m4, r3d + shl r3d, 6 + pshufb m5, [base+pw_256] + pxor m6, m6 + pshufb m7, m6 + mov r5d, dxd + pshufb m4, m6 + sub r5, r3 + psubb m4, [base+pb_0to15] + mova m6, m5 +.w16_loop: + mov r3, r5 + sar r3, 6 + movu m1, [tlq+r3+0] + pand m0, m8, m5 + movu m2, [tlq+r3+1] + psubw m3, m9, m0 + psllw m0, 8 + por m3, m0 + punpcklbw m0, m1, m2 + pmaddubsw m0, m3 + punpckhbw m1, m2 + pmaddubsw m1, m3 + psrlw m3, m5, 6 + packsswb m3, m3 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + paddw m5, m6 + pcmpgtb m2, m4, m3 + packuswb m0, m1 + pand m0, m2 + pandn m2, m7 + por m0, m2 + mova [dstq], m0 + dec hd + jz .w16_end + movifnidn strideq, stridemp + add dstq, strideq + add r5, dxq + jl .w16_loop +.w16_end_loop: + mova [dstq], m7 + add dstq, strideq + dec hd + jg .w16_end_loop +.w16_end: + RET +.w32: + lea r3d, [hq+31] + and r3d, 31 + or r3d, 32 ; imin(h+31, 63) + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w32_main + movd m6, [tlq-1] + movu m0, [tlq+16*0] + movu m1, [tlq+16*1] + movu m2, [tlq+16*2] + movu m3, [tlq+16*3] + movd m4, [tlq+62] + movd m5, [tlq+r3] + lea tlq, [rsp+16*6] + mova [tlq-16*3], m0 + pxor m7, m7 + mova [tlq-16*2], m1 + pshufb m6, m7 + mova [tlq-16*1], m2 + xor r5d, r5d ; filter_strength = 3 + mova [tlq+16*0], m3 + movd [rsp], m4 + pshufb m5, m7 + mova [tlq-16*4], m6 + movd [tlq+r3-48], m5 + call .filter_edge + sub 
tlq, 16*2 + call .filter_edge + cmp hd, 32 + jle .w32_main + pshuflw m0, [rsp], q0000 + movd m1, [base+z_filter_k_tail+4] + add r3d, 2 + pmaddubsw m0, m1 +%if ARCH_X86_64 + pmulhrsw m0, m10 +%else + pmulhrsw m0, m4 +%endif + packuswb m0, m0 + movd [tlq+64], m0 +.w32_main: + add tlq, r3 + movd m0, r3d + movd m7, [tlq] + shl r3d, 6 + movd m5, dxd + pxor m6, m6 + mov r5d, dxd + pshufb m0, m6 + pshufb m5, [base+pw_256] + sub r5, r3 + pshufb m7, m6 + psubb m0, [base+pb_0to15] + movddup m1, [base+pb_m16] + mova [rsp+16*0], m0 + paddb m0, m1 + mova [rsp+16*1], m0 + mova m6, m5 +.w32_loop: + mov r3, r5 + sar r3, 6 + movu m1, [tlq+r3+16*0+0] + pand m0, m8, m5 + movu m2, [tlq+r3+16*0+1] + psubw m3, m9, m0 + psllw m0, 8 + por m3, m0 + punpcklbw m0, m1, m2 + pmaddubsw m0, m3 + punpckhbw m1, m2 + pmaddubsw m1, m3 + psrlw m4, m5, 6 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packsswb m4, m4 + pcmpgtb m2, [rsp+16*0], m4 + packuswb m0, m1 + pand m0, m2 + pandn m2, m7 + por m0, m2 + movu m1, [tlq+r3+16*1+0] + movu m2, [tlq+r3+16*1+1] + mova [dstq+16*0], m0 + punpcklbw m0, m1, m2 + pmaddubsw m0, m3 + punpckhbw m1, m2 + pmaddubsw m1, m3 + paddw m5, m6 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pcmpgtb m2, [rsp+16*1], m4 + packuswb m0, m1 + pand m0, m2 + pandn m2, m7 + por m0, m2 + mova [dstq+16*1], m0 + dec hd + jz .w32_end + movifnidn strideq, stridemp + add dstq, strideq + add r5, dxq + jl .w32_loop +.w32_end_loop: + mova [dstq+16*0], m7 + mova [dstq+16*1], m7 + add dstq, strideq + dec hd + jg .w32_end_loop +.w32_end: + RET +.w64: + lea r3d, [hq+63] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .w64_main + movd m4, [tlq-1] + movu m0, [tlq+16*0] + movu m1, [tlq+16*1] + movu m2, [tlq+16*2] + movu m3, [tlq+16*3] + mova [rsp+16*3], m0 + pxor m7, m7 + mova [rsp+16*4], m1 + pshufb m4, m7 + mova [rsp+16*5], m2 + mova [rsp+16*6], m3 + mova [rsp+16*2], m4 + movu m0, [tlq+16*4] + movu m1, [tlq+16*5] + movu m2, [tlq+16*6] + movu m3, [tlq+16*7] + movd m4, [tlq+r3] + lea tlq, [rsp+16*10] + 
mova [tlq-16*3], m0 + xor r5d, r5d ; filter_strength = 3 + mova [tlq-16*2], m1 + pshufb m4, m7 + mova [tlq-16*1], m2 + mova [tlq+16*0], m3 + movd [tlq+r3-16*7], m4 + cmp hd, 64 + jl .w64_filter96 ; skip one call if the last 32 bytes aren't used + call .filter_edge +.w64_filter96: + sub tlq, 16*2 + call .filter_edge + sub tlq, 16*2 + call .filter_edge + sub tlq, 16*2 + call .filter_edge +.w64_main: + add tlq, r3 + movd m0, r3d + movd m7, [tlq] + shl r3d, 6 + movd m5, dxd + pxor m6, m6 + mov r5d, dxd + pshufb m0, m6 + sub r5, r3 + pshufb m5, [base+pw_256] + pshufb m7, m6 + psubb m0, [base+pb_0to15] + movddup m1, [base+pb_m16] + mova [rsp+16*0], m0 + paddb m0, m1 + mova [rsp+16*1], m0 + paddb m0, m1 + mova [rsp+16*2], m0 + paddb m0, m1 + mova [rsp+16*3], m0 + mova m6, m5 +.w64_loop: + mov r3, r5 + sar r3, 6 + movu m1, [tlq+r3+16*0+0] + pand m0, m8, m5 + movu m2, [tlq+r3+16*0+1] + psubw m3, m9, m0 + psllw m0, 8 + por m3, m0 + punpcklbw m0, m1, m2 + pmaddubsw m0, m3 + punpckhbw m1, m2 + pmaddubsw m1, m3 + psrlw m4, m5, 6 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packsswb m4, m4 + pcmpgtb m2, [rsp+16*0], m4 + packuswb m0, m1 + pand m0, m2 + pandn m2, m7 + por m0, m2 + movu m1, [tlq+r3+16*1+0] + movu m2, [tlq+r3+16*1+1] + mova [dstq+16*0], m0 + punpcklbw m0, m1, m2 + pmaddubsw m0, m3 + punpckhbw m1, m2 + pmaddubsw m1, m3 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pcmpgtb m2, [rsp+16*1], m4 + packuswb m0, m1 + pand m0, m2 + pandn m2, m7 + por m0, m2 + movu m1, [tlq+r3+16*2+0] + movu m2, [tlq+r3+16*2+1] + mova [dstq+16*1], m0 + punpcklbw m0, m1, m2 + pmaddubsw m0, m3 + punpckhbw m1, m2 + pmaddubsw m1, m3 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pcmpgtb m2, [rsp+16*2], m4 + packuswb m0, m1 + pand m0, m2 + pandn m2, m7 + por m0, m2 + movu m1, [tlq+r3+16*3+0] + movu m2, [tlq+r3+16*3+1] + mova [dstq+16*2], m0 + punpcklbw m0, m1, m2 + pmaddubsw m0, m3 + punpckhbw m1, m2 + pmaddubsw m1, m3 + paddw m5, m6 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pcmpgtb m2, [rsp+16*3], m4 + packuswb m0, 
m1 + pand m0, m2 + pandn m2, m7 + por m0, m2 + mova [dstq+16*3], m0 + dec hd + jz .w64_end + movifnidn strideq, stridemp + add dstq, strideq + add r5, dxq + jl .w64_loop +.w64_end_loop: + mova [dstq+16*0], m7 + mova [dstq+16*1], m7 + mova [dstq+16*2], m7 + mova [dstq+16*3], m7 + add dstq, strideq + dec hd + jg .w64_end_loop +.w64_end: + RET +ALIGN function_align +.filter_edge: ; 32 pixels/iteration + movddup m7, [base+z_filter_k+8*2+r5*8+24*0] + movu m2, [tlq-18] + movu m1, [tlq-17] + movu m3, [tlq- 2] + movu m4, [tlq- 1] + punpcklbw m0, m2, m1 + pmaddubsw m0, m7 + punpckhbw m2, m1 + pmaddubsw m2, m7 + punpcklbw m1, m3, m4 + pmaddubsw m1, m7 + punpckhbw m3, m4 + pmaddubsw m3, m7 + movddup m7, [base+z_filter_k+8*2+r5*8+24*1] + mova m5, [tlq-16] + movu m6, [tlq-15] + punpcklbw m4, m5, m6 + pmaddubsw m4, m7 + punpckhbw m5, m6 + pmaddubsw m5, m7 + paddw m0, m4 + paddw m2, m5 + mova m5, [tlq+ 0] + movu m6, [tlq+ 1] + punpcklbw m4, m5, m6 + pmaddubsw m4, m7 + punpckhbw m5, m6 + pmaddubsw m5, m7 + paddw m1, m4 + paddw m3, m5 + test r5d, r5d + jnz .filter_end ; 3-tap + movddup m7, [base+z_filter_k+8*8] + movu m5, [tlq-14] + movu m6, [tlq+ 2] + punpcklbw m4, m5, m5 + pmaddubsw m4, m7 + punpckhbw m5, m5 + pmaddubsw m5, m7 + paddw m0, m4 + paddw m2, m5 + punpcklbw m5, m6, m6 + pmaddubsw m5, m7 + punpckhbw m6, m6 + pmaddubsw m6, m7 + paddw m1, m5 + paddw m3, m6 +.filter_end: +%if ARCH_X86_64 + REPX {pmulhrsw x, m10}, m0, m2, m1, m3 +%else + mova m4, m10 + REPX {pmulhrsw x, m4 }, m0, m2, m1, m3 +%endif + packuswb m0, m2 + packuswb m1, m3 + mova [tlq+16*0], m0 + mova [tlq+16*1], m1 + ret + +%if ARCH_X86_64 +cglobal ipred_z2_8bpc, 4, 12, 13, 16*16, dst, stride, tl, w, h, angle, dx, _, dy + %define base r7-$$ + %define maxwm r6m + %define maxhm r7m + lea r7, [$$] + mov hd, hm + mova m8, [base+pw_62] + mova m9, [base+pw_64] + lea r9d, [wq-4] + mova m10, [base+pw_512] + shl r9d, 6 + mova m11, [base+z1_shuf_w4] + or r9d, hd + mova m12, [base+z2_h_shuf] +%else +cglobal ipred_z2_8bpc, 
4, 7, 8, -16*20, dst, _, tl, w, h, angle, dx + %define base r1-$$ + %define m8 [base+pw_62] + %define m9 [base+pw_64] + %define m10 [base+pw_512] + %define m11 [rsp+16*16] + %define m12 [rsp+16*17] + %define r9b byte [rsp+16*18+4*0] + %define r9d dword [rsp+16*18+4*0] + %define r10d dword [rsp+16*18+4*1] + %define r11d dword [rsp+16*18+4*2] + %define maxwm [rsp+16*18+4*3] + %define maxhm [rsp+16*19+4*0] + %define stridemp [rsp+16*19+4*1] + %define strideq r3 + %define dyd r4 + %define dyq r4 + mov stridemp, r1 + mov r1d, r6m + mov r4d, r7m + mov maxwm, r1d + mov maxhm, r4d + LEA r1, $$ + lea hd, [wq-4] + mova m0, [base+z1_shuf_w4] + shl hd, 6 + mova m1, [base+z2_h_shuf] + or hd, hm + mova m11, m0 + mov r9d, hd + mova m12, m1 +%endif + tzcnt wd, wd + movifnidn angled, anglem + movsxd wq, [base+ipred_z2_ssse3_table+wq*4] +%if ARCH_X86_64 + movzx dxd, angleb +%else + movzx dxd, byte anglem +%endif + xor angled, 0x400 + mova m0, [tlq-16*4] + mov dyd, dxd + mova m1, [tlq-16*3] + neg dxq + mova m2, [tlq-16*2] + and dyd, ~1 + mova m3, [tlq-16*1] + and dxq, ~1 + movd m4, [tlq] + movu m5, [tlq+16*0+1] + movu m6, [tlq+16*1+1] + movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90 + movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle + mova [rsp+16*2], m0 + pxor m7, m7 + mova [rsp+16*3], m1 + pshufb m4, m7 + mova [rsp+16*4], m2 + lea wq, [base+ipred_z2_ssse3_table+wq] + mova [rsp+16*5], m3 + neg dxd + mova [rsp+16*6], m4 + or dyd, 4<<16 + mova [rsp+16*7], m4 + mova [rsp+16*8], m5 + mova [rsp+16*9], m6 + movq m0, [base+z_base_inc+2] + movsldup m1, [base+z2_dy_offset] + movq m2, [base+pw_256] ; 4<<6 + movq [rsp+16*14+8*0], m0 + movq [rsp+16*15+8*0], m1 + movq [rsp+16*15+8*1], m2 +%if ARCH_X86_64 + lea r10d, [dxq+(128<<6)] ; xpos +%else + mov [rsp+16*7+4*1], dyd + lea r4d, [dxq+(128<<6)] + mov r10d, r4d + movzx hd, r9b +%endif + mov r11d, (128-4)<<6 + jmp wq +.w4: + test angled, 0x400 + jnz .w4_main + movd m5, [tlq+4] + lea r3d, [hq+2] + add angled, 1022 + 
pshufb m5, m7 + shl r3d, 6 + movd [rsp+16*8+4], m5 + test r3d, angled + jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) + call .upsample_above + sub angled, 1075 ; angle - 53 + lea r3d, [hq+3] + xor angled, 0x7f ; 180 - angle + movd m0, r3d + movd m6, angled + shr angled, 8 ; is_sm << 1 + pshufb m0, m7 + pshufb m6, m7 + pcmpeqb m0, [base+z_filter_wh4] + pand m6, m0 + pcmpgtb m6, [base+z_filter_t_w48+angleq*8] + jmp .w8_filter_left +.upsample_above: ; w4/w8 + movq m3, [rsp+gprsize+16*8-2] + movq m1, [rsp+gprsize+16*8-1] + movq m0, [rsp+gprsize+16*8+0] + movq m4, [rsp+gprsize+16*8+1] + movddup m5, [base+pb_36_m4] + punpcklbw m1, m3 + punpcklbw m2, m0, m4 + pmaddubsw m1, m5 + pmaddubsw m2, m5 +%if ARCH_X86_64 + mova m11, [base+pb_0to15] + lea r10d, [r10+dxq+(1<<6)] + mov r11d, (128-7)<<6 +%else + mova m3, [base+pb_0to15] + mov r3d, [rsp+gprsize+16*18+4*1] + mov dword [rsp+gprsize+16*18+4*2], (128-7)<<6 + lea r3d, [r3+dxq+(1<<6)] + mov [rsp+gprsize+16*18+4*1], r3d + mova [rsp+gprsize+16*16], m3 +%endif + add dxd, dxd + paddw m1, m2 + pmulhrsw m1, m10 + movq m2, [rsp+gprsize+16*14] + paddw m2, m2 + movq [rsp+gprsize+16*14], m2 + packuswb m1, m1 + punpcklbw m1, m0 + mova [rsp+gprsize+16*8], m1 + ret +.w4_no_upsample_above: + lea r3d, [hq+3] + mov [rsp], angled + sub angled, 1112 ; angle - 90 + movd m0, r3d + mov r3d, 90 + movd m1, angled + sub r3d, angled ; 180 - angle + shr angled, 8 ; is_sm << 1 + movu m3, [base+z_filter_wh4] + mova m4, [base+z_filter_t_w48+angleq*8] + call .w8_filter_top + mov angled, [rsp] + lea r3d, [hq+2] + sub angled, 139 + shl r3d, 6 + test r3d, angled + jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8) +.upsample_left: ; w4/w8 + neg hq + movd m0, [tlq+hq] + pshufb m0, m7 + movd [rsp+16*6+hq-4], m0 + movq m3, [rsp+16*5+7] + movq m0, [rsp+16*5+8] + movq m2, [rsp+16*5+9] + movq m4, [rsp+16*5+10] + movddup m5, [base+pb_36_m4] + punpcklbw m1, m0, m3 + punpcklbw m2, m4 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + 
movshdup m3, [base+z2_dy_offset] +%if ARCH_X86_64 + mova m12, [base+z2_upsample] + add dyd, dyd +%else + mova m4, [base+z2_upsample] + shl dword [rsp+16*7+4*1], 1 + mova m12, m4 +%endif + paddw m1, m2 + pmulhrsw m1, m10 + movq [rsp+16*15], m3 + packuswb m1, m1 + punpcklbw m0, m1 + mova [rsp+16*5], m0 +.w4_main: + movd m6, dxd +%if ARCH_X86_64 + movd m3, dyd +%else + movd m3, [rsp+16*7+4*1] +%endif + movddup m0, [rsp+16*14+8*0] + pshufb m6, [base+pw_256] + paddw m7, m6, m6 + movq m5, [base+pw_m1to4] + pshuflw m4, m3, q0000 + punpcklqdq m6, m7 + pmullw m4, m5 + pshuflw m3, m3, q1111 + paddw m6, m0 + mov r2d, r10d + pshuflw m0, m4, q3333 + psubw m4, [rsp+16*15] + movq [rsp+16*6+8*1], m3 + movq [rsp+8*1], m0 ; dy*4 + mov r5, dstq +.w4_loop0: + mova [rsp+16*12], m6 + movq [rsp+8*0], m4 + pand m0, m4, m8 + psraw m4, 6 + psubw m1, m9, m0 + psllw m0, 8 + por m0, m1 ; 64-frac_y, frac_y + movq [rsp+8*3], m0 + pabsw m4, m4 + movq [rsp+8*2], m4 + movzx hd, r9b +.w4_loop: + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x0 + movq m0, [rsp+r2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x1 + movhps m0, [rsp+r3] + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x2 + movq m1, [rsp+r2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x3 + movhps m1, [rsp+r3] + pand m2, m8, m6 + paddsw m5, m6, m7 + psubw m3, m9, m2 + psllw m2, 8 + pshufb m0, m11 + por m2, m3 + pmaddubsw m0, m2 + pand m2, m8, m5 + psubw m3, m9, m2 + psllw m2, 8 + pshufb m1, m11 + por m2, m3 + pmaddubsw m1, m2 + cmp r3d, 127 ; topleft + jge .w4_toponly + movzx r3d, byte [rsp+8*2+0] ; base_y0 + movq m3, [rsp+r3] + movzx r3d, byte [rsp+8*2+2] ; base_y1 + movhps m3, [rsp+r3] + movzx r3d, byte [rsp+8*2+4] ; base_y2 + movq m4, [rsp+r3] + movzx r3d, byte [rsp+8*2+6] ; base_y3 + movhps m4, [rsp+r3] + pshufb m3, m12 + pshufb m4, m12 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + movddup m4, [rsp+8*3] + pmaddubsw m2, m4 + pmaddubsw m3, m4 + psraw m6, 15 ; base_x < topleft + pand m2, m6 + pandn m6, m0 + por m0, m2, m6 + psraw m6, m5, 15 + pand m3, m6 + pandn m6, 
m1 + por m1, m3, m6 +.w4_toponly: + pmulhrsw m0, m10 + pmulhrsw m1, m10 + movifnidn strideq, stridemp + packuswb m0, m1 + movd [dstq+strideq*0], m0 + pshuflw m1, m0, q1032 + movd [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + punpckhqdq m0, m0 + movd [dstq+strideq*0], m0 + psrlq m0, 32 + movd [dstq+strideq*1], m0 + sub hd, 4 + jz .w4_end + movq m4, [rsp+8*2] + movq m3, [rsp+16*6+8*1] + paddw m6, m5, m7 ; xpos += dx + psubw m4, m3 + movq [rsp+8*2], m4 + lea dstq, [dstq+strideq*2] + cmp r2d, r11d + jge .w4_loop + movddup m5, [rsp+8*3] +.w4_leftonly_loop: + movzx r2d, byte [rsp+8*2+0] ; base_y0 + movq m1, [rsp+r2] + movzx r2d, byte [rsp+8*2+2] ; base_y1 + movhps m1, [rsp+r2] + movzx r2d, byte [rsp+8*2+4] ; base_y2 + movq m2, [rsp+r2] + movzx r2d, byte [rsp+8*2+6] ; base_y3 + movhps m2, [rsp+r2] + psubw m4, m3 + pshufb m1, m12 + pshufb m2, m12 + movq [rsp+8*2], m4 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packuswb m0, m1 + movd [dstq+strideq*0], m0 + pshuflw m1, m0, q1032 + movd [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + punpckhqdq m0, m0 + movd [dstq+strideq*0], m0 + psrlq m0, 32 + movd [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 4 + jg .w4_leftonly_loop +.w4_end: + sub r9d, 1<<8 + jl .w4_ret + movq m4, [rsp+8*1] + add r5, 4 + mov dstq, r5 + paddw m4, [rsp+8*0] ; base_y += 4*dy + movzx r2d, word [rsp+16*15+8*1] + movddup m6, [rsp+16*15+8*1] + paddw m6, [rsp+16*12] ; base_x += (4 << upsample_above) + add r2d, r10d + mov r10d, r2d + jmp .w4_loop0 +.w4_ret: + RET +.w8: + test angled, 0x400 + jnz .w4_main + movd m5, [tlq+8] + lea r3d, [angleq+126] + pshufb m5, m7 +%if ARCH_X86_64 + mov r3b, hb +%else + xor r3b, r3b + or r3d, hd +%endif + movd [rsp+16*8+8], m5 + cmp r3d, 8 + ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm + call .upsample_above + sub angled, 53 + lea r3d, [hq+7] + xor angled, 0x7f ; 180 - angle + movu m1, [base+z_filter_wh8] + movd m0, r3d 
+ movd m6, angled + shr angled, 8 ; is_sm << 1 + psrldq m2, [base+z_filter_t_w48+angleq*8], 4 + pshufb m0, m7 + pshufb m6, m7 + pcmpeqb m0, m1 + pand m6, m0 + pcmpgtb m6, m2 +%if ARCH_X86_64 + movq [rsp+16*15+8*1], m10 ; 8<<6 +%else + movq m0, m10 + movq [rsp+16*15+8*1], m0 +%endif + jmp .w8_filter_left +.w8_no_upsample_above: + lea r3d, [hq+7] + mov [rsp], angled + sub angled, 90 + movd m0, r3d + mov r3d, 90 + movd m1, angled + sub r3d, angled ; 180 - angle + shr angled, 8 ; is_sm << 1 + movu m3, [base+z_filter_wh8] + psrldq m4, [base+z_filter_t_w48+angleq*8], 4 + call .w8_filter_top + mov r3d, [rsp] + sub r3d, 141 +%if ARCH_X86_64 + mov r3b, hb +%else + xor r3b, r3b + or r3d, hd +%endif + cmp r3d, 8 + jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm +.w8_filter_left: + pmovmskb r5d, m6 + test r5d, r5d + jz .w4_main + imul r5d, 0x55555555 + mov r3, tlq + shr r5d, 30 + sub r5, 3 ; filter_strength-3 + jmp .filter_left +.w8_filter_top: + movd m6, r3d + REPX {pshufb x, m7}, m0, m1, m6 + pcmpeqb m0, m3 + pand m1, m0 + pand m6, m0 + pcmpgtb m1, m4 + pcmpgtb m6, m4 + pmovmskb r5d, m1 + test r5d, r5d + jz .w8_filter_top_end ; filter_strength == 0 + imul r5d, 0x55555555 + movq m0, [rsp+gprsize+16*8-2] + shr r5d, 30 + movq m1, [rsp+gprsize+16*8-1] + sub r5, 3 ; filter_strength-3 + movddup m7, [base+z_filter_k+8*2+r5*8+24*0] + punpcklbw m0, m1 + pmaddubsw m0, m7 + movq m1, [rsp+gprsize+16*8+0] + movq m2, [rsp+gprsize+16*8+1] + movddup m7, [base+z_filter_k+8*2+r5*8+24*1] + punpcklbw m1, m2 + pmaddubsw m1, m7 + movq m2, [rsp+gprsize+16*8+2] + movddup m7, [base+z_filter_k+8*2+r5*8+24*2] + punpcklbw m2, m2 + pmaddubsw m2, m7 + paddw m0, m1 + paddw m0, m2 +%if ARCH_X86_64 + mov r3d, r7m ; maxw, offset due to call +%else + mov r3d, [rsp+gprsize+16*18+4*3] +%endif + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packuswb m0, m1 + movq [rsp+gprsize+16*8], m0 + cmp r3d, 8 + jge .w8_filter_top_end + movq m0, [tlq+r3+1] + movq [rsp+gprsize+r3+16*8], m0 +.w8_filter_top_end: + ret +.w16: + 
test angled, 0x400 + jnz .w4_main + lea r3d, [hq+15] + sub angled, 90 + movd m0, r3d + mov r3d, 90 + movd m1, angled + sub r3d, angled ; 180 - angle + shr angled, 8 ; is_sm << 1 + movd m6, r3d + REPX {pshufb x, m7}, m0, m1, m6 + movq m3, [base+z_filter_t_w16+angleq*4] + pcmpeqb m0, [base+z_filter_wh16] + pand m1, m0 + pand m6, m0 + pcmpgtb m1, m3 + pcmpgtb m6, m3 + pmovmskb r5d, m1 + mov r3, tlq + test r5d, r5d + jz .w16_filter_left ; filter_strength == 0 + imul r5d, 0x24924924 + pshufb m5, [base+z_filter_t_w16] ; tlq[16] + shr r5d, 30 + adc r5, -4 ; filter_strength-3 + movd [rsp+16*9], m5 + movddup m7, [base+z_filter_k+8*2+r5*8+24*0] + movu m1, [rsp+16*8-2] + movu m2, [rsp+16*8-1] + punpcklbw m0, m1, m2 + pmaddubsw m0, m7 + punpckhbw m1, m2 + pmaddubsw m1, m7 + movddup m7, [base+z_filter_k+8*2+r5*8+24*1] + mova m3, [rsp+16*8+0] + movu m4, [rsp+16*8+1] + punpcklbw m2, m3, m4 + pmaddubsw m2, m7 + punpckhbw m3, m4 + pmaddubsw m3, m7 + paddw m0, m2 + paddw m1, m3 + test r5d, r5d + jnz .w16_filter_end ; 3-tap + movddup m7, [base+z_filter_k+8*8] + movu m3, [rsp+16*8+2] + punpcklbw m2, m3, m3 + pmaddubsw m2, m7 + punpckhbw m3, m3 + pmaddubsw m3, m7 + paddw m0, m2 + paddw m1, m3 +.w16_filter_end: + mov r2d, maxwm + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packuswb m0, m1 + mova [rsp+16*8], m0 + cmp r2d, 16 + jge .w16_filter_left + movu m0, [r3+r2+1] + movu [rsp+r2+16*8], m0 +.w16_filter_left: + pmovmskb r5d, m6 + test r5d, r5d + jz .w4_main + imul r5d, 0x24924924 + shr r5d, 30 + adc r5, -4 ; filter_strength-3 + jmp .filter_left +.w32: + test angled, 0x400 + jnz .w4_main + pshufb m6, [base+z_filter_t_w16] ; tlq[32] + mov r3, tlq + lea tlq, [rsp+16*9] + movd [tlq+16*1], m6 + xor r5d, r5d ; filter_strength = 3 + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + mova m0, [tlq+16*0] + mova m1, [tlq+16*1] + mov r2d, maxwm + mova [rsp+16*8], m0 + mova [rsp+16*9], m1 + cmp r2d, 32 + jge .filter_left + movu m0, [r3+r2+16*0+1] + movu m1, [r3+r2+16*1+1] + movu 
[rsp+r2+16*8], m0 + movu [rsp+r2+16*9], m1 + jmp .filter_left +.w64: + movu m0, [tlq+16*2+1] + movu m1, [tlq+16*3+1] + mova [rsp+16*10], m0 + mova [rsp+16*11], m1 + test angled, 0x400 + jnz .w4_main + pshufb m1, [base+z_filter_t_w16] ; tlq[64] + mov r3, tlq + lea tlq, [rsp+16*11] + movd [tlq+16*1], m1 + xor r5d, r5d ; filter_strength = 3 + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + sub tlq, 16*2 + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + mova m0, [tlq+16*0] + mova m1, [tlq+16*1] + mova m2, [tlq+16*2] + mova m3, [tlq+16*3] + mov r2d, maxwm + mova [rsp+16* 8], m0 + mova [rsp+16* 9], m1 + mova [rsp+16*10], m2 + mova [rsp+16*11], m3 + cmp r2d, 64 + jge .filter_left + movu m0, [r3+r2+16*0+1] + movu m1, [r3+r2+16*1+1] + movu [rsp+r2+16* 8], m0 + movu [rsp+r2+16* 9], m1 + cmp r2d, 32 + jge .filter_left + movu m0, [r3+r2+16*2+1] + movu m1, [r3+r2+16*3+1] + movu [rsp+r2+16*10], m0 + movu [rsp+r2+16*11], m1 +.filter_left: + neg hq + movd m0, [r3+hq] + pxor m1, m1 + pshufb m0, m1 + movd [rsp+16*6+hq-4], m0 + lea tlq, [rsp+16*5] + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + cmp hd, -32 + jge .filter_left_end + sub tlq, 16*2 + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + mova m0, [tlq+16*0] + mova m1, [tlq+16*1] + mova [rsp+16*2], m0 + mova [rsp+16*3], m1 +.filter_left_end: + mov r2d, maxhm + mova m0, [rsp+16*5] + mova m1, [rsp+16*6] + mova m2, [rsp+16*7] + neg r2 + mova [rsp+16*4], m0 + mova [rsp+16*5], m1 + mova [rsp+16*6], m2 + cmp r2d, hd + jle .w4_main + movu m0, [r3+r2-16*2] + movu m1, [r3+r2-16*1] + movu [rsp+r2+16*4], m0 + movu [rsp+r2+16*5], m1 + cmp r2d, -32 + jle .w4_main + movu m0, [r3+r2-16*4] + movu m1, [r3+r2-16*3] + movu [rsp+r2+16*2], m0 + movu [rsp+r2+16*3], m1 + jmp .w4_main + +%if ARCH_X86_64 +cglobal ipred_z3_8bpc, 4, 9, 11, 16*10, dst, stride, tl, w, h, angle, dy, _, org_w + %define base r7-$$ + lea r7, [$$] + mova m8, [base+pw_62] + mova m9, [base+pw_64] + mova m10, 
[base+pw_512] + mov org_wd, wd +%else +cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, tl, w, h, angle, dy + %define base r1-$$ + %define m8 [base+pw_62] + %define m9 [base+pw_64] + %define m10 [base+pw_512] + %define org_wd r5 + %define org_wq r5 + mov [dstq+strideq*0], strideq + mov [dstq+strideq*1], wd + LEA r1, $$ +%endif + tzcnt hd, hm + movifnidn angled, anglem + dec tlq + movsxd hq, [base+ipred_z3_ssse3_table+hq*4] + sub angled, 180 + mov dyd, angled + neg dyd + xor angled, 0x400 + or dyq, ~0x7e + lea hq, [base+ipred_z3_ssse3_table+hq] + movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq] + jmp hq +.h4: + lea r4d, [angleq+88] + test r4d, 0x480 + jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40 + sar r4d, 9 + add r4d, wd + cmp r4d, 8 + jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm) + movu m3, [tlq-7] + movu m1, [base+z_upsample1-4] + movu m4, [base+z_filter_s+2] + pshufb m0, m3, m1 + pxor m1, m1 + pshufb m2, m3, m1 + pshufb m1, m3, m4 + mova [rsp+16], m2 ; top[max_base_y] + movddup m2, [base+pb_36_m4] + add dyd, dyd + pmaddubsw m0, m2 + pmaddubsw m1, m2 + movd m5, dyd + mov r5d, dyd + pshufb m5, [base+pw_256] + paddw m0, m1 + pmulhrsw m0, m10 + shl wd, 2 + mov tlq, rsp + sub rsp, wq + packuswb m0, m0 + punpcklbw m0, m3 + paddw m6, m5, m5 + punpcklqdq m5, m6 + pshufb m0, [base+pb_15to0] + mova [tlq], m0 +.h4_upsample_loop: + lea r4d, [r5+dyq] + shr r5d, 6 + movq m0, [tlq+r5] + lea r5d, [r4+dyq] + shr r4d, 6 + movhps m0, [tlq+r4] + pand m2, m8, m5 + psubw m1, m9, m2 + psllw m2, 8 + por m1, m2 + pmaddubsw m0, m1 + paddw m5, m6 + pmulhrsw m0, m10 + packuswb m0, m0 + movq [rsp+wq-8], m0 + sub wd, 8 + jg .h4_upsample_loop + jmp .h4_transpose +.h4_no_upsample: + mov r4d, 7 + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h4_main + lea r4d, [wq+3] + movd m0, r4d + movd m2, angled + shr angled, 8 ; is_sm << 1 + pxor m1, m1 + pshufb m0, m1 + pshufb m2, m1 + pcmpeqb m1, m0, [base+z_filter_wh4] + pand m1, m2 + pcmpgtb m1, 
[base+z_filter_t_w48+angleq*8] + pmovmskb r5d, m1 + mov r4d, 7 + test r5d, r5d + jz .h4_main ; filter_strength == 0 + movu m2, [tlq-7] + imul r5d, 0x55555555 + movu m3, [base+z_filter_s-2] + shr r5d, 30 ; filter_strength + mova m4, [base+z_upsample2] + movddup m5, [base+z_filter_k-8+r5*8+24*0] + movddup m6, [base+z_filter_k-8+r5*8+24*1] + movddup m7, [base+z_filter_k-8+r5*8+24*2] + pshufb m0, m2, m3 + shufps m3, m4, q2121 + pmaddubsw m1, m0, m5 + pmaddubsw m0, m6 + pshufb m5, m2, m3 + pmaddubsw m3, m5, m6 + pmaddubsw m5, m7 + pshufb m2, m4 + pmaddubsw m2, m7 + paddw m0, m1 + paddw m1, m3 + paddw m0, m5 + paddw m1, m2 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + lea r2d, [r4+2] + cmp wd, 4 + cmovne r4d, r2d + pshufd m0, m0, q0000 + lea tlq, [rsp+15] + packuswb m0, m1 + mova [rsp], m0 +.h4_main: + movd m5, dyd + movddup m0, [base+z_base_inc] ; base_inc << 6 + sub tlq, r4 + shl r4d, 6 + movd m7, [tlq] + movd m4, r4d + pshufb m5, [base+pw_256] + neg dyq + pshufb m7, [base+pw_m256] + mova m3, [base+z3_shuf_h4] + lea r5, [dyq+r4+63] ; ypos + pshufb m4, [base+pw_256] + psubw m4, m0 ; max_base_y + shl wd, 2 + paddw m6, m5, m5 + sub rsp, wq + punpcklqdq m5, m6 +.h4_loop: + lea r4, [r5+dyq] + sar r5, 6 + movq m0, [tlq+r5-4] + lea r5, [r4+dyq] + sar r4, 6 + movhps m0, [tlq+r4-4] + pand m2, m8, m5 + psubw m1, m9, m2 + psllw m2, 8 + pshufb m0, m3 + por m1, m2 + pmaddubsw m0, m1 + pcmpgtw m1, m4, m5 + paddw m5, m6 + pmulhrsw m0, m10 + pand m0, m1 + pandn m1, m7 + por m0, m1 + packuswb m0, m0 + movq [rsp+wq-8], m0 + sub wd, 8 + jz .h4_transpose + test r5d, r5d + jg .h4_loop + packuswb m7, m7 +.h4_end_loop: + movq [rsp+wq-8], m7 + sub wd, 8 + jg .h4_end_loop +.h4_transpose: + mova m1, [base+z_transpose4] +%if ARCH_X86_32 + mov strideq, [dstq] + mov org_wd, [dstq+strideq] +%endif + lea r2, [strideq*3] + lea dstq, [dstq+org_wq-4] +.h4_transpose_loop: + mova m0, [rsp] + add rsp, 16 + pshufb m0, m1 + movd [dstq+strideq*0], m0 + pshuflw m2, m0, q1032 + movd [dstq+strideq*1], m2 + punpckhqdq 
m0, m0 + movd [dstq+strideq*2], m0 + psrlq m0, 32 + movd [dstq+r2 ], m0 + sub dstq, 4 + sub org_wd, 4 + jg .h4_transpose_loop + RET +.h8: + lea r4d, [angleq+88] + and r4d, ~0x7f + or r4d, wd + cmp r4d, 8 + ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 + mova m4, [tlq-15] + and r4d, 4 + movu m3, [tlq- 9] + movd m1, r4d + movu m2, [base+z_filter_s+2] + pxor m0, m0 + movu m5, [base+z_filter_s+6] + movddup m7, [base+pb_36_m4] + pshufb m1, m0 ; w & 4 + movu m0, [base+z_upsample1-4] + pmaxub m1, m0 ; clip 4x8 + add dyd, dyd + pshufb m0, m4, m1 + pmaddubsw m0, m7 + pshufb m1, m4, m2 + pmaddubsw m1, m7 + pshufb m2, m3, [base+z_upsample1] + pmaddubsw m2, m7 + pshufb m3, m5 + pmaddubsw m3, m7 + movd m5, dyd + neg dyq + paddw m1, m0 + paddw m2, m3 + pmulhrsw m1, m10 + pmulhrsw m2, m10 + shl wd, 3 + lea tlq, [rsp+16] + pshufb m5, [base+pw_256] + sub rsp, wq + packuswb m1, m2 + lea r5, [dyq+63] + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + mova [tlq-16*1], m0 + mova [tlq-16*0], m1 + paddw m6, m5, m5 + punpcklqdq m5, m6 +.h8_upsample_loop: + lea r4, [r5+dyq] + sar r5, 6 + movu m0, [tlq+r5] + lea r5, [r4+dyq] + sar r4, 6 + movu m1, [tlq+r4] + pand m3, m8, m5 + psubw m2, m9, m3 + psllw m2, 8 + por m3, m2 + pshufd m2, m3, q1010 + pmaddubsw m0, m2 + punpckhqdq m3, m3 + pmaddubsw m1, m3 + paddw m5, m6 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packuswb m1, m0 + mova [rsp+wq-16], m1 + sub wd, 16 + jg .h8_upsample_loop + jmp .h8_transpose +.h8_no_upsample: + lea r4d, [wq+7] + movd m0, r4d + and r4d, 7 + or r4d, 8 ; imin(w+7, 15) + test angled, 0x400 + jnz .h8_main + movd m2, angled + shr angled, 8 ; is_sm << 1 + pxor m1, m1 + pshufb m0, m1 + pshufb m2, m1 + movu m1, [base+z_filter_wh8] + psrldq m3, [base+z_filter_t_w48+angleq*8], 4 + pcmpeqb m1, m0 + pand m1, m2 + pcmpgtb m1, m3 + pmovmskb r5d, m1 + test r5d, r5d + jz .h8_main ; filter_strength == 0 + mova m0, [tlq-15] + imul r5d, 0x55555555 + movd m1, [tlq+1] + neg r4 + movd m2, [tlq+r4] + shr r5d, 30 + pxor 
m7, m7 + lea tlq, [rsp+16*2] + sub r5, 3 ; filter_strength-3 + mova [tlq+16*0], m0 + pshufb m1, m7 + mova [tlq+16*1], m1 + pshufb m2, m7 + movq [tlq+r4+8], m2 + neg r4d + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + sar r5d, 1 + add tlq, 31 + add r5d, 17 + cmp wd, 8 + cmova r4d, r5d +.h8_main: + movd m5, dyd + sub tlq, r4 + shl r4d, 6 + movd m7, [tlq] + movd m4, r4d + pshufb m5, [base+pw_256] + neg dyq + pshufb m7, [base+pw_m256] + mova m3, [base+z3_shuf] + lea r5, [dyq+r4+63] + pshufb m4, [base+pw_256] + psubw m4, [base+z3_base_inc] + shl wd, 3 + mova m6, m5 + sub rsp, wq +.h8_loop: + mov r4, r5 + sar r4, 6 + movu m0, [tlq+r4-8] + pand m2, m8, m5 + psubw m1, m9, m2 + psllw m2, 8 + pshufb m0, m3 + por m1, m2 + pmaddubsw m0, m1 + pcmpgtw m1, m4, m5 + paddw m5, m6 + pmulhrsw m0, m10 + pand m0, m1 + pandn m1, m7 + por m0, m1 + packuswb m0, m0 + movq [rsp+wq-8], m0 + sub wd, 8 + jz .h8_transpose + add r5, dyq + jg .h8_loop + packuswb m7, m7 +.h8_end_loop: + movq [rsp+wq-8], m7 + sub wd, 8 + jg .h8_end_loop +.h8_transpose: +%if ARCH_X86_32 + mov strideq, [dstq] + mov org_wd, [dstq+strideq] +%endif + or r3d, 8 + cmp org_wd, 4 +%if ARCH_X86_64 + jne .end_transpose_main +%else + jne .end_transpose_loop +%endif + mova m1, [rsp+16*1] + mova m0, [rsp+16*0] + lea r2, [strideq*3] + add rsp, 16*2 + punpcklbw m2, m1, m0 + punpckhbw m1, m0 + punpckhbw m0, m1, m2 + punpcklbw m1, m2 +.write_4x8_end: + call .write_4x8 + RET +.write_4x8: + movd [dstq+r2 ], m0 + pshuflw m4, m0, q1032 + movd [dstq+strideq*2], m4 + punpckhqdq m0, m0 + movd [dstq+strideq*1], m0 + psrlq m0, 32 + movd [dstq+strideq*0], m0 + lea dstq, [dstq+strideq*4] + movd [dstq+r2 ], m1 + pshuflw m4, m1, q1032 + movd [dstq+strideq*2], m4 + punpckhqdq m1, m1 + movd [dstq+strideq*1], m1 + psrlq m1, 32 + movd [dstq+strideq*0], m1 + ret +.h16: + lea r4d, [wq+15] + movd m0, r4d + and r4d, 15 + or r4d, 16 ; imin(w+15, 31) + test angled, 0x400 + jnz .h16_main + movd m2, angled + shr angled, 8 ; is_sm << 1 + 
pxor m1, m1 + pshufb m0, m1 + pshufb m2, m1 + movq m3, [base+z_filter_t_w16+angleq*4] + pcmpeqb m1, m0, [base+z_filter_wh16] + pand m1, m2 + pcmpgtb m1, m3 + pmovmskb r5d, m1 + test r5d, r5d + jz .h16_main ; filter_strength == 0 + mova m0, [tlq-16*2+1] + imul r5d, 0x24924924 + mova m1, [tlq-16*1+1] + neg r4 + movd m2, [tlq-16*0+1] + shr r5d, 30 + movd m3, [tlq+r4] + adc r5, -4 ; filter_strength-3 + pxor m7, m7 + lea tlq, [rsp+16*2] + mova [tlq-16*1], m0 + pshufb m2, m7 + mova [tlq+16*0], m1 + pshufb m3, m7 + mova [tlq+16*1], m2 + movq [tlq+r4+8], m3 + neg r4d + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + add tlq, 31 + cmp wd, 16 + jle .h16_main + pshuflw m0, [tlq-47], q0000 + sar r5, 1 + movq m1, [base+z3_filter_k_tail+r5*4] + lea r4d, [r5+33] + pmaddubsw m0, m1 +%if ARCH_X86_64 + pmulhrsw m0, m10 +%else + pmulhrsw m0, m4 +%endif + packuswb m0, m0 + movd [tlq-35], m0 +.h16_main: + movd m5, dyd + sub tlq, r4 + movd m4, r4d + shl r4d, 6 + movd m7, [tlq] + pxor m6, m6 + pshufb m5, [base+pw_256] + neg dyq + pshufb m7, m6 + mova m3, [base+z3_shuf] + lea r5, [dyq+r4+63] + pshufb m4, m6 + psubb m4, [base+pb_15to0] + shl wd, 4 + mova m6, m5 + sub rsp, wq +.h16_loop: + mov r4, r5 + pand m2, m8, m5 + sar r4, 6 + psubw m1, m9, m2 + psllw m2, 8 + movu m0, [tlq+r4-8*2] + por m2, m1 + movu m1, [tlq+r4-8*1] + pshufb m0, m3 + pmaddubsw m0, m2 + pshufb m1, m3 + pmaddubsw m1, m2 + psrlw m2, m5, 6 + paddw m5, m6 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packsswb m2, m2 + packuswb m0, m1 + pcmpgtb m1, m4, m2 + pand m0, m1 + pandn m1, m7 + por m0, m1 + mova [rsp+wq-16], m0 + sub wd, 16 + jz .h16_transpose + add r5, dyq + jg .h16_loop +.h16_end_loop: + mova [rsp+wq-16], m7 + sub wd, 16 + jg .h16_end_loop +.h16_transpose: +%if ARCH_X86_32 + mov strideq, [dstq] + mov org_wd, [dstq+strideq] +%endif + or r3d, 16 + cmp org_wd, 4 +%if ARCH_X86_64 + jne .end_transpose_main +%else + jne .end_transpose_loop +%endif +.h16_transpose_w4: + mova m2, [rsp+16*3] + mova m4, 
[rsp+16*2] + mova m3, [rsp+16*1] + mova m0, [rsp+16*0] + lea r2, [strideq*3] + add rsp, 16*4 + punpckhbw m1, m2, m4 + punpcklbw m2, m4 + punpckhbw m4, m3, m0 + punpcklbw m3, m0 + punpckhwd m0, m1, m4 + punpcklwd m1, m4 + call .write_4x8 + lea dstq, [dstq+strideq*4] + punpckhwd m0, m2, m3 + punpcklwd m1, m2, m3 + jmp .write_4x8_end +.h32: + lea r4d, [wq+31] + and r4d, 31 + or r4d, 32 ; imin(w+31, 63) + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h32_main + mova m0, [tlq-16*4+1] + mova m1, [tlq-16*3+1] + mova m2, [tlq-16*2+1] + mova m3, [tlq-16*1+1] + movd m4, [tlq-16*0+1] + neg r4 + movd m5, [tlq+r4] + pxor m7, m7 + lea tlq, [rsp+16*4] + mova [tlq-16*3], m0 + mova [tlq-16*2], m1 + xor r5d, r5d ; filter_strength = 3 + mova [tlq-16*1], m2 + pshufb m4, m7 + mova [tlq+16*0], m3 + pshufb m5, m7 + mova [tlq+16*1], m4 + movq [tlq+r4+8], m5 + neg r4d + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + sub tlq, 16*2 + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + add tlq, 63 + cmp wd, 32 + jle .h32_main + pshuflw m0, [tlq-79], q0000 + movq m1, [base+z3_filter_k_tail] + add r4d, 2 + pmaddubsw m0, m1 +%if ARCH_X86_64 + pmulhrsw m0, m10 +%else + pmulhrsw m0, m4 +%endif + packuswb m0, m0 + movd [tlq-67], m0 +.h32_main: + movd m5, dyd + sub tlq, r4 + movd m4, r4d + shl r4d, 6 + movd m7, [tlq] + pxor m6, m6 + pshufb m5, [base+pw_256] + neg dyq + pshufb m7, m6 + mova m3, [base+z3_shuf] + lea r5, [dyq+r4+63] + pshufb m4, m6 + psubb m4, [base+pb_15to0] + mova m6, m5 +.h32_loop: + mov r4, r5 + pand m2, m8, m5 + sar r4, 6 + psubw m1, m9, m2 + psllw m2, 8 + movu m0, [tlq+r4-8*4] + por m2, m1 + movu m1, [tlq+r4-8*3] + pshufb m0, m3 + pmaddubsw m0, m2 + pshufb m1, m3 + pmaddubsw m1, m2 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + sub rsp, 32 + packuswb m0, m1 + mova [rsp+16*0], m0 + movu m0, [tlq+r4-8*2] + movu m1, [tlq+r4-8*1] + pshufb m0, m3 + pshufb m1, m3 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + psrlw m2, 
m5, 6 + paddw m5, m6 + packsswb m2, m2 + packuswb m0, m1 + pcmpgtb m1, m4, m2 + paddsb m2, [base+pb_16] + pand m0, m1 + pandn m1, m7 + por m0, m1 + pcmpgtb m1, m4, m2 + mova [rsp+16*1], m0 + pand m0, m1, [rsp+16*0] + pandn m1, m7 + por m0, m1 + mova [rsp+16*0], m0 + dec wd + jz .h32_transpose + add r5, dyq + jg .h32_loop +.h32_end_loop: + sub rsp, 32 + mova [rsp+16*1], m7 + mova [rsp+16*0], m7 + dec wd + jg .h32_end_loop +.h32_transpose: + or r3d, 32 + jmp .end_transpose_main +.h64: + lea r4d, [wq+63] + test angled, 0x400 ; !enable_intra_edge_filter + jnz .h64_main + mova m0, [tlq-16*8+1] + mova m1, [tlq-16*7+1] + mova m2, [tlq-16*6+1] + mova m3, [tlq-16*5+1] + mova [rsp+16*1], m0 + mova [rsp+16*2], m1 + mova [rsp+16*3], m2 + mova [rsp+16*4], m3 + mova m0, [tlq-16*4+1] + mova m1, [tlq-16*3+1] + mova m2, [tlq-16*2+1] + mova m3, [tlq-16*1+1] + movd m4, [tlq-16*0+1] + neg r4 + movd m5, [tlq+r4] + pxor m7, m7 + lea tlq, [rsp+16*8] + mova [tlq-16*3], m0 + mova [tlq-16*2], m1 + xor r5d, r5d ; filter_strength = 3 + mova [tlq-16*1], m2 + pshufb m4, m7 + mova [tlq+16*0], m3 + pshufb m5, m7 + mova [tlq+16*1], m4 + movq [tlq+r4+8], m5 + neg r4d + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + sub tlq, 16*2 + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + sub tlq, 16*2 + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + sub tlq, 16*2 + cmp wd, 64 + jl .h64_filter96 ; skip one call if the last 32 bytes aren't used + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge +.h64_filter96: + add tlq, 127 +.h64_main: + movd m5, dyd + sub tlq, r4 + movd m4, r4d + shl r4d, 6 + movd m7, [tlq] + pxor m6, m6 + pshufb m5, [base+pw_256] + neg dyq + pshufb m7, m6 + mova m3, [base+z3_shuf] + lea r5, [dyq+r4+63] + pshufb m4, m6 + psubb m4, [base+pb_15to0] + mova m6, m5 +.h64_loop: + mov r4, r5 + pand m2, m8, m5 + sar r4, 6 + psubw m1, m9, m2 + psllw m2, 8 + movu m0, [tlq+r4-8*8] + por m2, m1 + movu m1, [tlq+r4-8*7] + pshufb m0, m3 
+ pmaddubsw m0, m2 + pshufb m1, m3 + pmaddubsw m1, m2 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + sub rsp, 64 + packuswb m0, m1 + mova [rsp+16*0], m0 + movu m0, [tlq+r4-8*6] + movu m1, [tlq+r4-8*5] + pshufb m0, m3 + pshufb m1, m3 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packuswb m0, m1 + mova [rsp+16*1], m0 + movu m0, [tlq+r4-8*4] + movu m1, [tlq+r4-8*3] + pshufb m0, m3 + pshufb m1, m3 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packuswb m0, m1 + mova [rsp+16*2], m0 + movu m0, [tlq+r4-8*2] + movu m1, [tlq+r4-8*1] + pshufb m0, m3 + pshufb m1, m3 + pmaddubsw m0, m2 + pmaddubsw m1, m2 + pmulhrsw m0, m10 + pmulhrsw m1, m10 + psrlw m2, m5, 6 + paddw m5, m6 + packsswb m2, m2 + packuswb m0, m1 + pcmpgtb m1, m4, m2 + paddsb m2, [base+pb_16] + pand m0, m1 + pandn m1, m7 + por m0, m1 + pcmpgtb m1, m4, m2 + paddsb m2, [base+pb_16] + mova [rsp+16*3], m0 + pand m0, m1, [rsp+16*2] + pandn m1, m7 + por m0, m1 + pcmpgtb m1, m4, m2 + paddsb m2, [base+pb_16] + mova [rsp+16*2], m0 + pand m0, m1, [rsp+16*1] + pandn m1, m7 + por m0, m1 + pcmpgtb m1, m4, m2 + mova [rsp+16*1], m0 + pand m0, m1, [rsp+16*0] + pandn m1, m7 + por m0, m1 + mova [rsp+16*0], m0 + dec wd + jz .h64_transpose + add r5, dyq + jg .h64_loop +.h64_end_loop: + sub rsp, 64 + mova [rsp+16*3], m7 + mova [rsp+16*2], m7 + mova [rsp+16*1], m7 + mova [rsp+16*0], m7 + dec wd + jg .h64_end_loop +.h64_transpose: + or r3d, 64 +.end_transpose_main: +%if ARCH_X86_64 + lea r5, [r3*3] + lea r7, [strideq*3] +%else + mov strideq, [dstq] + mov org_wd, [dstq+strideq] +%endif +.end_transpose_loop: + lea r4, [rsp+r3-8] + lea r6, [dstq+org_wq-8] +.end_transpose_loop_y: + movq m0, [r4+r3*1] + movq m4, [r4+r3*0] +%if ARCH_X86_64 + movq m1, [r4+r5 ] + movq m5, [r4+r3*2] + lea r2, [r4+r3*4] +%else + lea r2, [r4+r3*2] + movq m1, [r2+r3*1] + movq m5, [r2+r3*0] + lea r2, [r2+r3*2] +%endif + movq m2, [r2+r3*1] + movq m6, [r2+r3*0] +%if ARCH_X86_64 + movq m3, [r2+r5 ] + movq m7, 
[r2+r3*2] +%else + lea r2, [r2+r3*2] + movq m3, [r2+r3*1] + movq m7, [r2+r3*0] +%endif + sub r4, 8 + punpcklbw m0, m4 + punpcklbw m1, m5 + punpcklbw m2, m6 + punpcklbw m3, m7 + punpckhwd m4, m1, m0 + punpcklwd m1, m0 + punpckhwd m0, m3, m2 + punpcklwd m3, m2 + punpckhdq m2, m3, m1 + punpckldq m3, m1 + punpckldq m1, m0, m4 + punpckhdq m0, m4 + movhps [r6+strideq*0], m0 + movq [r6+strideq*1], m0 +%if ARCH_X86_64 + movhps [r6+strideq*2], m1 + movq [r6+r7 ], m1 + lea r6, [r6+strideq*4] +%else + lea r6, [r6+strideq*2] + movhps [r6+strideq*0], m1 + movq [r6+strideq*1], m1 + lea r6, [r6+strideq*2] +%endif + movhps [r6+strideq*0], m2 + movq [r6+strideq*1], m2 +%if ARCH_X86_64 + movhps [r6+strideq*2], m3 + movq [r6+r7 ], m3 + lea r6, [r6+strideq*4] +%else + lea r6, [r6+strideq*2] + movhps [r6+strideq*0], m3 + movq [r6+strideq*1], m3 + lea r6, [r6+strideq*2] +%endif + cmp r4, rsp + jae .end_transpose_loop_y + lea rsp, [rsp+r3*8] + sub org_wd, 8 + jg .end_transpose_loop + RET + +;------------------------------------------------------------------------------- +;int dav1d_pal_pred_ssse3(pixel *dst, ptrdiff_t stride, const pixel *pal, +; const uint8_t *idx, int w, int h); +;------------------------------------------------------------------------------- cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h - mova m4, [palq] + movq m4, [palq] LEA r2, pal_pred_ssse3_table tzcnt wd, wm movifnidn hd, hm movsxd wq, [r2+wq*4] - packuswb m4, m4 add wq, r2 lea r2, [strideq*3] jmp wq .w4: - pshufb m0, m4, [idxq] - add idxq, 16 - movd [dstq ], m0 + movq m1, [idxq] + add idxq, 8 + psrlw m0, m1, 4 + punpcklbw m1, m0 + pshufb m0, m4, m1 + movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 - movd [dstq+strideq ], m1 + movd [dstq+strideq*1], m1 punpckhqdq m0, m0 movd [dstq+strideq*2], m0 psrlq m0, 32 @@ -1218,60 +3509,68 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h sub hd, 4 jg .w4 RET -ALIGN function_align .w8: - pshufb m0, m4, [idxq] - pshufb m1, m4, [idxq+16] - add idxq, 
32 - movq [dstq ], m0 - movhps [dstq+strideq ], m0 + movu m0, [idxq] + add idxq, 16 + pshufb m1, m4, m0 + psrlw m0, 4 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 movq [dstq+strideq*2], m1 movhps [dstq+r2 ], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8 RET -ALIGN function_align .w16: - pshufb m0, m4, [idxq] - pshufb m1, m4, [idxq+16] - pshufb m2, m4, [idxq+32] - pshufb m3, m4, [idxq+48] - add idxq, 64 - mova [dstq ], m0 - mova [dstq+strideq ], m1 - mova [dstq+strideq*2], m2 - mova [dstq+r2 ], m3 - lea dstq, [dstq+strideq*4] - sub hd, 4 + movu m0, [idxq] + add idxq, 16 + pshufb m1, m4, m0 + psrlw m0, 4 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub hd, 2 jg .w16 RET -ALIGN function_align .w32: - pshufb m0, m4, [idxq] - pshufb m1, m4, [idxq+16] - pshufb m2, m4, [idxq+32] - pshufb m3, m4, [idxq+48] - add idxq, 64 - mova [dstq ], m0 - mova [dstq+16 ], m1 - mova [dstq+strideq ], m2 - mova [dstq+strideq+16], m3 - lea dstq, [dstq+strideq*2] - sub hd, 2 + movu m0, [idxq] + add idxq, 16 + pshufb m1, m4, m0 + psrlw m0, 4 + pshufb m2, m4, m0 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + add dstq, strideq + dec hd jg .w32 RET -ALIGN function_align .w64: - pshufb m0, m4, [idxq] - pshufb m1, m4, [idxq+16] - pshufb m2, m4, [idxq+32] - pshufb m3, m4, [idxq+48] - add idxq, 64 - mova [dstq ], m0 - mova [dstq+16], m1 - mova [dstq+32], m2 - mova [dstq+48], m3 + movu m0, [idxq+16*0] + movu m2, [idxq+16*1] + add idxq, 32 + pshufb m1, m4, m0 + psrlw m0, 4 + pshufb m3, m4, m0 + punpcklbw m0, m1, m3 + punpckhbw m1, m3 + mova [dstq+16*0], m0 + mova [dstq+16*1], m1 + pshufb m1, m4, m2 + psrlw m2, 4 + pshufb m3, m4, m2 + punpcklbw m0, m1, m3 + punpckhbw m1, m3 + mova [dstq+16*2], m0 + mova [dstq+16*3], m1 add dstq, strideq sub hd, 1 jg .w64 diff --git 
a/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx.h b/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx.h index 9db0454ab..346fde7d9 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx.h @@ -1,6 +1,6 @@ /* - * Copyright © 2018-2021, VideoLAN and dav1d authors - * Copyright © 2018, Two Orioles, LLC + * Copyright © 2018-2023, VideoLAN and dav1d authors + * Copyright © 2018-2023, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -317,6 +317,9 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons assign_itx16_bpc_fn(R, 16, 8, 12, avx2); assign_itx12_bpc_fn( , 16, 16, 12, avx2); assign_itx2_bpc_fn (R, 32, 8, 12, avx2); + assign_itx_bpc_fn(R, 16, 32, identity_identity, IDTX, 12, avx2); + assign_itx_bpc_fn(R, 32, 16, identity_identity, IDTX, 12, avx2); + assign_itx_bpc_fn( , 32, 32, identity_identity, IDTX, 12, avx2); } #endif @@ -346,8 +349,18 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons if (bpc == 10) { assign_itx16_bpc_fn( , 8, 8, 10, avx512icl); assign_itx16_bpc_fn(R, 8, 16, 10, avx512icl); + assign_itx2_bpc_fn (R, 8, 32, 10, avx512icl); assign_itx16_bpc_fn(R, 16, 8, 10, avx512icl); assign_itx12_bpc_fn( , 16, 16, 10, avx512icl); + assign_itx2_bpc_fn (R, 16, 32, 10, avx512icl); + assign_itx2_bpc_fn (R, 32, 8, 10, avx512icl); + assign_itx2_bpc_fn (R, 32, 16, 10, avx512icl); + assign_itx2_bpc_fn ( , 32, 32, 10, avx512icl); + assign_itx1_bpc_fn (R, 16, 64, 10, avx512icl); + assign_itx1_bpc_fn (R, 32, 64, 10, avx512icl); + assign_itx1_bpc_fn (R, 64, 16, 10, avx512icl); + assign_itx1_bpc_fn (R, 64, 32, 10, avx512icl); + assign_itx1_bpc_fn ( , 64, 64, 10, avx512icl); } #endif #endif diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx16_avx2.asm b/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx16_avx2.asm index 115fc2070..0da970a1c 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx16_avx2.asm +++ 
b/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx16_avx2.asm @@ -91,12 +91,14 @@ pd_17408: dd 17408 ; 1024 + 16384 pixel_10bpc_max: times 2 dw 0x03ff pixel_12bpc_max: times 2 dw 0x0fff +dconly_10bpc: times 2 dw 0x7c00 +dconly_12bpc: times 2 dw 0x7000 clip_18b_min: dd -0x20000 clip_18b_max: dd 0x1ffff clip_20b_min: dd -0x80000 clip_20b_max: dd 0x7ffff -idct64_mul_16bpc: +const idct64_mul_16bpc dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017 dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799 dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276 @@ -286,12 +288,12 @@ ALIGN function_align %macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 4x4, %3 %ifidn %1_%2, dct_dct - vpbroadcastd xm3, [pixel_%3bpc_max] + vpbroadcastd xm2, [dconly_%3bpc] %if %3 = 10 .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 4 + or r3d, 4 .dconly2: add r6d, 128 sar r6d, 8 @@ -300,14 +302,13 @@ ALIGN function_align add r6d, 2176 sar r6d, 12 movd xm0, r6d + paddsw xm0, xm2 vpbroadcastw xm0, xm0 - pxor xm2, xm2 .dconly_loop: movq xm1, [dstq+strideq*0] movhps xm1, [dstq+strideq*1] - paddw xm1, xm0 - pmaxsw xm1, xm2 - pminsw xm1, xm3 + paddsw xm1, xm0 + psubusw xm1, xm2 movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 lea dstq, [dstq+strideq*2] @@ -720,12 +721,12 @@ cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 %macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 4x8, %3 %ifidn %1_%2, dct_dct - vpbroadcastd xm3, [pixel_%3bpc_max] + vpbroadcastd xm2, [dconly_%3bpc] %if %3 = 10 .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 8 + or r3d, 8 add r6d, 128 sar r6d, 8 imul r6d, 181 @@ -1216,10 +1217,10 @@ cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 %macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 4x16, %3 %ifidn %1_%2, dct_dct - vpbroadcastd xm3, [pixel_%3bpc_max] imul r6d, [cq], 
181 + vpbroadcastd xm2, [dconly_%3bpc] mov [cq], eobd ; 0 - mov r3d, 16 + or r3d, 16 add r6d, 384 sar r6d, 9 jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3 @@ -1896,12 +1897,12 @@ cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 %macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 8x4, %3 %ifidn %1_%2, dct_dct - vpbroadcastd m3, [pixel_%3bpc_max] + vpbroadcastd m2, [dconly_%3bpc] %if %3 = 10 .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 4 + or r3d, 4 add r6d, 128 sar r6d, 8 imul r6d, 181 @@ -2261,12 +2262,12 @@ cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 %macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 8x8, %3 %ifidn %1_%2, dct_dct - vpbroadcastd m3, [pixel_%3bpc_max] + vpbroadcastd m2, [dconly_%3bpc] %if %3 = 10 .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 8 + or r3d, 8 .dconly2: add r6d, 384 sar r6d, 9 @@ -2275,14 +2276,13 @@ cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 add r6d, 2176 sar r6d, 12 movd xm0, r6d + paddsw xm0, xm2 vpbroadcastw m0, xm0 - pxor m2, m2 .dconly_loop: mova xm1, [dstq+strideq*0] vinserti128 m1, [dstq+strideq*1], 1 - paddw m1, m0 - pmaxsw m1, m2 - pminsw m1, m3 + paddsw m1, m0 + psubusw m1, m2 mova [dstq+strideq*0], xm1 vextracti128 [dstq+strideq*1], m1, 1 lea dstq, [dstq+strideq*2] @@ -2798,10 +2798,10 @@ cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 %macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth INV_TXFM_FN %1, %2, %3, 8x16, %4 %ifidn %1_%2, dct_dct - vpbroadcastd m3, [pixel_%4bpc_max] imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_%4bpc] mov [cq], eobd ; 0 - mov r3d, 16 + or r3d, 16 add r6d, 128 sar r6d, 8 imul r6d, 181 @@ -3137,10 +3137,14 @@ INV_TXFM_8X16_FN identity, adst INV_TXFM_8X16_FN identity, flipadst INV_TXFM_8X16_FN identity, identity -%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394] +%macro IDTX16 3-4 ; 
src/dst, tmp, pw_1697x16, [pw_16384] pmulhrsw m%2, m%3, m%1 %if %0 == 4 ; if downshifting by 1 +%ifnum %4 pmulhrsw m%2, m%4 +%else ; without rounding + psraw m%2, 1 +%endif %else paddsw m%1, m%1 %endif @@ -3443,12 +3447,12 @@ ALIGN function_align %macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 16x4, %3 %ifidn %1_%2, dct_dct - vpbroadcastd m4, [pixel_%3bpc_max] + vpbroadcastd m3, [dconly_%3bpc] %if %3 = 10 .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 4 + or r3d, 4 .dconly2: add r6d, 384 sar r6d, 9 @@ -3457,15 +3461,13 @@ ALIGN function_align add r6d, 2176 sar r6d, 12 movd xm0, r6d + paddsw xm0, xm3 vpbroadcastw m0, xm0 - pxor m3, m3 .dconly_loop: - paddw m1, m0, [dstq+strideq*0] - paddw m2, m0, [dstq+strideq*1] - pmaxsw m1, m3 - pmaxsw m2, m3 - pminsw m1, m4 - pminsw m2, m4 + paddsw m1, m0, [dstq+strideq*0] + paddsw m2, m0, [dstq+strideq*1] + psubusw m1, m3 + psubusw m2, m3 mova [dstq+strideq*0], m1 mova [dstq+strideq*1], m2 lea dstq, [dstq+strideq*2] @@ -3963,10 +3965,10 @@ cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 %macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 16x8, %3 %ifidn %1_%2, dct_dct - vpbroadcastd m4, [pixel_%3bpc_max] imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_%3bpc] mov [cq], eobd ; 0 - mov r3d, 8 + or r3d, 8 add r6d, 128 sar r6d, 8 imul r6d, 181 @@ -4602,10 +4604,10 @@ cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 %macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth INV_TXFM_FN %1, %2, %3, 16x16, %4 %ifidn %1_%2, dct_dct - vpbroadcastd m4, [pixel_%4bpc_max] imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_%4bpc] mov [cq], eobd ; 0 - mov r3d, 16 + or r3d, 16 add r6d, 640 sar r6d, 10 jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3 @@ -5760,10 +5762,10 @@ cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob call m(idct_8x8_internal_10bpc).write_8x4 RET .dconly: - vpbroadcastd m3, 
[pixel_10bpc_max] imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 32 + or r3d, 32 add r6d, 640 sar r6d, 10 jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 @@ -6306,10 +6308,10 @@ cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, stride, c, eob call m(idct_8x8_internal_10bpc).write_8x4 RET .dconly: - vpbroadcastd m3, [pixel_12bpc_max] imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_12bpc] mov [cq], eobd ; 0 - mov r3d, 32 + or r3d, 32 add r6d, 640 sar r6d, 10 jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 @@ -6377,9 +6379,9 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jnz .full imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 8 - vpbroadcastd m4, [pixel_10bpc_max] + or r3d, 8 .dconly: add r6d, 640 sar r6d, 10 @@ -6388,15 +6390,13 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob add r6d, 2176 sar r6d, 12 movd xm0, r6d + paddsw xm0, xm3 vpbroadcastw m0, xm0 - pxor m3, m3 .dconly_loop: - paddw m1, m0, [dstq+32*0] - paddw m2, m0, [dstq+32*1] - pmaxsw m1, m3 - pmaxsw m2, m3 - pminsw m1, m4 - pminsw m2, m4 + paddsw m1, m0, [dstq+32*0] + paddsw m2, m0, [dstq+32*1] + psubusw m1, m3 + psubusw m2, m3 mova [dstq+32*0], m1 mova [dstq+32*1], m2 add dstq, strideq @@ -6513,9 +6513,9 @@ cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jnz .full imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_12bpc] mov [cq], eobd ; 0 - mov r3d, 8 - vpbroadcastd m4, [pixel_12bpc_max] + or r3d, 8 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly .full: PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob @@ -6593,10 +6593,10 @@ cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob REPX {mova [r6+32*x], m4}, 0, 1, 2, 3 jmp .fast .dconly: - vpbroadcastd m4, [pixel_10bpc_max] imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 32 + or r3d, 32 add r6d, 128 sar r6d, 8 imul r6d, 181 @@ -6841,10 
+6841,11 @@ ALIGN function_align ret cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 12, dst, stride, c, eob + vpbroadcastd m7, [pixel_10bpc_max] +.pass1: vpbroadcastd m8, [pw_2896x8] vpbroadcastd m9, [pw_1697x16] vpbroadcastd m11, [pw_8192] - vpbroadcastd m7, [pixel_10bpc_max] lea r6, [strideq*5] pxor m6, m6 paddw m10, m11, m11 ; pw_16384 @@ -6914,11 +6915,15 @@ ALIGN function_align punpckhqdq m1, m3, m2 jmp m(iidentity_8x8_internal_10bpc).write_2x8x2 +cglobal inv_txfm_add_identity_identity_16x32_12bpc, 4, 7, 12, dst, stride, c, eob + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(inv_txfm_add_identity_identity_16x32_10bpc).pass1 + cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob - %undef cmp +%undef cmp vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] lea r6, [rsp+32*4] @@ -6946,14 +6951,14 @@ cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob jmp .end .dconly: imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 16 + or r3d, 16 add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 384 sar r6d, 9 - vpbroadcastd m4, [pixel_10bpc_max] jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 .full: add cq, 32 @@ -7140,10 +7145,11 @@ ALIGN function_align jmp m(idct_16x8_internal_10bpc).write_16x4 cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 11, dst, stride, c, eob + vpbroadcastd m7, [pixel_10bpc_max] +.pass1: vpbroadcastd m8, [pw_2896x8] vpbroadcastd m9, [pw_1697x16] - vpbroadcastd m10, [pw_2048] - vpbroadcastd m7, [pixel_10bpc_max] + vpbroadcastd m10, [pw_4096] lea r6, [strideq*5] pxor m6, m6 mov r5, dstq @@ -7191,16 +7197,20 @@ ALIGN function_align packssdw m3, [cq+64*7] REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 REPX {paddsw x, x }, m0, m1, m2, m3 - REPX {IDTX16 x, 4, 9 }, 0, 1, 2, 3 + REPX {IDTX16 x, 4, 9, _ }, 0, 1, 2, 3 REPX {pmulhrsw x, m10}, m0, m1, m2, m3 REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 
6, 7 jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2 +cglobal inv_txfm_add_identity_identity_32x16_12bpc, 4, 7, 11, dst, stride, c, eob + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(inv_txfm_add_identity_identity_32x16_10bpc).pass1 + cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 8, 16, 32*83, dst, stride, c, eob - %undef cmp +%undef cmp vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] lea r6, [rsp+32*7] @@ -7217,9 +7227,9 @@ cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob jmp .pass2 .dconly: imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 32 - vpbroadcastd m4, [pixel_10bpc_max] + or r3d, 32 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly .fast: lea r4, [rsp+32*71] @@ -7368,9 +7378,10 @@ ALIGN function_align jmp m(idct_16x16_internal_8bpc).main cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 8, 8, dst, stride, c, eob - %undef cmp - vpbroadcastd m5, [pw_8192] +%undef cmp vpbroadcastd m7, [pixel_10bpc_max] +.pass1: + vpbroadcastd m5, [pw_8192] pxor m6, m6 lea r6, [strideq*3] lea r5, [strideq*5] @@ -7436,6 +7447,10 @@ ALIGN function_align REPX {pmulhrsw x, m5}, m0, m1, m2, m3 jmp m(inv_txfm_add_identity_identity_8x32_10bpc).main_zero +cglobal inv_txfm_add_identity_identity_32x32_12bpc, 4, 8, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(inv_txfm_add_identity_identity_32x32_10bpc).pass1 + %macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) %if %1 & 1 mova m%5, [r5-32*(51-%1)] ; idct16 out 0+n @@ -7476,7 +7491,7 @@ cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob - %undef cmp +%undef cmp vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -7494,10 +7509,10 @@ cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob call .main jmp 
.pass2 .dconly: - vpbroadcastd m4, [pixel_10bpc_max] imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 64 + or r3d, 64 add r6d, 640 sar r6d, 10 jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3 @@ -7818,7 +7833,7 @@ cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob - %undef cmp +%undef cmp vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] lea r6, [rsp+32*6] @@ -7835,14 +7850,14 @@ cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob jmp .pass2 .dconly: imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 - mov r3d, 64 + or r3d, 64 add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 384 sar r6d, 9 - vpbroadcastd m4, [pixel_10bpc_max] jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 .fast: lea r4, [rsp+32*70] @@ -8019,28 +8034,24 @@ cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob jnz .normal imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 16 + or r3d, 16 .dconly: add r6d, 640 sar r6d, 10 .dconly2: + vpbroadcastd m5, [dconly_10bpc] imul r6d, 181 add r6d, 2176 sar r6d, 12 movd xm0, r6d -%if WIN64 - movaps [rsp+8], xmm6 -%endif + paddsw xm0, xm5 vpbroadcastw m0, xm0 - vpbroadcastd m6, [pixel_10bpc_max] - pxor m5, m5 .dconly_loop: - paddw m1, m0, [dstq+32*0] - paddw m2, m0, [dstq+32*1] - paddw m3, m0, [dstq+32*2] - paddw m4, m0, [dstq+32*3] - REPX {pmaxsw x, m5}, m1, m2, m3, m4 - REPX {pminsw x, m6}, m1, m2, m3, m4 + paddsw m1, m0, [dstq+32*0] + paddsw m2, m0, [dstq+32*1] + paddsw m3, m0, [dstq+32*2] + paddsw m4, m0, [dstq+32*3] + REPX {psubusw x, m5}, m1, m2, m3, m4 mova [dstq+32*0], m1 mova [dstq+32*1], m2 mova [dstq+32*2], m3 @@ -8048,13 +8059,10 @@ cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob add dstq, strideq dec r3d jg .dconly_loop -%if WIN64 - movaps xmm6, [rsp+8] -%endif RET .normal: PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob - %undef cmp 
+%undef cmp vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -8273,7 +8281,7 @@ cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob - %undef cmp +%undef cmp vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -8293,7 +8301,7 @@ cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 32 + or r3d, 32 add r6d, 128 sar r6d, 8 imul r6d, 181 @@ -8422,7 +8430,7 @@ cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob - %undef cmp +%undef cmp vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -8442,7 +8450,7 @@ cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 64 + or r3d, 64 jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly .fast: pxor m0, m0 diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx16_avx512.asm b/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx16_avx512.asm index 9b615fd0b..9f5f909a5 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx16_avx512.asm +++ b/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx16_avx512.asm @@ -1,5 +1,5 @@ -; Copyright © 2022, VideoLAN and dav1d authors -; Copyright © 2022, Two Orioles, LLC +; Copyright © 2022-2023, VideoLAN and dav1d authors +; Copyright © 2022-2023, Two Orioles, LLC ; All rights reserved. 
; ; Redistribution and use in source and binary forms, with or without @@ -58,12 +58,24 @@ permC: db 0, 9, 0, 0, 0, 1, 4, 4, 2, 11, 2, 2, 2, 3, 6, 6 db 1, 8, 1, 8, 4, 5, 5, 12, 3, 10, 3, 10, 6, 7, 7, 14 db 9, 1, 8, 1, 1, 0, 12, 5, 11, 3, 10, 3, 3, 2, 14, 7 db 8, 0, 9, 9, 5, 4, 13, 13, 10, 2, 11, 11, 7, 6, 15, 15 +idct8x32p: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53 + db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61 + db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55 + db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63 +idct32x8p: db 2, 18, 0, 16, 3, 19, 1, 17, 10, 26, 8, 24, 11, 27, 9, 25 + db 34, 50, 32, 48, 35, 51, 33, 49, 42, 58, 40, 56, 43, 59, 41, 57 + db 6, 22, 4, 20, 7, 23, 5, 21, 14, 30, 12, 28, 15, 31, 13, 29 + db 38, 54, 36, 52, 39, 55, 37, 53, 46, 62, 44, 60, 47, 63, 45, 61 +idtx32x8p: db 0, 8, 16, 24, 4, 12, 20, 28, 2, 10, 18, 26, 6, 14, 22, 30 + db 32, 40, 48, 56, 36, 44, 52, 60, 34, 42, 50, 58, 38, 46, 54, 62 + db 1, 9, 17, 25, 5, 13, 21, 29, 3, 11, 19, 27, 7, 15, 23, 31 + db 33, 41, 49, 57, 37, 45, 53, 61, 35, 43, 51, 59, 39, 47, 55, 63 pw_2048_m2048: times 16 dw 2048 pw_m2048_2048: times 16 dw -2048 pw_2048: times 16 dw 2048 -; flags: 0 = ++, 1 = +-, 2 = -+, 3 = ++- +; flags: 0 = ++, 1 = +-, 2 = -+, 3 = ++-, 4=-- %macro COEF_PAIR 2-3 0 ; a, b, flags %if %3 == 1 pd_%1_m%2: dd %1, %1, -%2, -%2 @@ -73,6 +85,10 @@ pd_%1_m%2: dd %1, %1, -%2, -%2 pd_m%1_%2: dd -%1, -%1, %2, %2 %define pd_m%1 (pd_m%1_%2 + 4*0) %define pd_%2 (pd_m%1_%2 + 4*2) +%elif %3 == 4 +pd_m%1_m%2: dd -%1, -%1, -%2, -%2 +%define pd_m%1 (pd_m%1_m%2 + 4*0) +%define pd_m%2 (pd_m%1_m%2 + 4*2) %else pd_%1_%2: dd %1, %1, %2, %2 %define pd_%1 (pd_%1_%2 + 4*0) @@ -84,12 +100,17 @@ dd -%2, -%2 %endif %endmacro +COEF_PAIR 101, 501 +COEF_PAIR 201, 601, 1 COEF_PAIR 201, 995 COEF_PAIR 401, 1189, 1 COEF_PAIR 401, 1931 COEF_PAIR 401, 3920 +COEF_PAIR 401, 4076 +COEF_PAIR 700, 301, 4 COEF_PAIR 799, 2276, 1 COEF_PAIR 799, 3406 +COEF_PAIR 799, 
4017 COEF_PAIR 1380, 601 COEF_PAIR 1751, 2440 COEF_PAIR 2598, 1189 @@ -106,15 +127,22 @@ COEF_PAIR 3703, 3290 COEF_PAIR 3857, 4052 COEF_PAIR 4017, 2276 COEF_PAIR 4017, 3406 +COEF_PAIR 4036, 4085 COEF_PAIR 4076, 1189 COEF_PAIR 4076, 3612 COEF_PAIR 4076, 3920 COEF_PAIR 4091, 3973 +COEF_PAIR 4091, 4052 +COEF_PAIR 4095, 4065 -pw_4096 times 2 dw 4096 +pb_32: times 4 db 32 +pw_5: times 2 dw 5 +pw_4096: times 2 dw 4096 +pw_8192: times 2 dw 8192 pw_1697x16: times 2 dw 1697*16 pw_2896x8: times 2 dw 2896*8 pixel_10bpc_max: times 2 dw 0x03ff +dconly_10bpc: times 2 dw 0x7c00 clip_18b_min: dd -0x20000 clip_18b_max: dd 0x1ffff pd_1: dd 1 @@ -127,15 +155,45 @@ pd_5119: dd 5119 ; 1024 + 4096 - 1 pd_5120: dd 5120 ; 1024 + 4096 pd_5793: dd 5793 +cextern dup16_perm cextern int8_permA +cextern idct64_mul_16bpc cextern idct_8x8_internal_8bpc_avx512icl.main cextern iadst_8x8_internal_8bpc_avx512icl.main_pass2 +cextern idct_8x16_internal_8bpc_avx512icl.main cextern idct_8x16_internal_8bpc_avx512icl.main2 +cextern idct_8x16_internal_8bpc_avx512icl.main_fast +cextern idct_8x16_internal_8bpc_avx512icl.main_fast2 cextern iadst_8x16_internal_8bpc_avx512icl.main2 cextern idct_16x8_internal_8bpc_avx512icl.main cextern iadst_16x8_internal_8bpc_avx512icl.main_pass2 cextern idct_16x16_internal_8bpc_avx512icl.main +cextern idct_16x16_internal_8bpc_avx512icl.main2 +cextern idct_16x16_internal_8bpc_avx512icl.main_fast +cextern idct_16x16_internal_8bpc_avx512icl.main_fast2 cextern iadst_16x16_internal_8bpc_avx512icl.main_pass2b +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast2 +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_end +cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf +cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast +cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast2 +cextern 
inv_txfm_add_dct_dct_32x8_8bpc_avx512icl.main +cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf +cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast +cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast2 +cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast3 +cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf +cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast +cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast2 +cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast3 +cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf +cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf_fast +cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1 +cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1_fast +cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1_fast2 +cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part2 SECTION .text @@ -224,31 +282,26 @@ ALIGN function_align %ifidn %1_%2, dct_dct imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 8 + or r3d, 8 .dconly: add r6d, 384 sar r6d, 9 +.dconly2: + vpbroadcastd ym2, [o(dconly_10bpc)] imul r6d, 181 - vpbroadcastd m3, [o(pixel_10bpc_max)] - lea r2, [strideq*3] add r6d, 2176 sar r6d, 12 - vpbroadcastw m1, r6d - pxor m2, m2 + vpbroadcastw ym1, r6d + paddsw ym1, ym2 .dconly_loop: mova xm0, [dstq+strideq*0] vinserti32x4 ym0, [dstq+strideq*1], 1 - vinserti32x4 m0, [dstq+strideq*2], 2 - vinserti32x4 m0, [dstq+r2 ], 3 - paddw m0, m1 - pmaxsw m0, m2 - pminsw m0, m3 + paddsw ym0, ym1 + psubusw ym0, ym2 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 - vextracti32x4 [dstq+strideq*2], m0, 2 - vextracti32x4 [dstq+r2 ], m0, 3 - lea dstq, [dstq+strideq*4] - sub r3d, 4 + lea dstq, [dstq+strideq*2] + sub r3d, 2 jg .dconly_loop RET %endif @@ -327,7 +380,7 @@ ALIGN function_align vpbroadcastd m11, [o(pd_1)] ret ALIGN function_align -.main_fast: +.main_fast: ; 
bottom half is zero vbroadcasti32x4 m3, [o(pd_4017_3406)] vbroadcasti32x4 m8, [o(pd_799_m2276)] vbroadcasti32x4 m2, [o(pd_2896_3784)] @@ -356,6 +409,7 @@ ALIGN function_align psubd m2, m1 ; dct4 out3 out2 REPX {pmaxsd x, m14}, m8, m0, m2 REPX {pminsd x, m15}, m8, m0, m2 +.main3: pshufd m1, m3, q1032 paddd m3, m13 psubd m9, m3, m1 @@ -372,7 +426,7 @@ ALIGN function_align paddd m0, m1 ; out0 out1 paddd m1, m2, m8 ; out3 out2 psubd m2, m8 ; out4 out5 - REPX {psrad x, 1}, m0, m2, m3, m1 + REPX {vpsravd x, m11}, m0, m2, m3, m1 ret INV_TXFM_8X8_FN adst, dct @@ -519,7 +573,7 @@ cglobal iidentity_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %ifidn %1_%2, dct_dct imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 16 + or r3d, 16 add r6d, 128 sar r6d, 8 imul r6d, 181 @@ -539,7 +593,6 @@ cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 call .load call .main call .main_end - REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 .pass1_end: packssdw m0, m4 packssdw m1, m5 @@ -634,38 +687,84 @@ ALIGN function_align pmulld m5, m12, [cq+64*5] pmulld m6, m12, [cq+64*6] pmulld m7, m12, [cq+64*7] - REPX {paddd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 - REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.round: + REPX {paddd x, m13}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 + REPX {paddd x, m13}, m4, m5, m6, m7 + REPX {psrad x, 12 }, m4, m5, m6, m7 ret ALIGN function_align +.main_fast2_rect2: + REPX {paddd x, m13}, m0, m1 + REPX {psrad x, 12 }, m0, m1 +.main_fast2: + pmulld m0, m12 + pmulld m6, m1, [o(pd_4017)] {1to16} ; t7a + pmulld m8, m1, [o(pd_799)] {1to16} ; t4a + REPX {paddd x, m13}, m0, m6, m8 + REPX {psrad x, 12 }, m0, m6, m8 + pmulld m5, m6, m12 + pmulld m1, m8, m12 + paddd m5, m13 + psubd m4, m5, m1 + paddd m5, m1 + REPX {psrad x, 12 }, m4, m5 + REPX {mova x, m0 }, m1, m2, m3 + ret +.main_fast_rect2: + REPX {paddd x, m13}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_fast: + pmulld m0, m12 + pmulld m5, m3, [o(pd_2276)] 
{1to16} ; t5a + pmulld m3, [o(pd_3406)] {1to16} ; t6a + pmulld m7, m1, [o(pd_4017)] {1to16} ; t7a + pmulld m1, [o(pd_799)] {1to16} ; t4a + pmulld m6, m2, [o(pd_3784)] {1to16} ; t3 + pmulld m2, [o(pd_1567)] {1to16} ; t2 + paddd m0, m13 + psubd m5, m13, m5 + psrad m0, 12 ; t0 + mova m9, m0 ; t1 + jmp .main2 +.main_rect2: + call .round .main: - ITX_MULSUB_2D 5, 3, 8, 9, 10, 13, 3406, 2276 ; t5a t6a - ITX_MULSUB_2D 1, 7, 8, 9, 10, 13, 799, 4017 ; t4a t7a pmulld m0, m12 + ITX_MULSUB_2D 5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a + ITX_MULSUB_2D 1, 7, 8, 9, 10, _, 799, 4017 ; t4a t7a + ITX_MULSUB_2D 2, 6, 8, 9, 10, _, 1567, 3784 ; t2 t3 pmulld m4, m12 + paddd m0, m13 + paddd m5, m13 + psubd m9, m0, m4 ; t1 + paddd m0, m4 ; t0 + psrad m9, 12 + psrad m0, 12 +.main2: + REPX {paddd x, m13}, m3, m1, m7 + REPX {psrad x, 12 }, m5, m1, m3, m7 paddd m8, m1, m5 ; t4 psubd m1, m5 ; t5a psubd m5, m7, m3 ; t6a paddd m7, m3 ; t7 pmaxsd m5, m14 pmaxsd m1, m14 + paddd m2, m13 + paddd m6, m13 pminsd m5, m15 pminsd m1, m15 pmulld m5, m12 pmulld m1, m12 - ITX_MULSUB_2D 2, 6, 3, 9, 10, 13, 1567, 3784 ; t2 t3 pmaxsd m8, m14 pmaxsd m7, m14 - paddd m0, m13 pminsd m8, m15 - psubd m3, m0, m4 paddd m5, m13 - paddd m0, m4 psubd m4, m5, m1 paddd m5, m1 - REPX {psrad x, 12 }, m3, m5, m0, m4 - paddd m1, m3, m2 ; dct4 out1 - psubd m2, m3, m2 ; dct4 out2 + REPX {psrad x, 12 }, m2, m6, m5, m4 + paddd m1, m9, m2 ; dct4 out1 + psubd m2, m9, m2 ; dct4 out2 psubd m3, m0, m6 ; dct4 out3 paddd m0, m6 ; dct4 out0 pminsd m6, m15, m7 @@ -673,9 +772,9 @@ ALIGN function_align REPX {pminsd x, m15}, m0, m1, m2, m3 ret .main_end: - vpbroadcastd m7, [o(pd_1)] + vpbroadcastd m11, [o(pd_1)] .main_end2: - REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {paddd x, m11}, m0, m1, m2, m3 psubd m7, m0, m6 ; out7 paddd m0, m6 ; out0 psubd m6, m1, m5 ; out6 @@ -684,6 +783,7 @@ ALIGN function_align paddd m2, m4 ; out2 psubd m4, m3, m8 ; out4 paddd m3, m8 ; out3 + REPX {vpsravd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 ret INV_TXFM_8X16_FN 
adst, dct @@ -942,25 +1042,25 @@ cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %ifidn %1_%2, dct_dct imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 8 + or r3d, 8 +.dconly: add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 384 sar r6d, 9 -.dconly: +.dconly2: + vpbroadcastd m2, [o(dconly_10bpc)] imul r6d, 181 - vpbroadcastd m3, [o(pixel_10bpc_max)] add r6d, 2176 sar r6d, 12 vpbroadcastw m1, r6d - pxor m2, m2 + paddsw m1, m2 .dconly_loop: mova ym0, [dstq+strideq*0] vinserti32x8 m0, [dstq+strideq*1], 1 - paddw m0, m1 - pmaxsw m0, m2 - pminsw m0, m3 + paddsw m0, m1 + psubusw m0, m2 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] @@ -1026,42 +1126,39 @@ cglobal idct_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 mova m8, [o(permA)] psrlq m9, m8, 8 .pass1_end2: - REPX {psrad x, 1 }, m0, m4, m1, m5, m2, m6, m3, m7 mova m10, m9 mova m11, m8 -.pass1_end3: call .transpose_16x8 jmp tx2q .pass2: lea r5, [o_base_8bpc] call m(idct_16x8_internal_8bpc).main movshdup m4, [permC] - vpbroadcastd m13, [pw_2048] + vpbroadcastd m11, [pw_2048] psrlq m5, m4, 8 - vpermq m0, m4, m0 - vpermq m1, m5, m1 - vpermq m2, m4, m2 - vpermq m3, m5, m3 .end: - vpbroadcastd m15, [pixel_10bpc_max] - pxor m14, m14 - pmulhrsw m8, m13, m0 - pmulhrsw m9, m13, m1 + vpbroadcastd m13, [pixel_10bpc_max] + pxor m12, m12 + vpermq m8, m4, m0 + vpermq m9, m5, m1 lea r6, [strideq*3] call .write_16x4 - pmulhrsw m8, m13, m2 - pmulhrsw m9, m13, m3 + vpermq m8, m4, m2 + vpermq m9, m5, m3 .write_16x4: + pmulhrsw m8, m11 + pmulhrsw m9, m11 +.write_16x4_noround: mova ym10, [dstq+strideq*0] vinserti32x8 m10, [dstq+strideq*1], 1 paddw m8, m10 mova ym10, [dstq+strideq*2] vinserti32x8 m10, [dstq+r6 ], 1 paddw m9, m10 - pmaxsw m8, m14 - pmaxsw m9, m14 - pminsw m8, m15 - pminsw m9, m15 + pmaxsw m8, m12 + pmaxsw m9, m12 + pminsw m8, m13 + pminsw m9, m13 mova [dstq+strideq*0], ym8 vextracti32x8 [dstq+strideq*1], m8, 1 mova [dstq+strideq*2], ym9 @@ 
-1069,48 +1166,49 @@ cglobal idct_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 lea dstq, [dstq+strideq*4] ret ALIGN function_align -.main_fast: - vbroadcasti32x4 m6, [o(pd_4076_3920)] - vbroadcasti32x4 m3, [o(pd_401_m1189)] - vbroadcasti32x4 m5, [o(pd_m2598_1931)] - vbroadcasti32x4 m9, [o(pd_3166_3612)] - pmulld m6, m4 ; t15a t12a - pmulld m4, m3 ; t8a t11a - pmulld m5, m7 ; t9a t10a - pmulld m7, m9 ; t14a t13a +.main_fast: ; bottom half is zero + vbroadcasti32x4 m6, [o(pd_4076_3920)] + vbroadcasti32x4 m3, [o(pd_401_m1189)] + vbroadcasti32x4 m5, [o(pd_m2598_1931)] + vbroadcasti32x4 m9, [o(pd_3166_3612)] + pmulld m6, m4 ; t15a t12a + pmulld m4, m3 ; t8a t11a + pmulld m5, m7 ; t9a t10a + pmulld m7, m9 ; t14a t13a jmp .main2 .main: ITX_MULSUB_2D 4, 6, 3, 9, 10, _, 401_3920, 4076_1189 ITX_MULSUB_2D 5, 7, 3, 9, 10, _, 3166_1931, 2598_3612 .main2: - psubd m3, m0, m1 ; dct8 out7 out6 - paddd m0, m1 ; dct8 out0 out1 - paddd m1, m2, m8 ; dct8 out3 out2 - psubd m2, m8 ; dct8 out4 out5 REPX {paddd x, m13}, m4, m6, m5, m7 REPX {psrad x, 12 }, m4, m5, m6, m7 - paddd m8, m4, m5 ; t8 t11 + paddd m9, m4, m5 ; t8 t11 psubd m4, m5 ; t9 t10 psubd m5, m6, m7 ; t14 t13 paddd m6, m7 ; t15 t12 - REPX {pmaxsd x, m14}, m5, m4 - REPX {pminsd x, m15}, m5, m4 - vbroadcasti32x4 m7, [pd_3784_m3784] + REPX {pmaxsd x, m14}, m5, m4, m9, m6 + REPX {pminsd x, m15}, m5, m4, m9, m6 +.main3: + psubd m3, m0, m1 ; dct8 out7 out6 + paddd m0, m1 ; dct8 out0 out1 + vbroadcasti32x4 m7, [o(pd_3784_m3784)] pmulld m7, m5 - vpmulld m5, [pd_1567] {1to16} - vbroadcasti32x4 m9, [pd_1567_m1567] - pmulld m9, m4 - vpmulld m4, [pd_3784] {1to16} - REPX {pmaxsd x, m14}, m0, m1, m8, m6 - REPX {pminsd x, m15}, m0, m1, m8, m6 + vpmulld m5, [o(pd_1567)] {1to16} + paddd m1, m2, m8 ; dct8 out3 out2 + psubd m2, m8 ; dct8 out4 out5 + vbroadcasti32x4 m8, [o(pd_1567_m1567)] + pmulld m8, m4 + vpmulld m4, [o(pd_3784)] {1to16} + REPX {pmaxsd x, m14}, m0, m1 + REPX {pminsd x, m15}, m0, m1 paddd m7, m13 paddd m5, m13 - paddd 
m7, m9 + paddd m7, m8 psubd m5, m4 psrad m7, 12 ; t14a t10a psrad m5, 12 ; t9a t13a - punpckhqdq m4, m8, m7 - punpcklqdq m8, m5 + punpckhqdq m4, m9, m7 + punpcklqdq m8, m9, m5 punpckhqdq m5, m6, m5 punpcklqdq m6, m7 psubd m7, m8, m4 ; t11a t10 @@ -1166,17 +1264,19 @@ cglobal iadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 psubd m5, m9, m6 paddd m6, m9, m7 psubd m7, m9, m8 +.pass1_end: mova m9, [o(permA)] psrlq m8, m9, 8 + REPX {psrad x, 1}, m0, m4, m1, m5, m2, m6, m3, m7 jmp m(idct_16x8_internal_10bpc).pass1_end2 .pass2: call .main_pass2 - vpermq m8, m13, m0 - vpermq m9, m13, m1 - call m(idct_16x8_internal_10bpc).write_16x4 - vpermq m8, m13, m2 - vpermq m9, m13, m3 - jmp m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m11, m0 + vpermq m9, m11, m1 + call m(idct_16x8_internal_10bpc).write_16x4_noround + vpermq m8, m11, m2 + vpermq m9, m11, m3 + jmp m(idct_16x8_internal_10bpc).write_16x4_noround ALIGN function_align .main_pass1: vpbroadcastd m12, [o(pd_2896)] @@ -1310,11 +1410,11 @@ ALIGN function_align pshufd m4, m0, q1032 pshufd m5, m1, q1032 call m(iadst_16x8_internal_8bpc).main_pass2 - movshdup m13, [permC] + movshdup m11, [permC] pmulhrsw m0, m6 pmulhrsw m1, m6 - vpbroadcastd m15, [pixel_10bpc_max] - pxor m14, m14 + vpbroadcastd m13, [pixel_10bpc_max] + pxor m12, m12 lea r6, [strideq*3] ret @@ -1334,18 +1434,16 @@ cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 paddd m1, m9, m7 paddd m7, m9, m0 psubd m0, m9, m8 - mova m9, [o(permA)] - psrlq m8, m9, 8 - jmp m(idct_16x8_internal_10bpc).pass1_end2 + jmp m(iadst_16x8_internal_10bpc).pass1_end .pass2: call m(iadst_16x8_internal_10bpc).main_pass2 - psrlq m13, 8 - vpermq m8, m13, m3 - vpermq m9, m13, m2 - call m(idct_16x8_internal_10bpc).write_16x4 - vpermq m8, m13, m1 - vpermq m9, m13, m0 - jmp m(idct_16x8_internal_10bpc).write_16x4 + psrlq m11, 8 + vpermq m8, m11, m3 + vpermq m9, m11, m2 + call m(idct_16x8_internal_10bpc).write_16x4_noround + vpermq m8, m11, m1 + vpermq m9, 
m11, m0 + jmp m(idct_16x8_internal_10bpc).write_16x4_noround INV_TXFM_16X8_FN identity, dct INV_TXFM_16X8_FN identity, adst @@ -1355,22 +1453,21 @@ INV_TXFM_16X8_FN identity, identity cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 call m(idct_8x16_internal_10bpc).load2 vpbroadcastd m8, [o(pd_5793)] - vpbroadcastd m9, [o(pd_3072)] + vpbroadcastd m13, [o(pd_3072)] pxor m10, m10 REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 - REPX {mova [cq+64*x], m10}, 0, 1, 2, 3 - REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 - REPX {mova [cq+64*x], m10}, 4, 5, 6, 7 - REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {mova [cq+64*x], m10}, 0, 1, 2, 3, 4, 5, 6, 7 + call m(idct_8x16_internal_10bpc).round psrlq m8, [o(permA)], 16 psrlq m9, m8, 8 mova m10, m8 mova m11, m9 - jmp m(idct_16x8_internal_10bpc).pass1_end3 + call m(idct_16x8_internal_10bpc).transpose_16x8 + jmp tx2q .pass2: movshdup m4, [o(permC)] - vpbroadcastd m13, [o(pw_4096)] - REPX {vpermq x, m4, x}, m0, m1, m2, m3 + vpbroadcastd m11, [o(pw_4096)] + mova m5, m4 jmp m(idct_16x8_internal_10bpc).end %macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset @@ -1378,10 +1475,10 @@ cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %ifidn %1_%2, dct_dct imul r6d, [cq], 181 mov [cq], eobd ; 0 - mov r3d, 16 + or r3d, 16 add r6d, 640 sar r6d, 10 - jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly + jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2 %endif %endmacro @@ -1422,52 +1519,13 @@ cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 call .main call .main_end .pass1_end: - packssdw m0, m16 - packssdw m1, m17 - packssdw m2, m18 - packssdw m3, m19 - packssdw m4, m20 - packssdw m5, m21 - packssdw m6, m22 - packssdw m7, m23 %if WIN64 movaps xmm6, [cq+16*0] movaps xmm7, [cq+16*1] %endif vzeroupper .pass1_end2: - punpckhwd m8, m0, m1 - punpcklwd m0, m1 - punpckhwd m1, m2, m3 - punpcklwd m2, m3 - punpckhwd m3, m4, m5 - punpcklwd m4, m5 - punpcklwd m5, 
m6, m7 - punpckhwd m6, m7 - punpckhdq m7, m0, m2 - punpckldq m0, m2 - punpckhdq m2, m8, m1 - punpckldq m8, m1 - punpckhdq m1, m4, m5 - punpckldq m4, m5 - punpckhdq m5, m3, m6 - punpckldq m3, m6 - vshufi32x4 m6, m0, m4, q3232 - vinserti32x8 m0, ym4, 1 - vinserti32x8 m4, m8, ym3, 1 - vshufi32x4 m8, m3, q3232 - vinserti32x8 m3, m7, ym1, 1 - vshufi32x4 m7, m1, q3232 - vshufi32x4 m1, m2, m5, q3232 - vinserti32x8 m2, ym5, 1 - vshufi32x4 m5, m7, m1, q2020 ; 10 11 - vshufi32x4 m7, m1, q3131 ; 14 15 - vshufi32x4 m1, m3, m2, q2020 ; 2 3 - vshufi32x4 m3, m2, q3131 ; 6 7 - vshufi32x4 m2, m0, m4, q3131 ; 4 5 - vshufi32x4 m0, m4, q2020 ; 0 1 - vshufi32x4 m4, m6, m8, q2020 ; 8 9 - vshufi32x4 m6, m8, q3131 ; 12 13 + call .main_end3 .pass1_end3: mov r6d, 64*12 pxor m8, m8 @@ -1482,33 +1540,33 @@ cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 .pass2: lea r5, [o_base_8bpc] call m(idct_16x16_internal_8bpc).main - movshdup m10, [permC] - vpbroadcastd m13, [pw_2048] - psrlq m11, m10, 8 - vpermq m8, m10, m0 - vpermq m0, m11, m7 - vpermq m7, m11, m1 - vpermq m1, m10, m6 - vpermq m6, m10, m2 - vpermq m2, m11, m5 - vpermq m5, m11, m3 - vpermq m3, m10, m4 + movshdup m12, [permC] + vpbroadcastd m11, [pw_2048] + psrlq m13, m12, 8 + vpermq m8, m12, m0 + vpermq m0, m13, m7 + vpermq m7, m13, m1 + vpermq m1, m12, m6 + vpermq m6, m12, m2 + vpermq m2, m13, m5 + vpermq m5, m13, m3 + vpermq m3, m12, m4 .pass2_end: lea r6, [strideq*3] - vpbroadcastd m15, [pixel_10bpc_max] - pxor m14, m14 - pmulhrsw m8, m13, m8 - pmulhrsw m9, m13, m7 - call m(idct_16x8_internal_10bpc).write_16x4 - pmulhrsw m8, m13, m6 - pmulhrsw m9, m13, m5 - call m(idct_16x8_internal_10bpc).write_16x4 - pmulhrsw m8, m13, m3 - pmulhrsw m9, m13, m2 - call m(idct_16x8_internal_10bpc).write_16x4 - pmulhrsw m8, m13, m1 - pmulhrsw m9, m13, m0 - jmp m(idct_16x8_internal_10bpc).write_16x4 + vpbroadcastd m13, [pixel_10bpc_max] + pxor m12, m12 + pmulhrsw m8, m11, m8 + pmulhrsw m9, m11, m7 + call 
m(idct_16x8_internal_10bpc).write_16x4_noround + pmulhrsw m8, m11, m6 + pmulhrsw m9, m11, m5 + call m(idct_16x8_internal_10bpc).write_16x4_noround + pmulhrsw m8, m11, m3 + pmulhrsw m9, m11, m2 + call m(idct_16x8_internal_10bpc).write_16x4_noround + pmulhrsw m8, m11, m1 + pmulhrsw m9, m11, m0 + jmp m(idct_16x8_internal_10bpc).write_16x4_noround .fast: mova ym0, [cq+64*0] mova ym2, [cq+64*4] @@ -1525,17 +1583,54 @@ cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 vpermt2q m7, m8, m6 ; 7 5 call m(idct_8x8_internal_10bpc).main_fast call m(idct_16x8_internal_10bpc).main_fast - vpbroadcastd m7, [o(pd_2)] + vpbroadcastd m11, [o(pd_2)] call m(idct_8x16_internal_10bpc).main_end2 mova m8, [o(permA)] psrlq m9, m8, 8 jmp m(iadst_16x16_internal_10bpc).pass1_fast_end2 ALIGN function_align +.main_fast2_rect2: + REPX {paddd x, m13}, m16, m17 + REPX {psrad x, 12 }, m16, m17 +.main_fast2: + pmulld m22, m16, [o(pd_4076)] {1to16} ; t15a + pmulld m9, m16, [o(pd_401)] {1to16} ; t8a + pmulld m18, m17, [o(pd_1189)] {1to16} ; t11a + pmulld m17, [o(pd_3920)] {1to16} ; t12a + psubd m18, m13, m18 + REPX {paddd x, m13}, m22, m9, m17 + REPX {psrad x, 12 }, m18, m22, m9, m17 + + mova m20, m9 + mova m16, m18 + mova m23, m22 + mova m19, m17 + jmp .main3 +.main_fast_rect2: + REPX {paddd x, m13}, m16, m17, m18, m19 + REPX {psrad x, 12 }, m16, m17, m18, m19 +.main_fast: + pmulld m23, m16, [o(pd_4076)] {1to16} ; t15a + pmulld m16, [o(pd_401)] {1to16} ; t8a + pmulld m20, m19, [o(pd_2598)] {1to16} ; t9a + pmulld m19, [o(pd_3166)] {1to16} ; t14a + pmulld m22, m17, [o(pd_1189)] {1to16} ; t11a + pmulld m17, [o(pd_3920)] {1to16} ; t12a + pmulld m21, m18, [o(pd_3612)] {1to16} ; t13a + pmulld m18, [o(pd_1931)] {1to16} ; t10a + psubd m20, m13, m20 + psubd m22, m13, m22 + call .round2 + jmp .main2 +.main_rect2: + call .round .main: - ITX_MULSUB_2D 16, 23, 7, 9, 10, 13, 401, 4076 ; t8a, t15a - ITX_MULSUB_2D 20, 19, 7, 9, 10, 13, 3166, 2598 ; t9a, t14a - ITX_MULSUB_2D 22, 17, 7, 9, 10, 13, 
3920, 1189 ; t11a, t12a - ITX_MULSUB_2D 18, 21, 7, 9, 10, 13, 1931, 3612 ; t10a, t13a + ITX_MULSUB_2D 16, 23, 7, 9, 10, _, 401, 4076 ; t8a, t15a + ITX_MULSUB_2D 20, 19, 7, 9, 10, _, 3166, 2598 ; t9a, t14a + ITX_MULSUB_2D 22, 17, 7, 9, 10, _, 3920, 1189 ; t11a, t12a + ITX_MULSUB_2D 18, 21, 7, 9, 10, _, 1931, 3612 ; t10a, t13a + call .round +.main2: paddd m9, m20, m16 ; t8 psubd m20, m16, m20 ; t9 psubd m16, m22, m18 ; t10 @@ -1544,14 +1639,15 @@ ALIGN function_align psubd m23, m19 ; t14 psubd m19, m17, m21 ; t13 paddd m17, m21 ; t12 - vpbroadcastd m11, [o(pd_3784)] REPX {pmaxsd x, m14}, m20, m23, m16, m19 - vpbroadcastd m10, [o(pd_1567)] REPX {pminsd x, m15}, m20, m23, m16, m19 - ITX_MULSUB_2D 23, 20, 21, 7, _, 13, 10, 11 - ITX_MULSUB_2D 19, 16, 21, 7, _, 13, 10, 11, 2 REPX {pmaxsd x, m14}, m9, m18, m22, m17 REPX {pminsd x, m15}, m9, m18, m22, m17 +.main3: + vpbroadcastd m11, [o(pd_3784)] + vpbroadcastd m10, [o(pd_1567)] + ITX_MULSUB_2D 23, 20, 21, 7, _, 13, 10, 11 + ITX_MULSUB_2D 19, 16, 21, 7, _, 13, 10, 11, 2 paddd m21, m20, m19 ; t14 psubd m20, m19 ; t13 psubd m19, m9, m18 ; t11a @@ -1586,8 +1682,9 @@ ALIGN function_align REPX {psrad x, 12 }, m20, m19, m18, m17 ret .main_end: - vpbroadcastd m23, [o(pd_2)] - REPX {paddd x, m23}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m11, [o(pd_2)] +.main_end2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 psubd m23, m0, m22 ; out15 paddd m0, m22 ; out0 psubd m22, m1, m21 ; out14 @@ -1604,8 +1701,62 @@ ALIGN function_align paddd m6, m16 ; out6 psubd m16, m7, m9 ; out8 paddd m7, m9 ; out7 - REPX {psrad x, 2 }, m0, m16, m1, m17, m2, m18, m3, m19, \ + REPX {vpsravd x, m11}, m0, m16, m1, m17, m2, m18, m3, m19, \ m4, m20, m5, m21, m6, m22, m7, m23 + packssdw m0, m16 + packssdw m1, m17 + packssdw m2, m18 + packssdw m3, m19 + packssdw m4, m20 + packssdw m5, m21 + packssdw m6, m22 + packssdw m7, m23 + ret +.main_end3: + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m4, m5 
+ punpcklwd m4, m5 + punpcklwd m5, m6, m7 + punpckhwd m6, m7 + punpckhdq m7, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m8, m1 + punpckldq m8, m1 + punpckhdq m1, m4, m5 + punpckldq m4, m5 + punpckhdq m5, m3, m6 + punpckldq m3, m6 + vshufi32x4 m6, m0, m4, q3232 + vinserti32x8 m0, ym4, 1 + vinserti32x8 m4, m8, ym3, 1 + vshufi32x4 m8, m3, q3232 + vinserti32x8 m3, m7, ym1, 1 + vshufi32x4 m7, m1, q3232 + vshufi32x4 m1, m2, m5, q3232 + vinserti32x8 m2, ym5, 1 + vshufi32x4 m5, m7, m1, q2020 ; 10 11 + vshufi32x4 m7, m1, q3131 ; 14 15 + vshufi32x4 m1, m3, m2, q2020 ; 2 3 + vshufi32x4 m3, m2, q3131 ; 6 7 + vshufi32x4 m2, m0, m4, q3131 ; 4 5 + vshufi32x4 m0, m4, q2020 ; 0 1 + vshufi32x4 m4, m6, m8, q2020 ; 8 9 + vshufi32x4 m6, m8, q3131 ; 12 13 + ret +ALIGN function_align +.round: + paddd m20, m13 + paddd m22, m13 +.round2: + paddd m16, m13 + paddd m18, m13 +.round3: + REPX {psrad x, 12 }, m16, m18, m20, m22 + REPX {paddd x, m13}, m17, m19, m21, m23 + REPX {psrad x, 12 }, m17, m19, m21, m23 ret INV_TXFM_16X16_FN adst, dct @@ -1617,22 +1768,14 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 cmp eobd, 36 jl .fast call .main_pass1 - psrad m0, m1, 2 - psrad m16, 13 - psrad m1, m2, 2 - psrad m17, 13 - psrad m2, m3, 2 - psrad m18, 13 - psrad m3, m4, 2 - psrad m19, 13 - psrad m4, m5, 13 - psrad m20, 2 - psrad m5, m6, 13 - psrad m21, 2 - psrad m6, m7, 13 - psrad m22, 2 - psrad m7, m8, 13 - psrad m23, 2 + packssdw m0, m16 + packssdw m1, m17 + packssdw m2, m18 + packssdw m3, m19 + packssdw m4, m5, m20 + packssdw m5, m6, m21 + packssdw m6, m7, m22 + packssdw m7, m8, m23 jmp m(idct_16x16_internal_10bpc).pass1_end .fast: call .main_pass1_fast @@ -1648,8 +1791,8 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 .pass1_fast_end: mova m9, [o(permA)] psrlq m8, m9, 8 -.pass1_fast_end2: REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7 +.pass1_fast_end2: mova m10, m9 mova m11, m8 call m(idct_16x8_internal_10bpc).transpose_16x8 @@ -1660,17 
+1803,17 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 .pass2: lea r5, [o_base_8bpc] call m(iadst_16x16_internal_8bpc).main_pass2b - movshdup m10, [permC] - mova m13, [pw_2048_m2048] - psrlq m11, m10, 8 - vpermq m8, m11, m0 - vpermq m0, m10, m7 - vpermq m7, m11, m1 - vpermq m1, m10, m6 - vpermq m6, m11, m2 - vpermq m2, m10, m5 - vpermq m5, m11, m3 - vpermq m3, m10, m4 + movshdup m12, [permC] + mova m11, [pw_2048_m2048] + psrlq m13, m12, 8 + vpermq m8, m13, m0 + vpermq m0, m12, m7 + vpermq m7, m13, m1 + vpermq m1, m12, m6 + vpermq m6, m13, m2 + vpermq m2, m12, m5 + vpermq m5, m13, m3 + vpermq m3, m12, m4 jmp m(idct_16x16_internal_10bpc).pass2_end ALIGN function_align .main_pass1: @@ -1705,89 +1848,89 @@ ALIGN function_align ITX_MULSUB_2D 1, 22, 8, 9, 10, 13, 4052, 601 ; t15 t14 vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] - psubd m8, m6, m22 ; t14a - paddd m6, m22 ; t6a - psubd m22, m0, m16 ; t8a - paddd m16, m0 ; t0a - REPX {pmaxsd x, m14}, m8, m6, m22, m16 - psubd m0, m23, m7 ; t9a + psubd m9, m23, m7 ; t9a paddd m23, m7 ; t1a - REPX {pminsd x, m15}, m8, m6, m22, m16 psubd m7, m2, m18 ; t10a paddd m18, m2 ; t2a - REPX {pmaxsd x, m14}, m0, m23, m7, m18 - psubd m2, m21, m5 ; t11a + REPX {pmaxsd x, m14}, m9, m23, m7, m18 + psubd m2, m17, m1 ; t15a + paddd m17, m1 ; t7a + REPX {pminsd x, m15}, m9, m23, m7, m18 + psubd m1, m21, m5 ; t11a paddd m21, m5 ; t3a - REPX {pminsd x, m15}, m0, m23, m7, m18 + REPX {pmaxsd x, m14}, m2, m17, m1, m21 psubd m5, m4, m20 ; t12a paddd m4, m20 ; t4a - REPX {pmaxsd x, m14}, m2, m21, m5, m4 + REPX {pminsd x, m15}, m2, m17, m1, m21 psubd m20, m19, m3 ; t13a paddd m19, m3 ; t5a - REPX {pminsd x, m15}, m2, m21, m5, m4 - psubd m3, m17, m1 ; t15a - paddd m17, m1 ; t7a - REPX {pmaxsd x, m14}, m20, m19, m3, m17 + REPX {pmaxsd x, m14}, m5, m4, m20, m19 + psubd m8, m6, m22 ; t14a + paddd m6, m22 ; t6a + REPX {pminsd x, m15}, m5, m4, m20, m19 + psubd m22, m0, m16 ; t8a + paddd m16, m0 ; t0a + 
REPX {pmaxsd x, m14}, m8, m6, m22, m16 vpbroadcastd m11, [o(pd_4017)] vpbroadcastd m10, [o(pd_799)] - REPX {pminsd x, m15}, m20, m19, m3, m17 - ITX_MULSUB_2D 22, 0, 1, 9, _, 13, 10, 11 ; t9 t8 - ITX_MULSUB_2D 20, 5, 1, 9, _, 13, 11, 10 ; t12 t13 + REPX {pminsd x, m15}, m8, m6, m22, m16 + ITX_MULSUB_2D 22, 9, 0, 3, _, 13, 10, 11 ; t9 t8 + ITX_MULSUB_2D 20, 5, 0, 3, _, 13, 11, 10 ; t12 t13 vpbroadcastd m11, [o(pd_2276)] vpbroadcastd m10, [o(pd_3406)] - ITX_MULSUB_2D 7, 2, 1, 9, _, 13, 10, 11 ; t11 t10 - ITX_MULSUB_2D 3, 8, 1, 9, _, 13, 11, 10 ; t14 t15 - paddd m1, m16, m4 ; t0 + ITX_MULSUB_2D 7, 1, 0, 3, _, 13, 10, 11 ; t11 t10 + ITX_MULSUB_2D 2, 8, 0, 3, _, 13, 11, 10 ; t14 t15 + paddd m0, m16, m4 ; t0 psubd m16, m4 ; t4 - psubd m4, m23, m19 ; t5 + psubd m3, m23, m19 ; t5 paddd m23, m19 ; t1 - REPX {pmaxsd x, m14}, m1, m16, m4, m23 + REPX {pmaxsd x, m14}, m0, m16, m3, m23 psubd m19, m18, m6 ; t6 paddd m18, m6 ; t2 - REPX {pminsd x, m15}, m1, m16, m4, m23 + REPX {pminsd x, m15}, m0, m16, m3, m23 psubd m6, m21, m17 ; t7 paddd m21, m17 ; t3 REPX {pmaxsd x, m14}, m19, m18, m6, m21 - paddd m17, m0, m20 ; t8a - psubd m0, m20 ; t12a + paddd m17, m9, m20 ; t8a + psubd m9, m20 ; t12a REPX {pminsd x, m15}, m19, m18, m6, m21 psubd m20, m22, m5 ; t13a paddd m22, m5 ; t9a - REPX {pmaxsd x, m14}, m17, m0, m20, m22 - psubd m5, m2, m3 ; t14a - paddd m2, m3 ; t10a - REPX {pminsd x, m15}, m17, m0, m20, m22 - psubd m3, m7, m8 ; t15a + REPX {pmaxsd x, m14}, m17, m9, m20, m22 + psubd m5, m1, m2 ; t14a + paddd m1, m2 ; t10a + REPX {pminsd x, m15}, m17, m9, m20, m22 + psubd m2, m7, m8 ; t15a paddd m7, m8 ; t11a - REPX {pmaxsd x, m14}, m5, m2, m3, m7 + REPX {pmaxsd x, m14}, m5, m1, m2, m7 vpbroadcastd m11, [o(pd_3784)] vpbroadcastd m10, [o(pd_1567)] - REPX {pminsd x, m15}, m5, m2, m3, m7 - ITX_MULSUB_2D 16, 4, 8, 9, _, 13, 10, 11 ; t5a t4a - ITX_MULSUB_2D 6, 19, 8, 9, _, 13, 11, 10 ; t6a t7a - ITX_MULSUB_2D 0, 20, 8, 9, _, 13, 10, 11 ; t13 t12 - ITX_MULSUB_2D 3, 5, 8, 9, _, 13, 11, 10 ; 
t14 t15 - psubd m8, m1, m18 ; t2a - paddd m1, m18 ; out0 + REPX {pminsd x, m15}, m5, m1, m2, m7 + ITX_MULSUB_2D 16, 3, 4, 8, _, 13, 10, 11 ; t5a t4a + ITX_MULSUB_2D 6, 19, 4, 8, _, 13, 11, 10 ; t6a t7a + ITX_MULSUB_2D 9, 20, 4, 8, _, 13, 10, 11 ; t13 t12 + ITX_MULSUB_2D 2, 5, 4, 8, _, 13, 11, 10 ; t14 t15 + psubd m8, m0, m18 ; t2a + paddd m0, m18 ; out0 psubd m18, m23, m21 ; t3a paddd m23, m21 ; -out15 - paddd m21, m0, m5 ; -out13 - psubd m0, m5 ; t15a - psubd m5, m4, m6 ; t6 - paddd m4, m6 ; -out3 - REPX {pmaxsd x, m14}, m8, m18, m0, m5 - psubd m6, m20, m3 ; t14a - paddd m3, m20 ; out2 + paddd m21, m9, m5 ; -out13 + psubd m9, m5 ; t15a + psubd m5, m3, m6 ; t6 + paddd m3, m6 ; -out3 + REPX {pmaxsd x, m14}, m8, m18, m9, m5 + psubd m6, m20, m2 ; t14a + paddd m2, m20 ; out2 paddd m20, m16, m19 ; out12 psubd m16, m19 ; t7 - REPX {pminsd x, m15}, m8, m18, m0, m5 + REPX {pminsd x, m15}, m8, m18, m9, m5 psubd m19, m22, m7 ; t11 paddd m22, m7 ; out14 - psubd m7, m17, m2 ; t10 - paddd m2, m17 ; -out1 + psubd m7, m17, m1 ; t10 + paddd m1, m17 ; -out1 REPX {pmaxsd x, m14}, m6, m16, m19, m7 vpbroadcastd m12, [o(pd_1448)] - vpbroadcastd m9, [o(pd_2)] + vpbroadcastd m4, [o(pd_2)] vpbroadcastd m10, [o(pd_5120)] vpbroadcastd m11, [o(pd_5119)] REPX {pminsd x, m15}, m6, m16, m19, m7 @@ -1798,13 +1941,15 @@ ALIGN function_align REPX {pmulld x, m12}, m17, m7, m19, m5 psubd m16, m8, m18 ; out8 paddd m8, m18 ; -out7 - psubd m18, m6, m0 ; out10 - paddd m6, m0 ; -out5 + psubd m18, m6, m9 ; out10 + paddd m6, m9 ; -out5 REPX {pmulld x, m12}, m16, m8, m18, m6 - REPX {paddd x, m9 }, m1, m3, m20, m22 - REPX {psubd x, m9, x}, m2, m4, m21, m23 + REPX {paddd x, m4 }, m0, m2, m20, m22 + REPX {psubd x, m4, x}, m1, m3, m21, m23 REPX {paddd x, m10 }, m7, m5, m16, m18 REPX {psubd x, m11, x}, m17, m19, m8, m6 + REPX {psrad x, 2 }, m20, m22, m0, m2, m21, m23, m1, m3 + REPX {psrad x, 13}, m17, m19, m5, m7, m16, m18, m6, m8 ret ALIGN function_align .main_pass1_fast: @@ -1834,22 +1979,14 @@ cglobal 
iflipadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 cmp eobd, 36 jl .fast call m(iadst_16x16_internal_10bpc).main_pass1 - psrad m0, m23, 2 - psrad m23, m1, 2 - psrad m1, m22, 2 - psrad m22, m2, 2 - psrad m2, m21, 2 - psrad m21, m3, 2 - psrad m3, m20, 2 - psrad m20, m4, 2 - psrad m4, m19, 13 - psrad m19, m5, 13 - psrad m5, m18, 13 - psrad m18, m6, 13 - psrad m6, m17, 13 - psrad m17, m7, 13 - psrad m7, m16, 13 - psrad m16, m8, 13 + packssdw m4, m19, m3 + packssdw m3, m20, m5 + packssdw m5, m18, m2 + packssdw m2, m21, m6 + packssdw m6, m17, m1 + packssdw m1, m22, m7 + packssdw m7, m16, m0 + packssdw m0, m23, m8 jmp m(idct_16x16_internal_10bpc).pass1_end .fast: call m(iadst_16x16_internal_10bpc).main_pass1_fast @@ -1866,17 +2003,17 @@ cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 .pass2: lea r5, [o_base_8bpc] call m(iadst_16x16_internal_8bpc).main_pass2b - movshdup m10, [permC] - movu m13, [pw_m2048_2048] - psrlq m11, m10, 8 - vpermq m8, m11, m7 - vpermq m7, m11, m6 - vpermq m6, m11, m5 - vpermq m5, m11, m4 - vpermq m3, m10, m3 - vpermq m2, m10, m2 - vpermq m1, m10, m1 - vpermq m0, m10, m0 + movshdup m12, [permC] + movu m11, [pw_m2048_2048] + psrlq m13, m12, 8 + vpermq m8, m13, m7 + vpermq m7, m13, m6 + vpermq m6, m13, m5 + vpermq m5, m13, m4 + vpermq m3, m12, m3 + vpermq m2, m12, m2 + vpermq m1, m12, m1 + vpermq m0, m12, m0 jmp m(idct_16x16_internal_10bpc).pass2_end INV_TXFM_16X16_FN identity, dct, -92 @@ -1927,32 +2064,30 @@ cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 REPX {mova x, m7}, m4, m5, m6 jmp m(idct_16x16_internal_10bpc).pass1_end3 .pass2: - movshdup m11, [o(permC)] - vpbroadcastd m12, [o(pw_1697x16)] + movshdup m14, [o(permC)] + vpbroadcastd m15, [o(pw_1697x16)] lea r6, [strideq*3] - vpbroadcastd m13, [o(pw_2048)] - pxor m14, m14 - vpbroadcastd m15, [pixel_10bpc_max] - vpermq m8, m11, m0 - vpermq m9, m11, m1 + vpbroadcastd m11, [o(pw_2048)] + pxor m12, m12 + vpbroadcastd m13, 
[pixel_10bpc_max] + vpermq m8, m14, m0 + vpermq m9, m14, m1 call .pass2_main - vpermq m8, m11, m2 - vpermq m9, m11, m3 + vpermq m8, m14, m2 + vpermq m9, m14, m3 call .pass2_main - vpermq m8, m11, m4 - vpermq m9, m11, m5 + vpermq m8, m14, m4 + vpermq m9, m14, m5 call .pass2_main - vpermq m8, m11, m6 - vpermq m9, m11, m7 + vpermq m8, m14, m6 + vpermq m9, m14, m7 .pass2_main: - pmulhrsw m0, m12, m8 - pmulhrsw m1, m12, m9 + pmulhrsw m0, m15, m8 + pmulhrsw m1, m15, m9 paddsw m8, m8 paddsw m9, m9 paddsw m8, m0 paddsw m9, m1 - pmulhrsw m8, m13 - pmulhrsw m9, m13 jmp m(idct_16x8_internal_10bpc).write_16x4 ALIGN function_align .pass1_main: @@ -1976,4 +2111,3946 @@ ALIGN function_align REPX {psrad x, 13 }, m6, m7 ret +cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 22, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + vpbroadcastd m11, [o(pd_2)] + mova m20, [o(idct8x32p)] + pxor m21, m21 + cmp eobd, 43 + jl .fast + call .pass1_main + punpcklwd m16, m0, m1 + punpcklwd m17, m2, m3 + punpckhwd m18, m0, m1 + punpckhwd m19, m2, m3 + cmp eobd, 107 + jge .full + punpckldq m0, m16, m17 ; 0 2 + punpckhdq m1, m16, m17 ; 4 6 + punpckldq m2, m18, m19 ; 8 10 + punpckhdq m3, m18, m19 ; 12 14 + lea r5, [o_base_8bpc] + vextracti32x8 ym14, m0, 1 + vextracti32x8 ym15, m1, 1 + vextracti32x8 ym16, m2, 1 + vextracti32x8 ym17, m3, 1 + call m(idct_8x16_internal_8bpc).main_fast + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast + jmp .end +.full: + add cq, 64 + call .pass1_main + punpcklwd m5, m0, m1 + punpcklwd m6, m2, m3 + punpckhwd m7, m0, m1 + punpckhwd m8, m2, m3 + punpckldq m0, m16, m17 ; 0 2 + punpckhdq m1, m16, m17 ; 4 6 + punpckldq m2, m18, m19 ; 8 10 + punpckhdq m3, m18, m19 ; 12 14 + punpckldq m4, m5, m6 ; 16 18 + punpckhdq m5, m6 ; 20 22 + punpckldq m6, m7, m8 ; 24 26 + punpckhdq m7, m8 ; 28 30 + lea r5, 
[o_base_8bpc] + vextracti32x8 ym14, m0, 1 + vextracti32x8 ym15, m1, 1 + vextracti32x8 ym16, m2, 1 + vextracti32x8 ym17, m3, 1 + vextracti32x8 ym18, m4, 1 + vextracti32x8 ym19, m5, 1 + vextracti32x8 ym20, m6, 1 + vextracti32x8 ym21, m7, 1 + call m(idct_8x16_internal_8bpc).main + REPX {pshufd x, x, q1032}, ym18, ym19, ym20, ym21 + call m(inv_txfm_add_dct_dct_8x32_8bpc).main + jmp .end +.fast: + movshdup m8, [o(permB)] + mova ym1, [cq+128*1] + mova ym5, [cq+128*5] + mova ym7, [cq+128*3] + mova ym3, [cq+128*7] + mova ym0, [cq+128*0] + mova ym4, [cq+128*2] + mova ym2, [cq+128*4] + mova ym6, [cq+128*6] + vpermt2q m1, m8, m5 ; 1 5 + vpermt2q m3, m8, m7 ; 7 3 + vpermt2q m0, m8, m4 ; 0 2 + vpermt2q m2, m8, m6 ; 4 6 + mova [cq+128*0], ym21 + REPX {vmovdqa32 [cq+128*x], ym21}, 1, 2, 3, 4, 5, 6, 7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x8_internal_10bpc).main_end + packssdw m0, m2 + packssdw m1, m3 + vpermb m0, m20, m0 + vprold m20, 16 + vpermb m2, m20, m1 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + lea r5, [o_base_8bpc] + vextracti32x8 ym14, m0, 1 + vextracti32x8 ym15, m1, 1 + call m(idct_8x16_internal_8bpc).main_fast2 + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast2 +.end: + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_end ; performs vzeroupper + lea r3, [strideq*2] + vpbroadcastd m12, [pixel_10bpc_max] + lea r6, [strideq*3] + pxor m11, m11 + lea r3, [dstq+r3*8] + pmulhrsw m0, m10 + pmulhrsw m1, m10 + call .write_8x4x2 + pmulhrsw m0, m10, m2 + pmulhrsw m1, m10, m3 + call .write_8x4x2 + pmulhrsw m0, m10, m4 + pmulhrsw m1, m10, m5 + call .write_8x4x2 + pmulhrsw m0, m10, m6 + pmulhrsw m1, m10, m7 +.write_8x4x2: + mova xm8, [dstq+strideq*0] + vinserti32x4 ym8, [dstq+strideq*1], 1 + vinserti32x4 m8, [dstq+strideq*2], 2 + vinserti32x4 m8, [dstq+r6 ], 3 + mova xm9, [r3 +r6 ] + vinserti32x4 ym9, [r3 +strideq*2], 1 + vinserti32x4 m9, [r3 +strideq*1], 2 + vinserti32x4 m9, [r3 +strideq*0], 3 + paddw m8, m0 + paddw m9, m1 + pmaxsw m8, m11 + pmaxsw m9, m11 + pminsw m8, 
m12 + pminsw m9, m12 + mova [dstq+strideq*0], xm8 + vextracti32x4 [dstq+strideq*1], ym8, 1 + vextracti32x4 [dstq+strideq*2], m8, 2 + vextracti32x4 [dstq+r6 ], m8, 3 + lea dstq, [dstq+strideq*4] + vextracti32x4 [r3 +strideq*0], m9, 3 + vextracti32x4 [r3 +strideq*1], m9, 2 + vextracti32x4 [r3 +strideq*2], ym9, 1 + mova [r3 +r6 ], xm9 + lea r3, [r3+strideq*4] + ret +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2 +ALIGN function_align +.pass1_main: + mova m0, [cq+128*0] + mova m1, [cq+128*1] + mova m2, [cq+128*2] + mova m3, [cq+128*3] + mova m4, [cq+128*4] + mova m5, [cq+128*5] + mova m6, [cq+128*6] + mova m7, [cq+128*7] + REPX {mova [cq+128*x], m21}, 0, 1, 2, 3, 4, 5, 6, 7 + call m(idct_8x16_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_end2 + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + REPX {vpermb x, m20, x}, m0, m1, m2, m3 + ret + +cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 8, 12, dst, stride, c, eob + vpbroadcastd m9, [pw_5] + lea r4, [strideq*3] + pxor m10, m10 + lea r5, [strideq*5] + vpbroadcastd m11, [pixel_10bpc_max] + sub eobd, 107 + lea r6, [strideq+r4*2] +.loop: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + lea r7, [dstq+strideq*8] + REPX {mova [cq+128*x], m10}, 0, 1, 2, 3 + REPX {paddsw x, m9}, m0, m1, m2, m3 + REPX {mova [cq+128*x], m10}, 4, 5, 6, 7 + REPX {psraw x, 3 }, m0, m1, m2, m3 + add cq, 64 + mova xm4, [dstq+strideq*0] + mova xm5, [dstq+strideq*1] + mova xm6, [dstq+strideq*2] + mova xm7, [dstq+r4 *1] + punpckhwd m8, m0, m1 + vinserti32x4 ym4, [dstq+strideq*4], 1 + punpcklwd m0, m1 + vinserti32x4 ym5, [dstq+r5 *1], 1 + punpckhwd m1, m2, m3 + vinserti32x4 ym6, [dstq+r4 *2], 1 + punpcklwd m2, m3 + vinserti32x4 ym7, [dstq+r6 *1], 1 + punpckhwd m3, m0, m8 + 
vinserti32x4 m4, [r7 +strideq*0], 2 + punpcklwd m0, m8 + vinserti32x4 m5, [r7 +strideq*1], 2 + punpckhwd m8, m2, m1 + vinserti32x4 m6, [r7 +strideq*2], 2 + punpcklwd m2, m1 + vinserti32x4 m7, [r7 +r4 *1], 2 + punpckhqdq m1, m0, m2 + vinserti32x4 m4, [r7 +strideq*4], 3 + punpcklqdq m0, m2 + vinserti32x4 m5, [r7 +r5 *1], 3 + punpcklqdq m2, m3, m8 + vinserti32x4 m6, [r7 +r4 *2], 3 + punpckhqdq m3, m8 + vinserti32x4 m7, [r7 +r6 *1], 3 + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + REPX {pmaxsw x, m10}, m0, m1, m2, m3 + REPX {pminsw x, m11}, m0, m1, m2, m3 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + mova [dstq+strideq*2], xm2 + mova [dstq+r4 *1], xm3 + vextracti32x4 [dstq+strideq*4], ym0, 1 + vextracti32x4 [dstq+r5 *1], ym1, 1 + vextracti32x4 [dstq+r4 *2], ym2, 1 + vextracti32x4 [dstq+r6 *1], ym3, 1 + lea dstq, [r7+strideq*8] + vextracti32x4 [r7 +strideq*0], m0, 2 + vextracti32x4 [r7 +strideq*1], m1, 2 + vextracti32x4 [r7 +strideq*2], m2, 2 + vextracti32x4 [r7 +r4 *1], m3, 2 + vextracti32x4 [r7 +strideq*4], m0, 3 + vextracti32x4 [r7 +r5 *1], m1, 3 + vextracti32x4 [r7 +r4 *2], m2, 3 + vextracti32x4 [r7 +r6 *1], m3, 3 + add eobd, 0x80000000 + jnc .loop + RET + +cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + mova m11, [o(permB)] + mova m0, [cq+64* 0] ; 0 1 + mova m4, [cq+64* 1] ; 2 3 + mova m1, [cq+64* 2] ; 4 5 + mova m8, [cq+64* 3] ; 6 7 + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + psrlq m10, m11, 32 +%if WIN64 + movaps [cq+16*0], xmm6 + movaps [cq+16*1], xmm7 +%endif + mova m16, m11 + vpermi2q m16, m0, m1 ; 1 5 + mova m17, m11 + vpermi2q m17, m8, m4 ; 7 3 + cmp eobd, 43 + jl .fast + mova m18, [cq+64* 4] ; 8 9 + mova m20, [cq+64* 5] ; 10 11 + mova m6, [cq+64* 6] ; 12 13 + mova m7, [cq+64* 7] ; 14 15 + vpermt2q m0, m10, m18 ; 0 8 + vpermt2q m18, m11, m6 ; 9 13 
+ mova m19, m11 + vpermi2q m19, m7, m20 ; 15 11 + cmp eobd, 107 + jge .full + vpermt2q m1, m10, m6 ; 4 12 + vpermt2q m4, m10, m8 ; 2 6 + vpermt2q m7, m10, m20 ; 14 10 + mov r6d, 64*1 + call m(idct_8x8_internal_10bpc).main_fast + call m(idct_16x8_internal_10bpc).main_fast + call .main_fast + call m(idct_16x16_internal_10bpc).main_end + jmp .end +.full: + mova m2, [cq+64* 8] ; 16 17 + mova m5, [cq+64* 9] ; 18 19 + mova m9, [cq+64*10] ; 20 21 + mova m21, [cq+64*11] ; 22 23 + vpermt2q m1, m10, m9 ; 4 20 + vpermt2q m7, m10, m21 ; 14 22 + vpermt2q m21, m11, m5 ; 23 19 + vpermt2q m5, m10, m20 ; 18 10 + mova m20, m11 + vpermi2q m20, m2, m9 ; 17 21 + mova m22, [cq+64*12] ; 24 25 + mova m9, [cq+64*13] ; 26 27 + mova m3, [cq+64*14] ; 28 29 + mova m23, [cq+64*15] ; 30 31 + vpermt2q m2, m10, m22 ; 16 24 + vpermt2q m22, m11, m3 ; 25 29 + vpermt2q m3, m10, m6 ; 28 12 + vpermt2q m4, m10, m9 ; 2 26 + mova m6, m10 + vpermi2q m6, m23, m8 ; 30 6 + vpermt2q m23, m11, m9 ; 31 27 + mov r6d, 64*3 + call m(idct_8x8_internal_10bpc).main + call m(idct_16x8_internal_10bpc).main + call .main + call m(idct_16x16_internal_10bpc).main_end + jmp .end +.fast: + vpermq m0, m10, m0 ; 0 0 + vpermq m1, m10, m1 ; 4 4 + vpermt2q m4, m10, m8 ; 2 6 + xor r6d, r6d + call .main_fast2 + call m(idct_16x16_internal_10bpc).main_end +.end: +%if WIN64 + movaps xmm6, [cq+16*0] + movaps xmm7, [cq+16*1] +%endif + vzeroupper + call .transpose_8x32 + pxor m14, m14 +.zero_loop: + mova [cq+r6*4+64*3], m14 + mova [cq+r6*4+64*2], m14 + mova [cq+r6*4+64*1], m14 + mova [cq+r6*4+64*0], m14 + sub r6d, 64 + jge .zero_loop + lea r5, [o_base_8bpc] + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m8 + punpcklqdq m6, m8 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + pxor m12, m12 +.write_32x8_start: + vpbroadcastd m11, [pw_2048] + vpbroadcastd m13, [pixel_10bpc_max] + lea r3, [strideq*3] +.write_32x8: + pmulhrsw m0, m11 + 
pmulhrsw m1, m11 + pmulhrsw m2, m11 + pmulhrsw m3, m11 + call .write_32x4 + pmulhrsw m0, m11, m4 + pmulhrsw m1, m11, m5 + pmulhrsw m2, m11, m6 + pmulhrsw m3, m11, m7 +.write_32x4: + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r3 ] + REPX {pmaxsw x, m12}, m0, m1, m2, m3 + REPX {pminsw x, m13}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r3 ], m3 + lea dstq, [dstq+strideq*4] + ret +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd + or r3d, 8 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2 +ALIGN function_align +.main_fast3: + ; assuming m0=in0 in0, m4=in2 in2, and m16=in1 in3 + vbroadcasti32x4 m5, [o(pd_401_4076)] + pmulld m3, m0, m12 + pmulld m4, m5 + REPX {paddd x, m13}, m3, m4 + REPX {psrad x, 12 }, m3, m4 ; m3=idct8:t0-7, m4=t8a t15a + + ; t8a t15a -> t8/9 t14/15 + + vbroadcasti32x4 m5, [o(pd_3784_m3784)] + pshufd m7, m4, q1032 + pmulld m6, m4, [o(pd_1567)]{bcstd} + pmulld m5, m7 + paddd m6, m13 + paddd m5, m6 + psrad m5, 12 ; m5=t9a t14a + + ; t14a t9a -> t13/14 t9/10 [m5] & t8 15 -> t8/11a t12/15a [m4] + + shufps m6, m4, m5, q1032 ; t12 t13 + shufps m8, m4, m5, q3210 ; t11a t10 + pmulld m9, m6, m12 + pmulld m7, m8, m12 + paddd m9, m13 + paddd m5, m9, m7 ; t12 t13a + psubd m4, m9, m7 ; t11 t10a + REPX {psrad x, 12 }, m5, m4 + + psubd m7, m3, m6 ; dct16 out15 out14 + paddd m0, m3, m6 ; dct16 out0 out1 + psubd m6, m3, m5 ; dct16 out12 out13 + paddd m1, m3, m5 ; dct16 out3 out2 + psubd m5, m3, m4 ; dct16 out11 out10 + paddd m2, m3, m4 ; dct16 out4 out5 + psubd m4, m3, m8 ; dct16 out8 out9 + paddd m3, m8 ; dct16 out7 out6 + REPX {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3 + REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 + + ; idct32_bottomhalf + vbroadcasti32x4 m18, [o(pd_201_m601)] + vbroadcasti32x4 m19, [o(pd_4091_4052)] + pmulld m17, m16, m19 + pmulld m16, m18 + REPX {paddd x, m13}, m17, m16 + REPX 
{psrad x, 12 }, m17, m16 + + ; m17: t31a t24a -> t30/31 t24/25, m16: t16a t23a -> t16/17 t22/23 [step2] + + vbroadcasti32x4 m10, [o(pd_799_m2276)] + vbroadcasti32x4 m11, [o(pd_4017_3406)] + pmulld m18, m17, m10 + pmulld m19, m17, m11 + pmulld m8, m16, m11 + pmulld m9, m16, m10 + REPX {paddd x, m13}, m18, m19 + psubd m18, m8 + paddd m19, m9 + REPX {psrad x, 12 }, m18, m19 + + ; m17=t31 t24 -> t28/31a t24/27a, m16=t16 t23 -> t16/19a t20/23a + ; m18=t17a t22a -> t17/18 t21/22, m19=t30a t25a -> t29/30 t25/26 + + punpckhqdq m23, m17, m19 ; t24a t25 [or t27a t26] + punpcklqdq m20, m16, m18 ; t16a t17 [or t19a t18] + punpckhqdq m22, m16, m18 ; t23a t22 [or t20a t21] + punpcklqdq m16, m17, m19 ; t28a t29 [or t31a t30] + mova m21, m23 + mova m18, m20 + mova m17, m22 + mova m19, m16 + + jmp .main4 +.main_fast2: ; bottom three-quarters are zero + vbroadcasti32x4 m8, [o(pd_799_4017)] + pmulld m8, m1 ; t4 t7 + vpmulld m0, [o(pd_2896)] {1to16} ; t0 t1 + REPX {paddd x, m13}, m8, m0 + REPX {psrad x, 12 }, m8, m0 + pmulld m3, m8, m12 + mova m2, m0 ; t3 t2 + call m(idct_8x8_internal_10bpc).main3 + vbroadcasti32x4 m6, [o(pd_4076_3920)] + vbroadcasti32x4 m3, [o(pd_401_m1189)] + pmulld m6, m4 ; t15 t12 + pmulld m4, m3 ; t9 t10 + REPX {paddd x, m13}, m6, m4 + REPX {psrad x, 12 }, m6, m4 + mova m5, m6 ; t14 t13 + mova m9, m4 ; t8 t11 + call m(idct_16x8_internal_10bpc).main3 + vbroadcasti32x4 m23, [o(pd_4091_3973)] + vbroadcasti32x4 m7, [o(pd_201_995)] + vbroadcasti32x4 m22, [o(pd_1380_601)] + vbroadcasti32x4 m9, [o(pd_3857_4052)] + pmulld m23, m16 ; t16 t20 + pmulld m16, m7 ; t31 t27 + pmulld m22, m17 ; -t19 -t25 + pmulld m17, m9 ; t28 t24 + REPX {paddd x, m13}, m23, m16, m17 + psubd m22, m13, m22 + REPX {psrad x, 12 }, m23, m16, m22, m17 + mova m20, m23 ; t30 t26 + mova m9, m16 ; t17 t21 + mova m19, m22 ; t18 t22 + mova m18, m17 ; t29 t25 + jmp .main3 +.main_fast: ; bottom half is zero + vbroadcasti32x4 m23, [o(pd_4091_3973)] + vbroadcasti32x4 m7, [o(pd_201_995)] + vbroadcasti32x4 m20, 
[o(pd_2751_2106)] + vbroadcasti32x4 m9, [o(pd_3035_3513)] + vbroadcasti32x4 m21, [o(pd_3703_3290)] + vbroadcasti32x4 m10, [o(pd_1751_2440)] + vbroadcasti32x4 m22, [o(pd_1380_601)] + vbroadcasti32x4 m11, [o(pd_3857_4052)] + pmulld m23, m16 ; t16a t20a + pmulld m16, m7 ; t31a t27a + pmulld m20, m19 ; -t17a -t21a + pmulld m19, m9 ; t30a t26a + pmulld m21, m18 ; t18a t22a + pmulld m18, m10 ; t29a t25a + pmulld m22, m17 ; -t19a -t25a + pmulld m17, m11 ; t28a t24a + psubd m20, m13, m20 + psubd m22, m13, m22 + jmp .main2 +.main: + ITX_MULSUB_2D 16, 23, 7, 9, 10, _, 201_995, 4091_3973 + ITX_MULSUB_2D 20, 19, 7, 9, 10, _, 3035_3513, 2751_2106 + ITX_MULSUB_2D 18, 21, 7, 9, 10, _, 1751_2440, 3703_3290 + ITX_MULSUB_2D 22, 17, 7, 9, 10, _, 3857_4052, 1380_601 + paddd m20, m13 + paddd m22, m13 +.main2: + REPX {paddd x, m13}, m16, m23, m19 + REPX {psrad x, 12 }, m16, m20, m23, m19 + psubd m9, m16, m20 ; t17 t21 + paddd m16, m20 ; t16 t20 + psubd m20, m23, m19 ; t30 t26 + paddd m23, m19 ; t31 t27 + REPX {pmaxsd x, m14}, m9, m16, m20, m23 + REPX {paddd x, m13}, m21, m18, m17 + REPX {psrad x, 12 }, m18, m22, m21, m17 + psubd m19, m22, m18 ; t18 t22 + paddd m22, m18 ; t19 t23 + psubd m18, m17, m21 ; t29 t25 + paddd m17, m21 ; t28 t24 + REPX {pmaxsd x, m14}, m19, m22, m18, m17 + REPX {pminsd x, m15}, m20, m9, m18, m19, m16, m23, m22, m17 +.main3: + vbroadcasti32x4 m11, [o(pd_4017_2276)] + vbroadcasti32x4 m10, [o(pd_799_3406)] + psubd m7, m0, m6 ; dct16 out15 out14 + paddd m0, m6 ; dct16 out0 out1 + psubd m6, m1, m5 ; dct16 out12 out13 + paddd m1, m5 ; dct16 out3 out2 + psubd m5, m2, m4 ; dct16 out11 out10 + paddd m2, m4 ; dct16 out4 out5 + psubd m4, m3, m8 ; dct16 out8 out9 + paddd m3, m8 ; dct16 out7 out6 + ITX_MULSUB_2D 20, 9, 8, 21, _, 13, 10, 11 + ITX_MULSUB_2D 18, 19, 8, 21, _, 13, 10, 11, 2 + REPX {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3 + punpckhqdq m21, m16, m20 ; t20 t21a + punpcklqdq m16, m20 ; t16 t17a + punpcklqdq m20, m22, m19 ; t19 t18a + punpckhqdq m22, m19 ; t23 
t22a + REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 + punpcklqdq m19, m23, m9 ; t31 t30a + punpckhqdq m23, m9 ; t27 t26a + punpckhqdq m9, m17, m18 ; t24 t25a + punpcklqdq m17, m18 ; t28 t29a + psubd m18, m16, m20 ; t19a t18 + paddd m20, m16 ; t16a t17 + psubd m16, m19, m17 ; t28a t29 + paddd m19, m17 ; t31a t30 + psubd m17, m22, m21 ; t20a t21 + paddd m22, m21 ; t23a t22 + psubd m21, m9, m23 ; t27a t26 + paddd m23, m9 ; t24a t25 + REPX {pmaxsd x, m14}, m18, m16, m17, m21 + REPX {pminsd x, m15}, m16, m18, m21, m17 + REPX {pmaxsd x, m14}, m20, m22, m19, m23 + REPX {pminsd x, m15}, m20, m22, m19, m23 +.main4: + vpbroadcastd m11, [o(pd_3784)] + vpbroadcastd m10, [o(pd_1567)] + ITX_MULSUB_2D 16, 18, 8, 9, _, 13, 10, 11 + ITX_MULSUB_2D 21, 17, 8, 9, _, 13, 10, 11, 2 + paddd m9, m20, m22 ; t16 t17a + psubd m20, m22 ; t23 t22a + paddd m22, m19, m23 ; t31 t30a + psubd m19, m23 ; t24 t25a + psubd m23, m16, m17 ; t20a t21 + paddd m16, m17 ; t19a t18 + psubd m17, m18, m21 ; t27a t26 + paddd m21, m18 ; t28a t29 + REPX {pmaxsd x, m14}, m20, m19, m23, m17 + REPX {pminsd x, m15}, m19, m20, m17, m23 + REPX {pmulld x, m12}, m19, m20, m17, m23 + REPX {pmaxsd x, m14}, m22, m21, m16, m9 + paddd m19, m13 + paddd m17, m13 + REPX {pminsd x, m15}, m22, m21, m16, m9 + psubd m18, m19, m20 ; t23a t22 + paddd m19, m20 ; t24a t25 + paddd m20, m17, m23 ; t27 t26a + psubd m17, m23 ; t20 t21a + REPX {psrad x, 12 }, m20, m19, m18, m17 + ret +.transpose_8x32: + mova m10, [o(idct32x8p)] + psrlw m8, m10, 8 + mova m9, m8 + vpermi2w m8, m1, m5 + vpermt2w m1, m10, m5 + vprold m5, m9, 16 + vpermi2w m9, m3, m7 + vpermt2w m3, m10, m7 + vprold m10, 16 + mova m7, m5 + vpermi2w m5, m0, m4 + vpermt2w m0, m10, m4 + vpermi2w m7, m2, m6 + vpermt2w m2, m10, m6 + punpckhdq m6, m5, m8 + punpckldq m5, m8 + punpckhdq m8, m7, m9 + punpckldq m7, m9 + punpckhdq m4, m2, m3 + punpckldq m2, m3 + punpckhdq m3, m0, m1 + punpckldq m0, m1 + ret + +cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 10, dst, stride, 
c, eob + vpbroadcastd m5, [pw_4096] + lea r4, [strideq*3] + mova m6, [idtx32x8p] + lea r5, [strideq*5] + vpbroadcastd m9, [pixel_10bpc_max] + lea r6, [strideq+r4*2] + pxor m8, m8 + sub eobd, 107 + psrlw m7, m6, 8 +.loop: + mova m0, [cq+64*0] + packssdw m0, [cq+64*1] ; 02 13 + mova m1, [cq+64*2] + packssdw m1, [cq+64*3] ; 46 57 + mova m2, [cq+64*4] + packssdw m2, [cq+64*5] ; 8a 9b + mova m3, [cq+64*6] + packssdw m3, [cq+64*7] ; ce df + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + REPX {mova [cq+64*x], m8}, 0, 1, 2, 3 + mova m4, m6 + vpermi2w m4, m1, m3 + vpermt2w m1, m7, m3 + REPX {mova [cq+64*x], m8}, 4, 5, 6, 7 + mova m3, m7 + vpermi2w m3, m0, m2 + vpermt2w m0, m6, m2 + add cq, 64*8 + punpcklqdq m2, m3, m1 ; 4 5 + punpckhqdq m3, m1 ; 6 7 + punpckhqdq m1, m0, m4 ; 2 3 + punpcklqdq m0, m4 ; 0 1 + mova ym4, [dstq+strideq*0] + vinserti32x8 m4, [dstq+strideq*1], 1 + paddw m0, m4 + mova ym4, [dstq+strideq*2] + vinserti32x8 m4, [dstq+r4 *1], 1 + paddw m1, m4 + mova ym4, [dstq+strideq*4] + vinserti32x8 m4, [dstq+r5 *1], 1 + paddw m2, m4 + mova ym4, [dstq+r4 *2] + vinserti32x8 m4, [dstq+r6 *1], 1 + paddw m3, m4 + REPX {pmaxsw x, m8}, m0, m1, m2, m3 + REPX {pminsw x, m9}, m0, m1, m2, m3 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym1 + vextracti32x8 [dstq+r4 *1], m1, 1 + mova [dstq+strideq*4], ym2 + vextracti32x8 [dstq+r5 *1], m2, 1 + mova [dstq+r4 *2], ym3 + vextracti32x8 [dstq+r6 *1], m3, 1 + add dstq, 32 + add eobd, 0x80000000 + jnc .loop + RET + +cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] +%if WIN64 + movaps [rsp+ 8], xmm6 + movaps [rsp+24], xmm7 +%endif + cmp eobd, 36 + jl .fast + call .pass1 + cmp eobd, 151 + jge .full + lea r5, [o_base_8bpc] + pxor m9, m9 + punpcklwd m8, m1, m1 ; 2 + punpckhwd 
m14, m1, m1 ; 3 + punpcklwd m1, m3, m3 ; 6 + punpckhwd m15, m3, m3 ; 7 + punpcklwd m3, m6, m6 ; 12 + punpckhwd m19, m6, m6 ; 13 + punpcklwd m6, m9, m4 ; __ 8 + punpckhwd m20, m4, m4 ; 9 + punpckhwd m16, m5, m5 ; 11 + punpcklwd m5, m5 ; 10 + punpcklwd m9, m0 ; __ 0 + punpckhwd m21, m0, m0 ; 1 + punpcklwd m0, m7, m7 ; 14 + punpckhwd m17, m7, m7 ; 15 + punpcklwd m7, m2, m2 ; 4 + punpckhwd m18, m2, m2 ; 5 + call m(idct_16x16_internal_8bpc).main_fast + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + mov r6d, 64*3 + pxor m8, m8 +.zero_loop: + REPX {mova [cq+r6*8+128*x], m8}, 3, 2, 1, 0 + sub r6d, 64 + jge .zero_loop + jmp .pass2_end +.full: + mova [cq+128*0], m0 + mova [cq+128*1], m1 + mova [cq+128*2], m2 + mova [cq+128*3], m3 + mova [cq+128*4], m4 + mova [cq+128*5], m5 + mova [cq+128*6], m6 + mova [cq+128*7], m7 + add cq, 64 + call .pass1 + mova m9, [cq-64* 1] ; 0 1 + mova m14, [cq+64* 1] ; 2 3 + mova m18, [cq+64* 3] ; 4 5 + mova m15, [cq+64* 5] ; 6 7 + mova m20, [cq+64* 7] ; 8 9 + mova m16, [cq+64* 9] ; 10 11 + mova m22, [cq+64*11] ; 12 13 + mova m19, [cq+64*13] ; 14 15 + lea r5, [o_base_8bpc] + punpcklwd m8, m7, m14 ; 30 2 + punpckhwd m21, m7, m9 ; 31 1 + punpcklwd m7, m6, m18 ; 28 4 + punpckhwd m14, m6 ; 3 29 + punpcklwd m9, m0, m9 ; 16 0 + punpckhwd m17, m19, m0 ; 15 17 + punpcklwd m0, m19, m1 ; 14 18 + punpckhwd m19, m1, m22 ; 19 13 + punpcklwd m1, m15, m5 ; 6 26 + punpckhwd m18, m5, m18 ; 27 5 + punpcklwd m6, m4, m20 ; 24 8 + punpckhwd m15, m4 ; 7 25 + punpcklwd m5, m3, m16 ; 22 10 + punpckhwd m20, m3, m20 ; 23 9 + punpcklwd m3, m22, m2 ; 12 20 + punpckhwd m16, m2 ; 11 21 + call m(idct_16x16_internal_8bpc).main2 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + mov r6d, 32*7 + pxor m8, m8 +.full_zero_loop: + REPX {mova [cq+r6*8+64*x], m8}, 2, 1, 0, -1 + sub r6d, 32 + jge .full_zero_loop + jmp .pass2_end +.fast: + mova ym0, [cq+128*0] + mova ym2, [cq+128*4] + movshdup m8, [o(permB)] + mova ym1, [cq+128*2] + mova ym3, [cq+128*6] + mova ym4, 
[cq+128*1] + mova ym5, [cq+128*3] + mova ym6, [cq+128*5] + mova ym7, [cq+128*7] + vpermt2q m0, m8, m2 ; 0 4 + vpermt2q m1, m8, m3 ; 2 6 + vpermt2q m4, m8, m5 ; 1 3 + vpermt2q m7, m8, m6 ; 7 5 + REPX {pmulld x, m12}, m0, m1, m4, m7 + pxor ym16, ym16 + mova [cq+128*0], ym16 + REPX {vmovdqa32 [cq+128*x], ym16}, 1, 2, 3, 4, 5, 6, 7 + REPX {paddd x, m13}, m0, m1, m4, m7 + REPX {psrad x, 12 }, m0, m1, m4, m7 + call m(idct_8x8_internal_10bpc).main_fast + call m(idct_16x8_internal_10bpc).main_fast + vpbroadcastd m11, [o(pd_1)] + call m(idct_8x16_internal_10bpc).main_end2 + mova m8, [o(idct8x32p)] + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + mova m6, [dup16_perm] + vpermb m0, m8, m0 + vpermb m2, m8, m2 + vprold m8, 16 + vpermb m1, m8, m1 + vpermb m3, m8, m3 + punpckldq m4, m0, m2 + punpckhdq m0, m2 + punpckldq m2, m1, m3 + punpckhdq m1, m3 + punpckldq m21, m4, m2 + punpckhdq m14, m4, m2 + punpckldq m18, m0, m1 + punpckhdq m15, m0, m1 + vpermb m8, m6, m14 ; 2 + vpermb m1, m6, m15 ; 6 + vpermb m7, m6, m18 ; 4 + pmovzxwd m9, ym21 ; 0 + vpord m6, [o(pb_32)] {1to16} + lea r5, [o_base_8bpc] + vpermb m21, m6, m21 ; 1 + vpermb m15, m6, m15 ; 7 + vpermb m18, m6, m18 ; 5 + vpermb m14, m6, m14 ; 3 + pslld m9, 16 + call m(idct_16x16_internal_8bpc).main_fast2 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 +.pass2_end: + movshdup m22, [permC] + vpbroadcastd m11, [pw_2048] + vpbroadcastd m13, [pixel_10bpc_max] + lea r6, [strideq*3] + pxor m12, m12 + psrlq m23, m22, 8 + vpermq m8, m22, m0 + vpermq m9, m23, m1 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m22, m2 + vpermq m9, m23, m3 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m22, m4 + vpermq m9, m23, m5 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m22, m6 + vpermq m9, m23, m7 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m22, m14 + vpermq m9, m23, m15 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m22, m16 + vpermq m9, m23, m17 + 
call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m22, m18 + vpermq m9, m23, m19 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m22, m20 + vpermq m9, m23, m21 +%if WIN64 + movaps xmm6, [rsp+ 8] + movaps xmm7, [rsp+24] +%endif + vzeroupper + jmp m(idct_16x8_internal_10bpc).write_16x4 +.pass1: + pmulld m0, m12, [cq+128* 0] + pmulld m1, m12, [cq+128* 2] + pmulld m2, m12, [cq+128* 4] + pmulld m3, m12, [cq+128* 6] + pmulld m4, m12, [cq+128* 8] + pmulld m5, m12, [cq+128*10] + pmulld m6, m12, [cq+128*12] + pmulld m7, m12, [cq+128*14] + call m(idct_8x16_internal_10bpc).main_rect2 + pmulld m16, m12, [cq+128* 1] + pmulld m17, m12, [cq+128* 3] + pmulld m18, m12, [cq+128* 5] + pmulld m19, m12, [cq+128* 7] + pmulld m20, m12, [cq+128* 9] + pmulld m21, m12, [cq+128*11] + pmulld m22, m12, [cq+128*13] + pmulld m23, m12, [cq+128*15] + call m(idct_16x16_internal_10bpc).main_rect2 + vpbroadcastd m11, [o(pd_1)] + call m(idct_16x16_internal_10bpc).main_end2 + jmp m(idct_16x16_internal_10bpc).main_end3 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd + or r3d, 32 + jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly + +cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 16, dst, stride, c, eob +%undef cmp + vpbroadcastd m10, [pw_2896x8] + vpbroadcastd m11, [pw_1697x16] + vpbroadcastd m13, [pw_8192] + vpbroadcastd m15, [pixel_10bpc_max] + lea r6, [strideq*9] + pxor m14, m14 + paddw m12, m13, m13 ; pw_16384 + cmp eobd, 151 + jl .main + call .main + add cq, 64-128*4 + lea dstq, [dstq+strideq*8] +.main: + call .main_internal + add cq, 128*4 + pmulhrsw m1, m13, m2 + pmulhrsw m3, m13, m4 + pmulhrsw m5, m13, m6 + pmulhrsw m7, m13, m8 + call .main_internal +.main2: + pmulhrsw m2, m13 + pmulhrsw m4, m13 + pmulhrsw m6, m13 + pmulhrsw m8, m13 + punpcklqdq m0, m1, m2 ; 0 8 + punpckhqdq m1, m2 ; 1 9 + call .write_16x2x2 + punpcklqdq m0, m3, m4 ; 2 10 + punpckhqdq m1, m3, m4 ; 3 11 + call .write_16x2x2 + punpcklqdq m0, m5, m6 ; 4 12 + punpckhqdq m1, m5, m6 ; 5 13 + call 
.write_16x2x2 + punpcklqdq m0, m7, m8 ; 6 14 + punpckhqdq m1, m7, m8 ; 7 15 +.write_16x2x2: + mova ym2, [dstq+strideq*0] + vinserti32x8 m2, [dstq+strideq*8], 1 + mova ym9, [dstq+strideq*1] + vinserti32x8 m9, [dstq+r6 ], 1 + paddw m0, m2 + paddw m1, m9 + pmaxsw m0, m14 + pmaxsw m1, m14 + pminsw m0, m15 + pminsw m1, m15 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*8], m0, 1 + mova [dstq+strideq*1], ym1 + vextracti32x8 [dstq+r6 ], m1, 1 + lea dstq, [dstq+strideq*2] + ret +.main_internal: + mova m8, [cq+128* 0] + packssdw m8, [cq+128* 8] + mova m6, [cq+128* 1] + packssdw m6, [cq+128* 9] + mova m0, [cq+128* 2] + packssdw m0, [cq+128*10] + mova m2, [cq+128* 3] + packssdw m2, [cq+128*11] + REPX {pmulhrsw x, m10}, m8, m6, m0, m2 + REPX {vpermq x, x, q3120}, m8, m6, m0, m2 + pmulhrsw m4, m11, m8 + pmulhrsw m9, m11, m6 + REPX {mova [cq+128*x], m14}, 0, 1, 2, 3 + pmulhrsw m4, m12 + pmulhrsw m9, m12 + paddsw m8, m4 + paddsw m6, m9 + pmulhrsw m4, m11, m0 + pmulhrsw m9, m11, m2 + REPX {mova [cq+128*x], m14}, 8, 9, 10, 11 + pmulhrsw m4, m12 + pmulhrsw m9, m12 + paddsw m0, m4 + paddsw m2, m9 + punpcklwd m4, m8, m6 + punpckhwd m8, m6 + punpcklwd m6, m0, m2 + punpckhwd m0, m2 + punpckldq m2, m4, m6 ; 0 1 + punpckhdq m4, m6 ; 2 3 + punpckldq m6, m8, m0 ; 4 5 + punpckhdq m8, m0 ; 6 7 + ret + +cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] +%if WIN64 + movaps [rsp+ 8], xmm6 + movaps [rsp+24], xmm7 +%endif + mov r6d, 8*12 + cmp eobd, 36 + jl .fast + pmulld m0, m12, [cq+64* 0] + pmulld m1, m12, [cq+64* 4] + pmulld m2, m12, [cq+64* 8] + pmulld m3, m12, [cq+64*12] + pmulld m16, m12, [cq+64* 2] + pmulld m17, m12, [cq+64* 6] + pmulld m18, m12, [cq+64*10] + pmulld m19, m12, [cq+64*14] + cmp eobd, 151 + jge .full + call 
m(idct_8x16_internal_10bpc).main_fast_rect2 + call m(idct_16x16_internal_10bpc).main_fast_rect2 + call .idct16_sumsub + call .pass1_load_spill + call .main_fast_rect2 + jmp .pass1_end +.full: + pmulld m4, m12, [cq+64*16] + pmulld m5, m12, [cq+64*20] + pmulld m6, m12, [cq+64*24] + pmulld m7, m12, [cq+64*28] + pmulld m20, m12, [cq+64*18] + pmulld m21, m12, [cq+64*22] + pmulld m22, m12, [cq+64*26] + pmulld m23, m12, [cq+64*30] + add r6d, 8*16 + call m(idct_8x16_internal_10bpc).main_rect2 + call m(idct_16x16_internal_10bpc).main_rect2 + call .idct16_sumsub + call .pass1_load_spill + pmulld m16, m12, [cq+64*17] + pmulld m17, m12, [cq+64*19] + pmulld m18, m12, [cq+64*21] + pmulld m19, m12, [cq+64*23] + pmulld m20, m12, [cq+64*25] + pmulld m21, m12, [cq+64*27] + pmulld m22, m12, [cq+64*29] + pmulld m23, m12, [cq+64*31] + call .main_rect2 +.pass1_end: + vpbroadcastd m11, [o(pd_1)] + lea r4, [cq+64] + call .idct32_pass1_end + lea r5, [o_base_8bpc] + punpckhqdq m19, m5, m16 ; 11 + punpcklqdq m5, m16 ; 10 + punpckhqdq m16, m2, m1 ; 5 + punpcklqdq m2, m1 ; 4 + punpcklqdq m1, m15, m4 ; 2 + punpckhqdq m15, m4 ; 3 + punpcklqdq m4, m14, m18 ; 8 + punpckhqdq m18, m14, m18 ; 9 + punpckhqdq m14, m0, m20 ; 1 + punpcklqdq m0, m20 ; 0 + punpckhqdq m20, m6, m17 ; 13 + punpcklqdq m6, m17 ; 12 + punpckhqdq m17, m3, m21 ; 7 + punpcklqdq m3, m21 ; 6 + punpckhqdq m21, m7, m8 ; 15 + punpcklqdq m7, m8 ; 14 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + jmp .end +.fast: + pmulld ym0, ym12, [cq+64*0] + pmulld ym1, ym12, [cq+64*4] + movshdup m7, [o(permB)] + mova ym4, [cq+64*2] + mova ym5, [cq+64*6] + mova ym16, [cq+64*1] + mova ym2, [cq+64*5] + mova ym3, [cq+64*3] + mova ym17, [cq+64*7] + vpermt2q m4, m7, m5 ; 2 6 + vpermt2q m16, m7, m2 ; 1 5 + vpermt2q m17, m7, m3 ; 7 3 + paddd ym0, ym13 + paddd ym1, ym13 + psrad ym0, 12 + psrad ym1, 12 + vpermq m0, m7, m0 ; 0 0 + vpermq m1, m7, m1 ; 4 4 + REPX {pmulld x, m12}, m4, m16, m17 + REPX {paddd x, 
m13}, m4, m16, m17 + REPX {psrad x, 12 }, m4, m16, m17 + call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2 + vpbroadcastd m11, [o(pd_1)] + call m(idct_16x16_internal_10bpc).main_end2 + call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32 + lea r5, [o_base_8bpc] + punpckhqdq m14, m0, m2 ; 1 + punpcklqdq m0, m2 ; 0 + punpcklqdq m1, m3, m4 ; 2 + punpckhqdq m15, m3, m4 ; 3 + punpcklqdq m2, m5, m7 ; 4 + punpckhqdq m16, m5, m7 ; 5 + punpcklqdq m3, m6, m8 ; 6 + punpckhqdq m17, m6, m8 ; 7 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast +.end: +%if WIN64 + movaps xmm6, [rsp+ 8] + movaps xmm7, [rsp+24] +%endif + pxor m12, m12 +.zero_loop: + mova [cq+r6*8+64*3], m12 + mova [cq+r6*8+64*2], m12 + mova [cq+r6*8+64*1], m12 + mova [cq+r6*8+64*0], m12 + sub r6d, 8*4 + jge .zero_loop + call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start + pmulhrsw m0, m11, m14 + pmulhrsw m1, m11, m15 + pmulhrsw m2, m11, m16 + pmulhrsw m3, m11, m17 + call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 + pmulhrsw m0, m11, m18 + pmulhrsw m1, m11, m19 + pmulhrsw m2, m11, m20 + pmulhrsw m3, m11, m21 + vzeroupper + jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd + or r3d, 16 +.dconly3: + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 +.dconly2: + vpbroadcastd m3, [o(dconly_10bpc)] + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + vpbroadcastw m2, r6d + paddsw m2, m3 +.dconly_loop: + paddsw m0, m2, [dstq+strideq*0] + paddsw m1, m2, [dstq+strideq*1] + psubusw m0, m3 + psubusw m1, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +ALIGN function_align +.idct16_sumsub: + psubd m23, m0, m22 ; t15 + paddd m0, m22 ; t0 + psubd m22, m1, m21 ; t14 + paddd m1, m21 ; t1 + REPX {pmaxsd x, m14}, m23, m0, m22, m1 + psubd m21, m2, m20 ; t13 + paddd m2, m20 ; t2 + REPX {pminsd x, m15}, m23, m0, m22, m1 + psubd m20, m3, m19 ; t12 + paddd m3, m19 ; t3 + REPX 
{pmaxsd x, m14}, m21, m2, m20, m3 + psubd m19, m4, m18 ; t11 + paddd m4, m18 ; t4 + REPX {pminsd x, m15}, m21, m2, m20, m3 + psubd m18, m5, m17 ; t10 + paddd m5, m17 ; t5 + REPX {pmaxsd x, m14}, m19, m4, m18, m5 + psubd m17, m6, m16 ; t9 + paddd m6, m16 ; t6 + REPX {pminsd x, m15}, m19, m4, m18, m5 + psubd m16, m7, m9 ; t8 + paddd m7, m9 ; t7 + REPX {pmaxsd x, m14}, m17, m6, m16, m7 + REPX {pminsd x, m15}, m17, m6, m16, m7 + ret +.idct32_pass1_end: + psrlq m12, [o(permC)], 24 ; 0 2 8 10 1 3 9 11 + psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15 +%macro IDCT32_PASS1_END 2 ; low, high + paddd m8, m11, [r4+128*%1] + paddd m9, m11, [cq+128*%1] + psubd m10, m8, m%1 ; out 16+n + paddd m8, m%1 ; out 15-n + paddd m%1, m9, m%2 ; out 0+n + psubd m9, m%2 ; out 31-n + REPX {vpsravd x, m11}, m10, m%1, m8, m9 + packssdw m%1, m10 ; 0+n 16+n + packssdw m%2, m8, m9 ; 15-n 31-n +%endmacro + IDCT32_PASS1_END 0, 23 ; 0 16, 15 31 + IDCT32_PASS1_END 7, 16 ; 7 23, 8 24 + IDCT32_PASS1_END 1, 22 ; 1 17, 14 30 + IDCT32_PASS1_END 6, 17 ; 6 22, 9 25 + IDCT32_PASS1_END 2, 21 ; 2 18, 13 29 + IDCT32_PASS1_END 5, 18 ; 5 21, 10 26 + IDCT32_PASS1_END 3, 20 ; 3 19, 12 28 + IDCT32_PASS1_END 4, 19 ; 4 20, 11 27 +.transpose_16x32: + mova m14, m13 + vpermi2q m14, m0, m16 + vpermt2q m0, m12, m16 + mova m15, m13 + vpermi2q m15, m1, m17 + vpermt2q m1, m12, m17 + mova m16, m13 + vpermi2q m16, m2, m18 + vpermt2q m2, m12, m18 + mova m17, m13 + vpermi2q m17, m3, m19 + vpermt2q m3, m12, m19 + mova m18, m13 + vpermi2q m18, m4, m20 + vpermt2q m4, m12, m20 + mova m19, m13 + vpermi2q m19, m5, m21 + vpermt2q m5, m12, m21 + mova m20, m13 + vpermi2q m20, m6, m22 + vpermt2q m6, m12, m22 + mova m21, m13 + vpermi2q m21, m7, m23 + vpermt2q m7, m12, m23 + punpckhwd m8, m2, m3 ; c04 d04 c05 d05 c06 d06 c07 d07 + punpcklwd m2, m3 ; c00 d00 c01 d01 c02 d02 c03 d03 + punpckhwd m3, m0, m1 ; a04 b04 a05 b05 a06 b06 a07 b07 + punpcklwd m0, m1 ; a00 b00 a01 b01 a02 b02 a03 b03 + punpckhwd m1, m4, m5 ; e04 f04 e05 f05 e06 f06 e07 f07 + 
punpcklwd m4, m5 ; e00 f00 e01 f01 e02 f02 e03 f03 + punpckhwd m5, m6, m7 ; g04 h04 g05 h05 g06 h06 g07 h07 + punpcklwd m6, m7 ; g00 h00 g01 h01 g02 h02 g03 h03 + punpckhwd m7, m14, m15 ; a12 b12 a13 b13 a14 b14 a15 b15 + punpcklwd m14, m15 ; a08 b08 a09 b09 a10 b10 a11 b11 + punpckhwd m15, m16, m17 ; c12 d12 c13 d13 c14 d14 c15 d15 + punpcklwd m16, m17 ; c08 d08 c09 d09 c10 d10 c11 d11 + punpckhwd m17, m18, m19 ; e12 f12 e13 f13 e14 f14 e15 f15 + punpcklwd m18, m19 ; e08 f08 e09 f09 e10 f10 e11 f11 + punpckhwd m19, m20, m21 ; g12 h12 g13 h13 g14 h14 g15 h15 + punpcklwd m20, m21 ; g08 h08 g09 h09 g10 h10 g11 h11 + punpckhdq m21, m1, m5 ; e06 f06 g06 h06 e07 f07 g07 h07 + punpckldq m1, m5 ; e04 f04 g04 h04 e05 f05 g05 h05 + punpckhdq m5, m14, m16 ; a10 b10 c10 d10 a11 b11 c11 d11 + punpckldq m14, m16 ; a08 b08 c08 d08 a09 b09 c09 d09 + punpckhdq m16, m18, m20 ; e10 f10 g10 h10 e11 f11 g11 h11 + punpckldq m18, m20 ; e08 f08 g08 h08 e09 f09 g09 h09 + punpckldq m20, m4, m6 ; e00 f00 g00 h00 e01 f01 g01 h01 + punpckhdq m4, m6 ; e02 f02 g02 h02 e03 f03 g03 h03 + punpckldq m6, m7, m15 ; a12 b12 c12 d12 a13 b13 c13 d13 + punpckhdq m7, m15 ; a14 b14 c14 d14 a15 b15 c15 d15 + punpckhdq m15, m0, m2 ; a02 b02 c02 d02 a03 b03 c03 d03 + punpckldq m0, m2 ; a00 b00 c00 d00 a01 b01 c01 d01 + punpckldq m2, m3, m8 ; a04 b04 c04 d04 a05 b05 c05 d05 + punpckhdq m3, m8 ; a06 b06 c06 d06 a07 b07 c07 d07 + punpckhdq m8, m17, m19 ; e14 f14 g14 h14 e15 f15 g15 h15 + punpckldq m17, m19 ; e12 f12 g12 h12 e13 f13 g13 h13 + ret +.pass1_load_spill: + mova [cq+64* 0], m0 + mova [cq+64* 2], m1 + mova [cq+64* 4], m2 + mova [cq+64* 6], m3 + mova [cq+64* 8], m4 + mova [cq+64*10], m5 + mova [cq+64*12], m6 + mova [cq+64*14], m7 + pmulld m0, m12, [cq+64* 1] + pmulld m1, m12, [cq+64* 3] + pmulld m2, m12, [cq+64* 5] + pmulld m3, m12, [cq+64* 7] + pmulld m4, m12, [cq+64* 9] + pmulld m5, m12, [cq+64*11] + pmulld m6, m12, [cq+64*13] + pmulld m7, m12, [cq+64*15] + mova [cq+64* 1], m23 + mova [cq+64* 3], m22 + 
mova [cq+64* 5], m21 + mova [cq+64* 7], m20 + mova [cq+64* 9], m19 + mova [cq+64*11], m18 + mova [cq+64*13], m17 + mova [cq+64*15], m16 + ret +.main_fast2_rect2: + REPX {paddd x, m13}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_fast2: ; bottom 3/4 is zero + pmulld m23, m0, [o(pd_4091)] {1to16} ; t31a + pmulld m0, [o(pd_201)] {1to16} ; t16a + pmulld m20, m3, [o(pd_1380)] {1to16} ; t19a + pmulld m3, [o(pd_3857)] {1to16} ; t28a + pmulld m21, m2, [o(pd_3973)] {1to16} ; t27a + pmulld m2, [o(pd_995)] {1to16} ; t20a + pmulld m6, m1, [o(pd_601)] {1to16} ; t23a + pmulld m17, m1, [o(pd_4052)] {1to16} ; t24a + REPX {psubd x, m13, x}, m20, m6 + REPX {paddd x, m13}, m23, m0, m3, m21, m2, m17 + REPX {psrad x, 12 }, m20, m6, m23, m0, m3, m21, m2, m17 + mova m8, m0 + mova m16, m23 + mova m7, m20 + mova m4, m3 + mova m19, m2 + mova m18, m21 + mova m5, m6 + mova m22, m17 + jmp .main3 +.main_fast_rect2: + call m(idct_8x16_internal_10bpc).round +.main_fast: ; bottom half is zero + pmulld m23, m0, [o(pd_4091)] {1to16} ; t31a + pmulld m0, [o(pd_201)] {1to16} ; t16a + pmulld m16, m7, [o(pd_2751)] {1to16} ; t17a + pmulld m7, [o(pd_3035)] {1to16} ; t30a + pmulld m19, m4, [o(pd_3703)] {1to16} ; t29a + pmulld m4, [o(pd_1751)] {1to16} ; t18a + pmulld m20, m3, [o(pd_1380)] {1to16} ; t19a + pmulld m3, [o(pd_3857)] {1to16} ; t28a + pmulld m21, m2, [o(pd_3973)] {1to16} ; t27a + pmulld m2, [o(pd_995)] {1to16} ; t20a + pmulld m18, m5, [o(pd_2106)] {1to16} ; t21a + pmulld m5, [o(pd_3513)] {1to16} ; t26a + pmulld m17, m6, [o(pd_3290)] {1to16} ; t25a + pmulld m6, [o(pd_2440)] {1to16} ; t22a + pmulld m22, m1, [o(pd_601)] {1to16} ; t23a + pmulld m1, [o(pd_4052)] {1to16} ; t24a + REPX {psubd x, m13, x}, m16, m20, m18, m22 + call m(idct_16x16_internal_10bpc).round3 + jmp .main2 +.main_rect2: + call m(idct_8x16_internal_10bpc).round + call m(idct_16x16_internal_10bpc).round +.main: + ITX_MULSUB_2D 0, 23, 8, 9, 10, _, 201, 4091 ; t16a, t31a + ITX_MULSUB_2D 16, 7, 8, 9, 10, _, 3035, 2751 ; 
t17a, t30a + ITX_MULSUB_2D 4, 19, 8, 9, 10, _, 1751, 3703 ; t18a, t29a + ITX_MULSUB_2D 20, 3, 8, 9, 10, _, 3857, 1380 ; t19a, t28a + ITX_MULSUB_2D 2, 21, 8, 9, 10, _, 995, 3973 ; t20a, t27a + ITX_MULSUB_2D 18, 5, 8, 9, 10, _, 3513, 2106 ; t21a, t26a + ITX_MULSUB_2D 6, 17, 8, 9, 10, _, 2440, 3290 ; t22a, t25a + ITX_MULSUB_2D 22, 1, 8, 9, 10, _, 4052, 601 ; t23a, t24a + call m(idct_16x16_internal_10bpc).round +.main2: + call m(idct_8x16_internal_10bpc).round + psubd m8, m0, m16 ; t17 + paddd m0, m16 ; t16 + psubd m16, m23, m7 ; t30 + paddd m23, m7 ; t31 + REPX {pmaxsd x, m14}, m8, m0, m16, m23 + paddd m7, m20, m4 ; t19 + psubd m20, m4 ; t18 + REPX {pminsd x, m15}, m8, m0, m16, m23 + paddd m4, m3, m19 ; t28 + psubd m3, m19 ; t29 + REPX {pmaxsd x, m14}, m7, m20, m4, m3 + psubd m19, m2, m18 ; t21 + paddd m2, m18 ; t20 + REPX {pminsd x, m15}, m7, m20, m4, m3 + psubd m18, m21, m5 ; t26 + paddd m21, m5 ; t27 + REPX {pmaxsd x, m14}, m19, m2, m18, m21 + psubd m5, m22, m6 ; t22 + paddd m6, m22 ; t23 + REPX {pminsd x, m15}, m19, m2, m18, m21 + psubd m22, m1, m17 ; t25 + paddd m17, m1 ; t24 + REPX {pmaxsd x, m14}, m5, m6, m22, m17 + REPX {pminsd x, m15}, m5, m6, m22, m17 +.main3: + vpbroadcastd m11, [o(pd_4017)] + vpbroadcastd m10, [o(pd_799)] + ITX_MULSUB_2D 16, 8, 9, 1, _, 13, 10, 11 ; t17a, t30a + ITX_MULSUB_2D 3, 20, 9, 1, _, 13, 10, 11, 2 ; t29a, t18a + vpbroadcastd m11, [o(pd_2276)] + vpbroadcastd m10, [o(pd_3406)] + ITX_MULSUB_2D 18, 19, 9, 1, _, 13, 10, 11 ; t21a, t26a + ITX_MULSUB_2D 22, 5, 9, 1, _, 13, 10, 11, 2 ; t25a, t22a + paddd m1, m6, m2 ; t23a + psubd m6, m2 ; t20a + psubd m2, m17, m21 ; t27a + paddd m17, m21 ; t24a + REPX {pmaxsd x, m14}, m1, m6, m2, m17 + psubd m21, m23, m4 ; t28a + paddd m23, m4 ; t31a + REPX {pminsd x, m15}, m1, m6, m2, m17 + psubd m4, m16, m20 ; t18 + paddd m16, m20 ; t17 + REPX {pmaxsd x, m14}, m21, m23, m4, m16 + psubd m20, m0, m7 ; t19a + paddd m0, m7 ; t16a + REPX {pminsd x, m15}, m21, m23, m4, m16 + psubd m7, m8, m3 ; t29 + paddd m3, 
m8 ; t30 + REPX {pmaxsd x, m14}, m20, m0, m7, m3 + paddd m8, m5, m18 ; t22 + psubd m5, m18 ; t21 + REPX {pminsd x, m15}, m20, m0, m7, m3 + psubd m18, m22, m19 ; t26 + paddd m22, m19 ; t25 + REPX {pmaxsd x, m14}, m8, m5, m18, m22 + vpbroadcastd m11, [o(pd_3784)] + vpbroadcastd m10, [o(pd_1567)] + REPX {pminsd x, m15}, m8, m5, m18, m22 + ITX_MULSUB_2D 21, 20, 9, 19, _, 13, 10, 11 ; t19, t28 + ITX_MULSUB_2D 2, 6, 9, 19, _, 13, 10, 11, 2 ; t27, t20 + ITX_MULSUB_2D 7, 4, 9, 19, _, 13, 10, 11 ; t18a, t29a + ITX_MULSUB_2D 18, 5, 9, 19, _, 13, 10, 11, 2 ; t26a, t21a + psubd m19, m0, m1 ; t23 + paddd m0, m1 ; t16 + paddd m1, m8, m16 ; t17a + psubd m8, m16, m8 ; t22a + REPX {pmaxsd x, m14}, m19, m0, m1, m8 + psubd m16, m23, m17 ; t24 + paddd m23, m17 ; t31 + REPX {pminsd x, m15}, m19, m0, m1, m8 + psubd m17, m3, m22 ; t25a + paddd m22, m3 ; t30a + REPX {pmaxsd x, m14}, m16, m23, m17, m22 + paddd m3, m6, m21 ; t19a + psubd m6, m21, m6 ; t20a + REPX {pminsd x, m15}, m16, m23, m17, m22 + paddd m21, m18, m4 ; t29 + psubd m18, m4, m18 ; t26 + REPX {pmaxsd x, m14}, m3, m6, m21, m18 + psubd m4, m20, m2 ; t27a + paddd m20, m2 ; t28a + REPX {pminsd x, m15}, m3, m6, m21, m18 + paddd m2, m7, m5 ; t18 + psubd m7, m5 ; t21 + REPX {pmaxsd x, m14}, m4, m20, m2, m7 + REPX {pminsd x, m15}, m4, m20, m2, m7 + REPX {pmulld x, m12}, m18, m16, m4, m17, m7, m19, m6, m8 + REPX {paddd x, m13}, m18, m16, m4, m17 + psubd m5, m18, m7 ; t21a + paddd m18, m7 ; t26a + psubd m7, m16, m19 ; t23a + paddd m16, m19 ; t24a + REPX {psrad x, 12 }, m5, m18, m7, m16 + paddd m19, m4, m6 ; t27 + psubd m4, m6 ; t20 + psubd m6, m17, m8 ; t22 + paddd m17, m8 ; t25 + REPX {psrad x, 12 }, m19, m4, m6, m17 + ret + +cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 16, dst, stride, c, eob +%undef cmp + vpbroadcastd m10, [pw_2896x8] + vpbroadcastd m11, [pw_1697x16] + vpbroadcastd m13, [pw_2048] + vpbroadcastd m15, [pixel_10bpc_max] + lea r6, [strideq*9] + pxor m14, m14 + cmp eobd, 151 + jl .main + mov r4, dstq + call 
.main + add cq, 64*12 + lea dstq, [r4+32] +.main: + call .main_internal + add cq, 64*4 + pmulhrsw m1, m13, m2 + pmulhrsw m3, m13, m4 + pmulhrsw m5, m13, m6 + pmulhrsw m7, m13, m8 + call .main_internal + jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2 +.main_internal: + mova m8, [cq+64* 0] + packssdw m8, [cq+64* 8] + mova m6, [cq+64* 1] + packssdw m6, [cq+64* 9] + mova m0, [cq+64* 2] + packssdw m0, [cq+64*10] + mova m2, [cq+64* 3] + packssdw m2, [cq+64*11] + REPX {pmulhrsw x, m10}, m8, m6, m0, m2 + REPX {paddsw x, x }, m8, m6, m0, m2 + REPX {vpermq x, x, q3120}, m8, m6, m0, m2 + pmulhrsw m4, m11, m8 + pmulhrsw m9, m11, m6 + paddsw m8, m8 + paddsw m6, m6 + REPX {mova [cq+64*x], m14}, 0, 1, 2, 3 + paddsw m8, m4 + paddsw m6, m9 + pmulhrsw m4, m11, m0 + pmulhrsw m9, m11, m2 + paddsw m0, m0 + paddsw m2, m2 + REPX {mova [cq+64*x], m14}, 8, 9, 10, 11 + paddsw m0, m4 + paddsw m2, m9 + punpcklwd m4, m8, m6 + punpckhwd m8, m6 + punpcklwd m6, m0, m2 + punpckhwd m0, m2 + punpckldq m2, m4, m6 ; 0 1 + punpckhdq m4, m6 ; 2 3 + punpckldq m6, m8, m0 ; 4 5 + punpckhdq m8, m0 ; 6 7 + ret + +cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + WIN64_SPILL_XMM 30 + cmp eobd, 136 + jl .fast + add cq, 64 + cmp eobd, 543 + jge .full + call .pass1_fast ; bottomright 16x16 zero + mov r6d, 16*12 + jmp .lefthalf +.full: + call .pass1 + mov r6d, 16*28 +.lefthalf: + mova [cq+128* 0], m0 + mova [cq+128* 1], m1 + mova [cq+128* 2], m2 + mova [cq+128* 3], m3 + mova [cq+128* 4], m14 + mova [cq+128* 5], m15 + mova [cq+128* 6], m16 + mova [cq+128* 7], m17 + mova [cq+128* 8], m22 + mova [cq+128* 9], m23 + mova [cq+128*10], m24 + mova [cq+128*11], m25 + mova [cq+128*12], m26 + mova [cq+128*13], m27 + mova [cq+128*14], m28 + mova [cq+128*15], m29 + sub cq, 64 + vpbroadcastd 
m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + call .pass1 + lea r5, [o_base_8bpc] + call .pass2_start + pxor m12, m12 +.right_zero_loop: + mova [cq+r6*8+64+128*3], m12 + mova [cq+r6*8+64+128*2], m12 + mova [cq+r6*8+64+128*1], m12 + mova [cq+r6*8+64+128*0], m12 + sub r6d, 16*4 + jge .right_zero_loop + mov r6d, 16*28 + jmp .end2 +.pass2_start: + mova m4, [cq+64+128* 0] + mova m5, [cq+64+128* 1] + mova m6, [cq+64+128* 2] + mova m7, [cq+64+128* 3] + mova m18, [cq+64+128* 4] + mova m19, [cq+64+128* 5] + mova m20, [cq+64+128* 6] + mova m21, [cq+64+128* 7] + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + mova [cq+128*0], m14 + mova [cq+128*1], m15 + mova [cq+128*2], m16 + mova [cq+128*3], m17 + mova [cq+128*4], m18 + mova [cq+128*5], m19 + mova [cq+128*6], m20 + mova [cq+128*7], m21 + mova m14, [cq+64+128* 8] + mova m15, [cq+64+128* 9] + mova m16, [cq+64+128*10] + mova m17, [cq+64+128*11] + mova m18, [cq+64+128*12] + mova m19, [cq+64+128*13] + mova m20, [cq+64+128*14] + mova m21, [cq+64+128*15] + jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf +.fast: ; topleft 16x16 nonzero + cmp eobd, 36 + jl .fast2 + call .pass1_fast + lea r5, [o_base_8bpc] + call .pass2_fast_start + jmp .end +.pass2_fast_start: + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast + mova [cq+128*0], m14 + mova [cq+128*1], m15 + mova [cq+128*2], m16 + mova [cq+128*3], m17 + mova [cq+128*4], m18 + mova [cq+128*5], m19 + mova [cq+128*6], m20 + mova [cq+128*7], m21 + jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast +.fast2: ; topleft 8x8 nonzero + movshdup m7, [o(permB)] + mova ym0, [cq+128*0] + mova ym1, [cq+128*4] + mova ym4, [cq+128*2] + mova ym5, [cq+128*6] + mova ym16, [cq+128*1] + mova ym2, [cq+128*5] + mova ym3, [cq+128*3] + mova ym17, [cq+128*7] + mov r6d, 16*4 + vpermq m0, m7, m0 ; 0 0 + vpermq m1, m7, m1 ; 4 4 + vpermt2q m4, m7, m5 ; 2 6 + 
vpermt2q m16, m7, m2 ; 1 5 + vpermt2q m17, m7, m3 ; 7 3 + call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2 + call m(idct_16x16_internal_10bpc).main_end + call .pass2_fast2_start +.end: + pxor m12, m12 +.end2: + call .pass2_end +.zero_loop: + mova [cq+r6*8+128*3], m12 + mova [cq+r6*8+128*2], m12 + mova [cq+r6*8+128*1], m12 + mova [cq+r6*8+128*0], m12 + sub r6d, 16*4 + jge .zero_loop + WIN64_RESTORE_XMM + vzeroupper + ret +.pass2_fast2_start: + call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32 + lea r5, [o_base_8bpc] + punpckhqdq m22, m0, m2 ; 1 + punpcklqdq m0, m2 ; 0 + punpcklqdq m1, m5, m7 ; 4 + punpckhqdq m24, m5, m7 ; 5 + punpcklqdq m14, m3, m4 ; 2 + punpckhqdq m23, m3, m4 ; 3 + punpcklqdq m15, m6, m8 ; 6 + punpckhqdq m25, m6, m8 ; 7 + mova m10, m13 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 + mova [cq+128*0], m14 + mova [cq+128*1], m15 + mova [cq+128*2], m16 + mova [cq+128*3], m17 + mova [cq+128*4], m18 + mova [cq+128*5], m19 + mova [cq+128*6], m20 + mova [cq+128*7], m21 + jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 +.pass2_end: + psubsw m9, m0, m29 ; out31 + paddsw m0, m29 ; out0 + psubsw m29, m1, m28 ; out30 + paddsw m1, m28 ; out1 + psubsw m28, m2, m27 ; out29 + paddsw m2, m27 ; out2 + psubsw m27, m3, m26 ; out28 + paddsw m3, m26 ; out3 + psubsw m26, m4, m25 ; out27 + paddsw m4, m25 ; out4 + psubsw m25, m5, m24 ; out26 + paddsw m5, m24 ; out5 + psubsw m24, m6, m23 ; out25 + paddsw m6, m23 ; out6 + psubsw m23, m7, m22 ; out24 + paddsw m7, m22 ; out7 + call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start + mova m0, [cq+128*0] + mova m1, [cq+128*1] + mova m2, [cq+128*2] + mova m3, [cq+128*3] + mova m4, [cq+128*4] + mova m5, [cq+128*5] + mova m6, [cq+128*6] + mova m7, [cq+128*7] + psubsw m22, m0, m21 ; out23 + paddsw m0, m21 ; out8 + psubsw m21, m1, m20 ; out22 + paddsw m1, m20 ; out9 + psubsw m20, m2, m19 ; out21 + paddsw m2, m19 ; out10 + psubsw m19, m3, m18 ; out20 + paddsw m3, m18 ; out11 + psubsw m18, m4, m17 ; 
out19 + paddsw m4, m17 ; out12 + psubsw m17, m5, m16 ; out18 + paddsw m5, m16 ; out13 + psubsw m16, m6, m15 ; out17 + paddsw m6, m15 ; out14 + psubsw m15, m7, m14 ; out16 + paddsw m7, m14 ; out15 + call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8 + pmulhrsw m0, m11, m15 + pmulhrsw m1, m11, m16 + pmulhrsw m2, m11, m17 + pmulhrsw m3, m11, m18 + call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 + pmulhrsw m0, m11, m19 + pmulhrsw m1, m11, m20 + pmulhrsw m2, m11, m21 + pmulhrsw m3, m11, m22 + call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 + pmulhrsw m0, m11, m23 + pmulhrsw m1, m11, m24 + pmulhrsw m2, m11, m25 + pmulhrsw m3, m11, m26 + call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 + pmulhrsw m0, m11, m27 + pmulhrsw m1, m11, m28 + pmulhrsw m2, m11, m29 + pmulhrsw m3, m11, m9 + jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2 +.pass1_fast: + mova m0, [cq+128* 0] + mova m1, [cq+128* 4] + mova m2, [cq+128* 8] + mova m3, [cq+128*12] + mov r6d, 16*12 + call m(idct_8x16_internal_10bpc).main_fast + mova m16, [cq+128* 2] + mova m17, [cq+128* 6] + mova m18, [cq+128*10] + mova m19, [cq+128*14] + call m(idct_16x16_internal_10bpc).main_fast + call .pass1_load_spill + call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast + jmp .pass1_end +.pass1: + mova m0, [cq+128* 0] + mova m1, [cq+128* 4] + mova m2, [cq+128* 8] + mova m3, [cq+128*12] + mova m4, [cq+128*16] + mova m5, [cq+128*20] + mova m6, [cq+128*24] + mova m7, [cq+128*28] + call m(idct_8x16_internal_10bpc).main + mova m16, [cq+128* 2] + mova m17, [cq+128* 6] + mova m18, [cq+128*10] + mova m19, [cq+128*14] + mova m20, [cq+128*18] + mova m21, [cq+128*22] + mova m22, [cq+128*26] + mova m23, [cq+128*30] + call m(idct_16x16_internal_10bpc).main + call .pass1_load_spill + mova m16, [cq+128*17] + mova m17, [cq+128*19] + mova m18, [cq+128*21] + mova m19, [cq+128*23] + mova m20, [cq+128*25] 
+ mova m21, [cq+128*27] + mova m22, [cq+128*29] + mova m23, [cq+128*31] + call m(inv_txfm_add_dct_dct_32x16_10bpc).main +.pass1_end: + vpbroadcastd m11, [o(pd_2)] + lea r4, [cq+128*8] + call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end + punpckhqdq m22, m0, m20 ; 1 + punpcklqdq m0, m20 ; 0 + punpckhqdq m24, m2, m1 ; 5 + punpcklqdq m1, m2, m1 ; 4 + punpcklqdq m2, m14, m18 ; 8 + punpckhqdq m26, m14, m18 ; 9 + punpcklqdq m14, m15, m4 ; 2 + punpckhqdq m23, m15, m4 ; 3 + punpckhqdq m25, m3, m21 ; 7 + punpcklqdq m15, m3, m21 ; 6 + punpckhqdq m28, m6, m17 ; 13 + punpcklqdq m3, m6, m17 ; 12 + punpckhqdq m27, m5, m16 ; 11 + punpcklqdq m16, m5, m16 ; 10 + punpckhqdq m29, m7, m8 ; 15 + punpcklqdq m17, m7, m8 ; 14 + ret +.pass1_load_spill: + call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub + mova [cq+128* 0], m0 + mova m0, [cq+128* 1] + mova [cq+128* 1], m1 + mova [cq+128* 2], m2 + mova m1, [cq+128* 3] + mova m2, [cq+128* 5] + mova [cq+128* 3], m3 + mova [cq+128* 4], m4 + mova m3, [cq+128* 7] + mova m4, [cq+128* 9] + mova [cq+128* 5], m5 + mova [cq+128* 6], m6 + mova [cq+128* 7], m7 + mova m5, [cq+128*11] + mova m6, [cq+128*13] + mova m7, [cq+128*15] + mova [cq+128* 8], m23 + mova [cq+128* 9], m22 + mova [cq+128*10], m21 + mova [cq+128*11], m20 + mova [cq+128*12], m19 + mova [cq+128*13], m18 + mova [cq+128*14], m17 + mova [cq+128*15], m16 + ret + +; inv_txfm_add_identity_identity_32x32_10bpc(dst, stride, c, eob) +; Packs the dword coefficients at cq to words, zeroes the coefficient memory it has read, and hands the packed data to the .main2 tail of the 16x32 identity routine, which is outside this view (presumably the scale, add-to-dst and clamp step; confirm there). +; eob picks the amount of work: below 136 a single .main pass, below 543 three passes, otherwise four. +cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 7, 16, dst, stride, c, eob +%undef cmp + ; m13 = pw_8192 broadcast, m15 = pixel_10bpc_max broadcast, m14 = zero (stored over consumed coefficients) + vpbroadcastd m13, [pw_8192] + vpbroadcastd m15, [pixel_10bpc_max] + pxor m14, m14 + ; r6 = 9*stride; nothing in this routine reads it (presumably consumed by .main2) + lea r6, [strideq*9] + ; small eob: a single .main pass, entered by jump instead of call + cmp eobd, 136 + jl .main + ; r4 remembers the original dst so the second column can be derived from it + mov r4, dstq + call .main + add cq, 64-128*4 + lea dstq, [dstq+strideq*8] + call .main + ; second column: destination restarts 32 bytes to the right of the original dst + add cq, 128*12-64 + lea dstq, [r4+32] + cmp eobd, 543 + jl .main + call .main + add cq, 64-128*4 + lea dstq, [dstq+strideq*8] + ; falls through into the last .main pass +.main: + ; two .main_internal batches; the first batch is scaled by m13 into m1/m3/m5/m7, which leaves m2/m4/m6/m8 free for the second batch + call .main_internal + add cq, 128*4 + pmulhrsw m1, m13, m2 + pmulhrsw m3, m13, m4 + pmulhrsw m5, m13, m6 + pmulhrsw m7, m13, m8 + call .main_internal + jmp
m(inv_txfm_add_identity_identity_16x32_10bpc).main2 +.main_internal: + ; loads the blocks at cq offsets 128*0 .. 128*3 and packs each with the block 8*128 bytes further on (packssdw: signed saturation to words), zeroes all eight source blocks with m14, then interleaves words and dwords; results are left in m2, m4, m6, m8 + mova m8, [cq+128* 0] + packssdw m8, [cq+128* 8] + mova m6, [cq+128* 1] + packssdw m6, [cq+128* 9] + mova m0, [cq+128* 2] + packssdw m0, [cq+128*10] + mova m2, [cq+128* 3] + packssdw m2, [cq+128*11] + REPX {vpermq x, x, q3120}, m8, m6, m0, m2 + REPX {mova [cq+128*x], m14}, 0, 1, 2, 3 + punpcklwd m4, m8, m6 + punpckhwd m8, m6 + punpcklwd m6, m0, m2 + punpckhwd m0, m2 + REPX {mova [cq+128*x], m14}, 8, 9, 10, 11 + punpckldq m2, m4, m6 ; 0 1 + punpckhdq m4, m6 ; 2 3 + punpckldq m6, m8, m0 ; 4 5 + punpckhdq m8, m0 ; 6 7 + ret + +; inv_txfm_add_dct_dct_16x64_10bpc(dst, stride, c, eob) +; eob of zero takes .dconly; the stack frame (8*mmsize bytes) is only requested by the PROLOGUE that follows that test. Otherwise eob selects the first pass: below 36 the .fast path, below 151 one .pass1 run, otherwise (.full) two .pass1 runs. +; After the first pass r5 is switched to o_base_8bpc and the second pass is delegated to the 8bpc idct16x16 / 16x32 / 16x64 helpers (bodies outside this view); all paths meet at .pass2_end, which emits the result through idct_16x8 .write_16x4. +cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob + lea r5, [o_base] + test eobd, eobd + jz .dconly + + PROLOGUE 4, 7, 32, -8*mmsize, dst, stride, c, eob +%undef cmp + ; constants kept in m12-m15 for the first pass: pd_2896, pd_2048, clip_18b_min, clip_18b_max + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + cmp eobd, 36 + jl .fast + call .pass1 + cmp eobd, 151 + jge .full + ; eob from 36 to 150: one first-pass result in m0-m7; each word is duplicated (or paired with a zero word, for m9) to build the inputs of the 8bpc fast2 helpers + lea r5, [o_base_8bpc] + + punpckhwd m22, m0, m0 + punpckhwd m23, m1, m1 + punpckhwd m24, m2, m2 + punpckhwd m25, m3, m3 + punpckhwd m26, m4, m4 + punpckhwd m27, m5, m5 + punpckhwd m28, m6, m6 + punpckhwd m29, m7, m7 + punpcklwd m21, m1, m1 + punpcklwd m14, m3, m3 + punpcklwd m18, m5, m5 + punpcklwd m15, m7, m7 + pxor m9, m9 + punpcklwd m9, m9, m0 + punpcklwd m8, m2, m2 + punpcklwd m7, m4, m4 + punpcklwd m1, m6, m6 + call m(idct_16x16_internal_8bpc).main_fast2 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 + ; park m14-m21 in the eight stack slots; .pass2_end reloads them + mova [rsp+mmsize*0], m14 + mova [rsp+mmsize*1], m15 + mova [rsp+mmsize*2], m16 + mova [rsp+mmsize*3], m17 + mova [rsp+mmsize*4], m18 + mova [rsp+mmsize*5], m19 + mova [rsp+mmsize*6], m20 + mova [rsp+mmsize*7], m21 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast + + ; clear consumed coefficients: r3 runs 192, 128, 64, 0 and r3*8 steps through groups of four 128-byte-strided blocks (16 blocks in total) + pxor m12, m12 + mov r3d, 64*3 +.zero_loop: + REPX {mova [cq+r3*8+128*x], m12}, 0, 1, 2, 3 + sub r3d, 64 + jge .zero_loop + + jmp .pass2_end +.full: + ; eob of 151 or more: the first .pass1 result is kept in the coefficient buffer while .pass1 runs a second time + mova [cq+128*0], m0 + mova [cq+128*1], m1 +
mova [cq+128*2], m2 + mova [cq+128*3], m3 + mova [cq+128*4], m4 + mova [cq+128*5], m5 + mova [cq+128*6], m6 + mova [cq+128*7], m7 + ; second first-pass run, on the data 64 bytes further into the coefficient buffer + add cq, 64 + call .pass1 + sub cq, 64 + ; reload the earlier result into m22-m29, then stash the new m0-m7 at cq offsets 64*8 .. 64*15 + mova m22, [cq+128*0] ; 0 1 + mova m23, [cq+128*1] ; 2 3 + mova m24, [cq+128*2] ; 4 5 + mova m25, [cq+128*3] ; 6 7 + mova m26, [cq+128*4] ; 8 9 + mova m27, [cq+128*5] ; 10 11 + mova m28, [cq+128*6] ; 12 13 + mova m29, [cq+128*7] ; 14 15 + mova [cq+64* 8], m0 + mova [cq+64* 9], m1 + mova [cq+64*10], m2 + mova [cq+64*11], m3 + mova [cq+64*12], m4 + mova [cq+64*13], m5 + mova [cq+64*14], m6 + mova [cq+64*15], m7 + ; from here on r5 addresses the 8bpc constant table used by the 8bpc helpers + lea r5, [o_base_8bpc] + + punpcklwd m20, m1, m1 + punpcklwd m16, m3, m3 + punpcklwd m19, m5, m5 + punpcklwd m17, m7, m7 + punpcklwd m8, m24, m24 ; 4 + punpcklwd m5, m2, m2 ; 20 + punpcklwd m1, m28, m28 ; 12 + punpcklwd m7, m26, m26 ; 8 + punpcklwd m3, m4, m4 ; 24 + punpcklwd m4, m6, m6 ; 28 + pxor m9, m9 + punpcklwd m6, m9, m0 ; __ 16 + mova m0, m4 + punpcklwd m9, m9, m22 ; __ 0 + call m(idct_16x16_internal_8bpc).main_fast + punpcklwd m21, m23, m23 ; 2 + punpcklwd m15, m29, m29 ; 14 + punpcklwd m18, m27, m27 ; 10 + punpcklwd m14, m25, m25 ; 6 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + ; park m14-m21 in the eight stack slots (reloaded at .pass2_end), then fetch the stashed second-run data back into m14-m21 and keep only the duplicated high words of all sixteen inputs for main_oddhalf + mova [rsp+mmsize*0], m14 + mova [rsp+mmsize*1], m15 + mova [rsp+mmsize*2], m16 + mova [rsp+mmsize*3], m17 + mova [rsp+mmsize*4], m18 + mova [rsp+mmsize*5], m19 + mova [rsp+mmsize*6], m20 + mova [rsp+mmsize*7], m21 + mova m21, [cq+64*15] + mova m14, [cq+64* 8] + mova m17, [cq+64*11] + mova m18, [cq+64*12] + mova m19, [cq+64*13] + mova m16, [cq+64*10] + mova m15, [cq+64* 9] + mova m20, [cq+64*14] + REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \ + m24, m19, m16, m27, m28, m15, m20, m23 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf + + ; clear consumed coefficients: r3 runs from 224 down to 0 in steps of 32, and r3*8 advances by four 64-byte slots per iteration (32 slots in total) + pxor m12, m12 + mov r3d, 32*7 +.full_zero_loop: + REPX {mova [cq+r3*8+64*x], m12}, 0, 1, 2, 3 + sub r3d, 32 + jge .full_zero_loop + + jmp .pass2_end +.fast: + ; eob below 36: only the low ymm half of the blocks at cq offsets 128*0 .. 128*7 is read + mova ym0, [cq+128*0] + mova ym2, [cq+128*4] + movshdup m8, [o(permB)] + mova
ym1, [cq+128*2] + mova ym3, [cq+128*6] + mova ym4, [cq+128*1] + mova ym5, [cq+128*3] + mova ym6, [cq+128*5] + mova ym7, [cq+128*7] + ; merge the eight ymm halves pairwise into four zmm registers, using the index vector derived from permB in m8 + vpermt2q m0, m8, m2 ; 0 4 + vpermt2q m1, m8, m3 ; 2 6 + vpermt2q m4, m8, m5 ; 1 3 + vpermt2q m7, m8, m6 ; 7 5 + call m(idct_8x8_internal_10bpc).main_fast + call m(idct_16x8_internal_10bpc).main_fast + vpbroadcastd m11, [o(pd_2)] + call m(idct_8x16_internal_10bpc).main_end2 + ; narrow the dword results to words and reorder them with byte permutes (idct8x32p, dup16_perm); presumably this builds the register layout the 8bpc helpers expect - confirm against those helpers + mova m8, [o(idct8x32p)] + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + mova m6, [dup16_perm] + vpermb m0, m8, m0 + vpermb m2, m8, m2 + vprold m8, 16 + vpermb m1, m8, m1 + vpermb m3, m8, m3 + punpckldq m4, m0, m2 + punpckhdq m0, m2 + punpckldq m2, m1, m3 + punpckhdq m1, m3 + punpckldq m21, m4, m2 + punpckhdq m14, m4, m2 + punpckldq m18, m0, m1 + punpckhdq m15, m0, m1 + vpord m7, m6, [o(pb_32)] {1to16} + vpermb m22, m7, m21 ; 1 + pmovzxwd m9, ym21 ; 0 + vpermb m8, m6, m18 ; 4 + vpermb m24, m7, m18 ; 5 + vpermb m21, m6, m14 ; 2 + vpermb m23, m7, m14 ; 3 + vpermb m14, m6, m15 ; 6 + vpermb m25, m7, m15 ; 7 + lea r5, [o_base_8bpc] + pslld m9, 16 + + ; m1, m18, m15 and m26-m29 are zeroed before the helper calls (presumably inputs that carry no coefficients on this path) + pxor m7, m7 + REPX {mova x, m7}, m1, m18, m15, m26, m27, m28, m29 + + call m(idct_16x16_internal_8bpc).main_fast2 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 + ; park m14-m21 in the eight stack slots; .pass2_end reloads them + mova [rsp+mmsize*0], m14 + mova [rsp+mmsize*1], m15 + mova [rsp+mmsize*2], m16 + mova [rsp+mmsize*3], m17 + mova [rsp+mmsize*4], m18 + mova [rsp+mmsize*5], m19 + mova [rsp+mmsize*6], m20 + mova [rsp+mmsize*7], m21 + + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast + + ; clear the low ymm half of the eight blocks that were read + pxor m12, m12 + REPX {mova [cq+128*x], ym12}, 0, 1, 2, 3, 4, 5, 6, 7 +.pass2_end: + ; common tail of all three paths. m30 / m31 are qword permute indices built from permC (m31 is m30 shifted right by 8 bits); m11 = pw_2048, m13 = pixel_10bpc_max and r6 = 3*stride are set up for write_16x4 (outside this view; presumably its rounding factor, clamp limit and row offset). + ; sixteen write_16x4 calls follow: m0-m7 as returned by the last helper, then the stack-parked values combined with m21 .. m14, then m14-m21, then m22-m29 + movshdup m30, [permC] + vpbroadcastd m11, [pw_2048] + vpbroadcastd m13, [pixel_10bpc_max] + lea r6, [strideq*3] + psrlq m31, m30, 8 + vpermq m8, m30, m0 + vpermq m9, m31, m1 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m2 + vpermq m9, m31, m3 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m4 + vpermq m9, m31, m5 + call
m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m6 + vpermq m9, m31, m7 + call m(idct_16x8_internal_10bpc).write_16x4 + + ; reload the eight parked stack slots into m1-m8 + mova m1, [rsp+mmsize*0] + mova m2, [rsp+mmsize*1] + mova m3, [rsp+mmsize*2] + mova m4, [rsp+mmsize*3] + mova m5, [rsp+mmsize*4] + mova m6, [rsp+mmsize*5] + mova m7, [rsp+mmsize*6] + mova m8, [rsp+mmsize*7] + + ; saturating word butterfly: for k = 0 .. 7 the sum of slot k and m(21-k) goes to m(k), the difference stays in m(21-k) + paddsw m0, m1, m21 + psubsw m21, m1, m21 + paddsw m1, m2, m20 + psubsw m20, m2, m20 + paddsw m2, m3, m19 + psubsw m19, m3, m19 + paddsw m3, m4, m18 + psubsw m18, m4, m18 + paddsw m4, m5, m17 + psubsw m17, m5, m17 + paddsw m5, m6, m16 + psubsw m16, m6, m16 + paddsw m6, m7, m15 + psubsw m15, m7, m15 + paddsw m7, m8, m14 + psubsw m14, m8, m14 + + vpermq m8, m30, m0 + vpermq m9, m31, m1 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m2 + vpermq m9, m31, m3 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m4 + vpermq m9, m31, m5 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m6 + vpermq m9, m31, m7 + call m(idct_16x8_internal_10bpc).write_16x4 + + vpermq m8, m30, m14 + vpermq m9, m31, m15 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m16 + vpermq m9, m31, m17 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m18 + vpermq m9, m31, m19 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m20 + vpermq m9, m31, m21 + call m(idct_16x8_internal_10bpc).write_16x4 + + vpermq m8, m30, m22 + vpermq m9, m31, m23 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m24 + vpermq m9, m31, m25 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m26 + vpermq m9, m31, m27 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m28 + vpermq m9, m31, m29 + call m(idct_16x8_internal_10bpc).write_16x4 + RET +.pass1: + ; first-pass helper: the even-numbered 128-byte-strided blocks (0, 2, .. 14) are loaded into m0-m7 for idct_8x16 .main; the odd-numbered ones follow into m16-m23 for idct_16x16 .main + mova m0, [cq+128* 0] + mova m1, [cq+128* 2] + mova m2, [cq+128* 4] + mova m3, [cq+128* 6] + mova m4, [cq+128* 8] + mova m5, [cq+128*10] + mova m6, [cq+128*12] + mova m7, [cq+128*14] + call m(idct_8x16_internal_10bpc).main
+ mova m16, [cq+128* 1] + mova m17, [cq+128* 3] + mova m18, [cq+128* 5] + mova m19, [cq+128* 7] + mova m20, [cq+128* 9] + mova m21, [cq+128*11] + mova m22, [cq+128*13] + mova m23, [cq+128*15] + call m(idct_16x16_internal_10bpc).main + call m(idct_16x16_internal_10bpc).main_end + ; tail jump: main_end3 (outside this view) ends the 16x64 .pass1 helper + jmp m(idct_16x16_internal_10bpc).main_end3 +.dconly: + ; DC-only path of the 16x64 routine, reached when eob is zero: r6d = (dc * 181 plus 640) shifted right by 10; writing eobd (zero here) back clears the DC coefficient; 64 is or-ed into r3d (presumably the row count for the shared tail; r3 is presumably the eob register, so this yields 64 - confirm) before the 16x8 .dconly2 tail takes over + imul r6d, [cq], 181 + mov [cq], eobd + or r3d, 64 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2 + +; inv_txfm_add_dct_dct_32x64_10bpc(dst, stride, c, eob) +; eob of zero takes .dconly. Otherwise PROLOGUE requests 64*40 bytes of stack and eob selects the first pass: below 36 .fast2 (topleft 8x8 nonzero), below 136 .fast (topleft 16x16 nonzero), below 543 .pass1_fast on the data at cq offset 64 followed by a full .pass1 at cq, otherwise .pass1 for both. +; The first-pass helpers premultiply their inputs by m12 (pd_2896). The second pass is delegated to the 8bpc 32x64 / 32x16 / 32x32 helpers (bodies outside this view); .pass2_end then adds the result into dst, clamped between m31 (zero) and pixel_10bpc_max. +cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob + lea r5, [o_base] + test eobd, eobd + jz .dconly + PROLOGUE 4, 7, 32, -64*40, dst, stride, c, eob +%undef cmp + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + cmp eobd, 136 + jl .fast + ; the data at cq offset 64 is processed first + add cq, 64 + cmp eobd, 543 + jge .full + call .pass1_fast ; bottomright 16x16 zero + jmp .lefthalf +.full: + call .pass1 + ; r3d = start index for .right_zero_loop (the .pass1_fast route set 16*12 instead) + mov r3d, 16*28 +.lefthalf: + ; save the sixteen first-pass result registers into blocks 0..15 at the offset position; .pass2_start reads them back + mova [cq+128* 0], m27 + mova [cq+128* 1], m14 + mova [cq+128* 2], m28 + mova [cq+128* 3], m15 + mova [cq+128* 4], m22 + mova [cq+128* 5], m23 + mova [cq+128* 6], m24 + mova [cq+128* 7], m25 + mova [cq+128* 8], m0 + mova [cq+128* 9], m26 + mova [cq+128*10], m20 + mova [cq+128*11], m21 + mova [cq+128*12], m18 + mova [cq+128*13], m16 + mova [cq+128*14], m17 + mova [cq+128*15], m3 + sub cq, 64 + ; m14 and m15 hold results after the first run (they were just stored above), so all four constants are broadcast again before the second run + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + call .pass1 + call .pass2_start + + ; clear the coefficients at offset 64 (r3 counts down in steps of 64), then restart at 16*28 for the blocks at offset 0 + pxor m31, m31 +.right_zero_loop: + REPX {mova [cq+r3*8+64+128*x], m31}, 0, 1, 2, 3 + sub r3d, 16*4 + jge .right_zero_loop + mov r3d, 16*28 + jmp .left_zero_loop +.pass2_start: + ; second-pass front end for the two-run route: four main_part1 calls take m0 / m3 from registers and m1 / m2 from the blocks saved at offset 64, with r4 pointing at the stack area just above the return address + vpbroadcastd m10, [o(pd_2048)] + lea r5, [o_base_8bpc] + + lea r4, [rsp+gprsize] + mova m1, [cq+128*15+64] + mova m2, [cq+128* 8+64] + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + mova m0, m21 + mova m1, [cq+128*12+64] + mova m2, [cq+128*11+64] + mova m3, m18 + call
m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + mova m0, m20 + mova m1, [cq+128*13+64] + mova m2, [cq+128*10+64] + mova m3, m16 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + mova m0, m26 + mova m1, [cq+128*14+64] + mova m2, [cq+128* 9+64] + mova m3, m17 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 + + mova m0, m27 + mova m1, m28 + mova m2, [cq+128* 0+64] + mova m3, [cq+128* 2+64] + mova m16, [cq+128* 1+64] + mova m17, [cq+128* 3+64] + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast + mova m26, [cq+128* 4+64] + mova m27, [cq+128* 5+64] + mova m28, [cq+128* 6+64] + mova m29, [cq+128* 7+64] + ; park m14-m21 in stack slots 32-39 (64 bytes each); .pass2_end reloads them into m0-m7 + mova [rsp+64*32+gprsize], m14 + mova [rsp+64*33+gprsize], m15 + mova [rsp+64*34+gprsize], m16 + mova [rsp+64*35+gprsize], m17 + mova [rsp+64*36+gprsize], m18 + mova [rsp+64*37+gprsize], m19 + mova [rsp+64*38+gprsize], m20 + mova [rsp+64*39+gprsize], m21 + ; tail jump into the 8bpc helper (outside this view) + jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast +.fast: ; topleft 16x16 nonzero + cmp eobd, 36 + jl .fast2 + call .pass1_fast + vpbroadcastd m10, [o(pd_2048)] + call .pass2_fast_start + jmp .end +.fast2: ; topleft 8x8 nonzero + ; only the low ymm half of the blocks at cq offsets 128*0 .. 128*7 is read + movshdup m7, [o(permB)] + mova ym0, [cq+128*0] + mova ym1, [cq+128*4] + mova ym4, [cq+128*2] + mova ym5, [cq+128*6] + mova ym16, [cq+128*1] + mova ym2, [cq+128*5] + mova ym3, [cq+128*3] + mova ym17, [cq+128*7] + ; r3d = 64: start index for .left_zero_loop + mov r3d, 16*4 + vpermq m0, m7, m0 ; 0 0 + vpermq m1, m7, m1 ; 4 4 + vpermt2q m4, m7, m5 ; 2 6 + vpermt2q m16, m7, m2 ; 1 5 + vpermt2q m17, m7, m3 ; 7 3 + ; multiply by 2896 (m12), add 2048 (m13) and shift right by 12: a rounded scale by 2896/4096 + REPX {pmulld x, m12}, m0, m1, m4, m16, m17 + REPX {paddd x, m13}, m0, m1, m4, m16, m17 + REPX {psrad x, 12 }, m0, m1, m4, m16, m17 + call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2 + vpbroadcastd m11, [o(pd_1)] + call m(idct_16x16_internal_10bpc).main_end2 + + call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32 + punpcklqdq m27, m0, m2 ; 0 + punpckhqdq m0, m2 ; 1 + punpcklqdq m22, m3, m4 ; 2 + punpckhqdq m26, m3, m4 ; 3 + punpcklqdq m14, m5, m7 ; 4 +
punpckhqdq m20, m5, m7 ; 5 + punpcklqdq m23, m6, m8 ; 6 + punpckhqdq m21, m6, m8 ; 7 + + ; m10 = pd_2048 (copied from m13) for the 8bpc helpers + mova m10, m13 + call .pass2_fast2_start +.end: + + ; m31 = zero: used to clear coefficients here and as the lower clamp bound in .end_sumsub_write + pxor m31, m31 + +.left_zero_loop: + ; r3 counts down in steps of 64; r3*8 selects a group of four 128-byte-strided blocks + REPX {mova [cq+r3*8+128*x], m31}, 0, 1, 2, 3 + sub r3d, 16*4 + jge .left_zero_loop + + call .pass2_end + RET +.pass2_end: + ; final stage. dst walks down from row 0 and dst2 walks up from row 31; stride32 = 32*stride reaches the matching rows of the lower half. stklo / stkhi walk stack slots 0..31 from both ends (offsets carry gprsize, presumably to skip the return address pushed by the call). + DEFINE_ARGS dst, stride, _, dst2, stride32, stklo, stkhi + vpbroadcastd m30, [pixel_10bpc_max] + vpbroadcastd m13, [pw_2048] + + mov stride32q, strideq + shl stride32q, 5 + lea stkhiq, [rsp+31*mmsize+gprsize] + lea dst2q, [dstq+stride32q] + lea stkloq, [rsp+gprsize] + sub dst2q, strideq ; dst31 + + paddsw m8, m0, m29 ; t0[idct32] + psubsw m9, m0, m29 ; t31[idct32] + call .end_sumsub_write + paddsw m8, m1, m28 ; t1[idct32] + psubsw m9, m1, m28 ; t30[idct32] + call .end_sumsub_write + paddsw m8, m2, m27 ; t2[idct32] + psubsw m9, m2, m27 ; t29[idct32] + call .end_sumsub_write + paddsw m8, m3, m26 ; t3[idct32] + psubsw m9, m3, m26 ; t28[idct32] + call .end_sumsub_write + paddsw m8, m4, m25 ; t4[idct32] + psubsw m9, m4, m25 ; t27[idct32] + call .end_sumsub_write + paddsw m8, m5, m24 ; t5[idct32] + psubsw m9, m5, m24 ; t26[idct32] + call .end_sumsub_write + paddsw m8, m6, m23 ; t6[idct32] + psubsw m9, m6, m23 ; t25[idct32] + call .end_sumsub_write + paddsw m8, m7, m22 ; t7[idct32] + psubsw m9, m7, m22 ; t24[idct32] + call .end_sumsub_write + ; the remaining idct32 terms were parked in stack slots 32-39 + mova m0, [rsp+64*32+gprsize] + mova m1, [rsp+64*33+gprsize] + mova m2, [rsp+64*34+gprsize] + mova m3, [rsp+64*35+gprsize] + mova m4, [rsp+64*36+gprsize] + mova m5, [rsp+64*37+gprsize] + mova m6, [rsp+64*38+gprsize] + mova m7, [rsp+64*39+gprsize] + paddsw m8, m0, m21 ; t8[idct32] + psubsw m9, m0, m21 ; t23[idct32] + call .end_sumsub_write + paddsw m8, m1, m20 ; t9[idct32] + psubsw m9, m1, m20 ; t22[idct32] + call .end_sumsub_write + paddsw m8, m2, m19 ; t10[idct32] + psubsw m9, m2, m19 ; t21[idct32] + call .end_sumsub_write + paddsw m8, m3, m18 ; t11[idct32] + psubsw m9, m3, m18 ; t20[idct32] + call .end_sumsub_write + paddsw m8, m4, m17 ;
t12[idct32] + psubsw m9, m4, m17 ; t19[idct32] + call .end_sumsub_write + paddsw m8, m5, m16 ; t13[idct32] + psubsw m9, m5, m16 ; t18[idct32] + call .end_sumsub_write + paddsw m8, m6, m15 ; t14[idct32] + psubsw m9, m6, m15 ; t17[idct32] + call .end_sumsub_write + paddsw m8, m7, m14 ; t15[idct32] + psubsw m9, m7, m14 ; t16[idct32] + ; fall-through +.end_sumsub_write: + ; in: m8 / m9 = one idct32 sum / difference pair. Combines them with the stack values at stkhi / stklo into four output rows, scales by m13 (pmulhrsw), adds the destination rows at dst, dst2 and both plus stride32, clamps between m31 and m30, stores, then steps all four pointers for the next pair. + mova m10, [stkhiq] ; t63-n + mova m12, [stkloq] ; t32+n + psubsw m11, m8, m10 ; out63-n + paddsw m8, m10 ; out0 +n + psubsw m10, m9, m12 ; out32+n + paddsw m9, m12 ; out31-n + REPX {pmulhrsw x, m13}, m11, m8, m10, m9 + paddw m8, [dstq] + paddw m9, [dst2q] + paddw m10, [dstq+stride32q] + paddw m11, [dst2q+stride32q] + REPX {pminsw x, m30}, m11, m8, m10, m9 + REPX {pmaxsw x, m31}, m11, m8, m10, m9 + mova [dstq ], m8 + mova [dst2q ], m9 + mova [dstq +stride32q], m10 + mova [dst2q+stride32q], m11 + add stkloq, mmsize + sub stkhiq, mmsize + add dstq, strideq + sub dst2q, strideq + ret +.pass2_fast_start: + ; second-pass front end for .fast: same call sequence as .pass2_start, but with the _fast variants of the 8bpc helpers, inputs taken from registers only, and m10 loaded by the caller + lea r5, [o_base_8bpc] + lea r4, [rsp+gprsize] + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast + mova m0, m21 + mova m3, m18 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast + mova m0, m20 + mova m3, m16 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast + mova m0, m26 + mova m3, m17 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 + + mova m0, m27 + mova m1, m28 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 + mova [rsp+64*32+gprsize], m14 + mova [rsp+64*33+gprsize], m15 + mova [rsp+64*34+gprsize], m16 + mova [rsp+64*35+gprsize], m17 + mova [rsp+64*36+gprsize], m18 + mova [rsp+64*37+gprsize], m19 + mova [rsp+64*38+gprsize], m20 + mova [rsp+64*39+gprsize], m21 + jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 +.pass2_fast2_start: + ; second-pass front end for .fast2: as above with the _fast2 / _fast3 helper variants; only m0 is refreshed between the main_part1_fast2 calls + lea r5, [o_base_8bpc] + lea r4, [rsp+gprsize] + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2 + mova m0, m21 + call
m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2 + mova m0, m20 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2 + mova m0, m26 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 + + mova m0, m27 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast3 + mova [rsp+64*32+gprsize], m14 + mova [rsp+64*33+gprsize], m15 + mova [rsp+64*34+gprsize], m16 + mova [rsp+64*35+gprsize], m17 + mova [rsp+64*36+gprsize], m18 + mova [rsp+64*37+gprsize], m19 + mova [rsp+64*38+gprsize], m20 + mova [rsp+64*39+gprsize], m21 + jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast3 +.dconly: + ; DC-only path, reached when eob is zero: r6d = dc * 181; writing eobd (zero here) back clears the DC coefficient; 64 is or-ed into r3d (presumably the row count) before the shared 32x16 .dconly3 tail (outside this view) takes over + DEFINE_ARGS dst, stride, c, eob + imul r6d, [cq], 181 + mov [cq], eobd + or r3d, 64 + jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly3 +.pass1_fast: + ; reduced first pass: reads only blocks 0..15 of the 128-byte-strided buffer, each premultiplied by m12, and uses the _fast_rect2 helper variants; r3d = 16*12 presets the index used by the coefficient-clearing loops + pmulld m0, m12, [cq+128* 0] + pmulld m1, m12, [cq+128* 4] + pmulld m2, m12, [cq+128* 8] + pmulld m3, m12, [cq+128*12] + mov r3d, 16*12 + call m(idct_8x16_internal_10bpc).main_fast_rect2 + pmulld m16, m12, [cq+128* 2] + pmulld m17, m12, [cq+128* 6] + pmulld m18, m12, [cq+128*10] + pmulld m19, m12, [cq+128*14] + call m(idct_16x16_internal_10bpc).main_fast_rect2 + call .pass1_load_spill + call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast_rect2 + jmp .pass1_end +.pass1: + ; full first pass: all 32 blocks are read, each premultiplied by m12 - multiples of 4 first, then the remaining even blocks, then the odd blocks (1..15 inside .pass1_load_spill, 17..31 here) + pmulld m0, m12, [cq+128* 0] + pmulld m1, m12, [cq+128* 4] + pmulld m2, m12, [cq+128* 8] + pmulld m3, m12, [cq+128*12] + pmulld m4, m12, [cq+128*16] + pmulld m5, m12, [cq+128*20] + pmulld m6, m12, [cq+128*24] + pmulld m7, m12, [cq+128*28] + call m(idct_8x16_internal_10bpc).main_rect2 + pmulld m16, m12, [cq+128* 2] + pmulld m17, m12, [cq+128* 6] + pmulld m18, m12, [cq+128*10] + pmulld m19, m12, [cq+128*14] + pmulld m20, m12, [cq+128*18] + pmulld m21, m12, [cq+128*22] + pmulld m22, m12, [cq+128*26] + pmulld m23, m12, [cq+128*30] + call m(idct_16x16_internal_10bpc).main_rect2 + call .pass1_load_spill + pmulld m16, m12, [cq+128*17] + pmulld m17, m12, [cq+128*19] + pmulld m18, m12, [cq+128*21] + pmulld m19, m12,
[cq+128*23] + pmulld m20, m12, [cq+128*25] + pmulld m21, m12, [cq+128*27] + pmulld m22, m12, [cq+128*29] + pmulld m23, m12, [cq+128*31] + call m(inv_txfm_add_dct_dct_32x16_10bpc).main_rect2 +.pass1_end: + ; m11 = pd_1 and r4 = cq offset 128*8 are set up for idct32_pass1_end (outside this view; m11 is presumably its rounding term); the unpacks then sort the returned registers by the row numbers given in the trailing comments + vpbroadcastd m11, [o(pd_1)] + lea r4, [cq+128*8] + call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end + punpcklqdq m27, m0, m20 ; 0 + punpckhqdq m0, m20 ; 1 + punpcklqdq m24, m5, m16 ; 10 + punpckhqdq m16, m5, m16 ; 11 + punpcklqdq m23, m3, m21 ; 6 + punpckhqdq m21, m3, m21 ; 7 + punpcklqdq m25, m7, m8 ; 14 + punpckhqdq m3, m7, m8 ; 15 + punpcklqdq m22, m15, m4 ; 2 + punpckhqdq m26, m15, m4 ; 3 + punpcklqdq m15, m6, m17 ; 12 + punpckhqdq m17, m6, m17 ; 13 + punpcklqdq m28, m14, m18 ; 8 + punpckhqdq m18, m14, m18 ; 9 + punpcklqdq m14, m2, m1 ; 4 + punpckhqdq m20, m2, m1 ; 5 + ret +.pass1_load_spill: + ; after idct16_sumsub: stores m0-m7 and then m23 down to m16 into blocks 0..15 while loading the odd blocks 1, 3, .. 15, premultiplied by m12, into m0-m7; loads and stores are interleaved so every odd block is read before it is overwritten + call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub + mova [cq+128* 0], m0 + pmulld m0, m12, [cq+128* 1] + mova [cq+128* 1], m1 + mova [cq+128* 2], m2 + pmulld m1, m12, [cq+128* 3] + pmulld m2, m12, [cq+128* 5] + mova [cq+128* 3], m3 + mova [cq+128* 4], m4 + pmulld m3, m12, [cq+128* 7] + pmulld m4, m12, [cq+128* 9] + mova [cq+128* 5], m5 + mova [cq+128* 6], m6 + mova [cq+128* 7], m7 + pmulld m5, m12, [cq+128*11] + pmulld m6, m12, [cq+128*13] + pmulld m7, m12, [cq+128*15] + mova [cq+128* 8], m23 + mova [cq+128* 9], m22 + mova [cq+128*10], m21 + mova [cq+128*11], m20 + mova [cq+128*12], m19 + mova [cq+128*13], m18 + mova [cq+128*14], m17 + mova [cq+128*15], m16 + ret + +cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + + PROLOGUE 4, 7, 32, -64*32, dst, stride, c, eob +%undef cmp + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + cmp eobd, 36 + jl .fast ; 8x8 + cmp eobd, 151 + jge .full ; 16x16 + lea r4, [idct64_mul_16bpc] + lea r6, [rsp+4*64] + mova m0, [cq+64* 1] + mova m3, [cq+64*15]
+ call .main_part1_fast + mova m0, [cq+64* 7] + mova m3, [cq+64* 9] + call .main_part1_fast + mova m0, [cq+64* 5] + mova m3, [cq+64*11] + call .main_part1_fast + mova m0, [cq+64* 3] + mova m3, [cq+64*13] + call .main_part1_fast + call .main_part2 + mova m0, [cq+64* 0] + mova m1, [cq+64* 8] + mova m16, [cq+64* 4] + mova m17, [cq+64*12] + call m(idct_8x16_internal_10bpc).main_fast2 + call m(idct_16x16_internal_10bpc).main_fast2 + call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub + call .pass1_load_spill + call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2 + mov r6d, 12*8 + jmp .idct64_end +.full: + lea r4, [idct64_mul_16bpc] + lea r6, [rsp+4*64] + mova m0, [cq+64* 1] + mova m1, [cq+64*31] + mova m2, [cq+64*17] + mova m3, [cq+64*15] + call .main_part1 + mova m0, [cq+64* 7] + mova m1, [cq+64*25] + mova m2, [cq+64*23] + mova m3, [cq+64* 9] + call .main_part1 + mova m0, [cq+64* 5] + mova m1, [cq+64*27] + mova m2, [cq+64*21] + mova m3, [cq+64*11] + call .main_part1 + mova m0, [cq+64* 3] + mova m1, [cq+64*29] + mova m2, [cq+64*19] + mova m3, [cq+64*13] + call .main_part1 + call .main_part2 + mova m0, [cq+64* 0] + mova m1, [cq+64* 8] + mova m2, [cq+64*16] + mova m3, [cq+64*24] + mova m16, [cq+64* 4] + mova m17, [cq+64*12] + mova m18, [cq+64*20] + mova m19, [cq+64*28] + call m(idct_8x16_internal_10bpc).main_fast + call m(idct_16x16_internal_10bpc).main_fast + call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub + call .pass1_load_spill + mova m4, [cq+64*18] + mova m5, [cq+64*22] + mova m6, [cq+64*26] + mova m7, [cq+64*30] + call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast + mov r6d, 28*8 + jmp .idct64_end +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd + or r3d, 16 +.dconly1: + add r6d, 640 + sar r6d, 10 +.dconly2: + vpbroadcastd m3, [o(dconly_10bpc)] + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + vpbroadcastw m2, r6d + paddsw m2, m3 +.dconly_loop: + paddsw m0, m2, [dstq+64*0] + paddsw m1, m2, [dstq+64*1] + psubusw m0, m3 + psubusw m1, m3 + mova [dstq+64*0], m0 + 
mova [dstq+64*1], m1 + add dstq, strideq + dec r3d + jg .dconly_loop + ret +.pass1_load_spill: + mova [cq+64* 0], m0 + mova m0, [cq+64* 2] + mova [cq+64* 2], m1 + mova m1, [cq+64* 6] + mova [cq+64* 4], m2 + mova [cq+64* 6], m3 + mova m2, [cq+64*10] + mova m3, [cq+64*14] + mova [cq+64* 8], m4 + mova [cq+64*10], m5 + mova [cq+64*12], m6 + mova [cq+64*14], m7 + mova [cq+64* 1], m23 + mova [cq+64* 3], m22 + mova [cq+64* 5], m21 + mova [cq+64* 7], m20 + mova [cq+64* 9], m19 + mova [cq+64*11], m18 + mova [cq+64*13], m17 + mova [cq+64*15], m16 + ret +ALIGN function_align +.main_part1_fast_rect2: + REPX {paddd x, m13}, m0, m3 + REPX {psrad x, 12 }, m0, m3 +.main_part1_fast: + pmulld m7, m0, [r4+4*0]{bcstd} ; t63a + pmulld m0, [r4+4*1]{bcstd} ; t32a + pmulld m4, m3, [r4+4*6]{bcstd} ; t60a + pmulld m3, [r4+4*7]{bcstd} ; t35a + vpbroadcastd m10, [r4+4*8] + vpbroadcastd m11, [r4+4*9] + REPX {paddd x, m13}, m7, m0, m4, m3 + REPX {psrad x, 12 }, m7, m0, m4, m3 + mova m8, m0 + mova m1, m7 + mova m6, m3 + mova m2, m4 + jmp .main_part1b +.main_part1_rect2: + REPX {paddd x, m13}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_part1: ; idct64 steps 1-5 + ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + pmulld m7, m0, [r4+4*0]{bcstd} ; t63a + pmulld m0, [r4+4*1]{bcstd} ; t32a + pmulld m6, m1, [r4+4*2]{bcstd} ; t62a + pmulld m1, [r4+4*3]{bcstd} ; t33a + pmulld m5, m2, [r4+4*4]{bcstd} ; t61a + pmulld m2, [r4+4*5]{bcstd} ; t34a + pmulld m4, m3, [r4+4*6]{bcstd} ; t60a + pmulld m3, [r4+4*7]{bcstd} ; t35a + vpbroadcastd m10, [r4+4*8] + vpbroadcastd m11, [r4+4*9] + REPX {paddd x, m13}, m7, m0, m6, m1, m5, m2, m4, m3 + REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4 + psubd m8, m0, m1 ; t33 + paddd m0, m1 ; t32 + psubd m1, m7, m6 ; t62 + paddd m7, m6 ; t63 + psubd m6, m3, m2 ; t34 + paddd m3, m2 ; t35 + psubd m2, m4, m5 ; t61 + 
paddd m4, m5 ; t60 +.main_part1b: + REPX {pmaxsd x, m14}, m8, m1, m6, m2 + REPX {pminsd x, m15}, m8, m1, m6, m2 + ITX_MULSUB_2D 1, 8, 5, 9, _, 13, 10, 11 ; t33a, t62a + ITX_MULSUB_2D 2, 6, 5, 9, _, 13, 10, 11, 2 ; t61a, t34a + REPX {pmaxsd x, m14}, m0, m3, m7, m4 + REPX {pminsd x, m15}, m0, m3, m7, m4 + vpbroadcastd m10, [r4+4*10] + vpbroadcastd m11, [r4+4*11] + psubd m5, m0, m3 ; t35a + paddd m0, m3 ; t32a + psubd m3, m7, m4 ; t60a + paddd m7, m4 ; t63a + psubd m4, m1, m6 ; t34 + paddd m1, m6 ; t33 + psubd m6, m8, m2 ; t61 + paddd m8, m2 ; t62 + REPX {pmaxsd x, m14}, m5, m3, m4, m6 + REPX {pminsd x, m15}, m5, m3, m4, m6 + ITX_MULSUB_2D 3, 5, 2, 9, _, 13, 10, 11 ; t35, t60 + ITX_MULSUB_2D 6, 4, 2, 9, _, 13, 10, 11 ; t34a, t61a + REPX {pmaxsd x, m14}, m0, m7, m1, m8 + REPX {pminsd x, m15}, m0, m7, m1, m8 + add r4, 4*12 + mova [r6-64*4], m0 + mova [r6+64*3], m7 + mova [r6-64*3], m1 + mova [r6+64*2], m8 + mova [r6-64*2], m6 + mova [r6+64*1], m4 + mova [r6-64*1], m3 + mova [r6+64*0], m5 + add r6, 64*8 + ret +.main_part2: ; idct64 steps 6-9 + lea r4, [r6+64*3] + sub r6, 64*4 + vpbroadcastd m10, [pd_1567] + vpbroadcastd m11, [pd_3784] +.main_part2_loop: + mova m0, [r6-64*32] ; t32a + mova m1, [r4-64*24] ; t39a + mova m2, [r4-64*32] ; t63a + mova m3, [r6-64*24] ; t56a + mova m4, [r6-64*16] ; t40a + mova m5, [r4-64* 8] ; t47a + mova m6, [r4-64*16] ; t55a + mova m7, [r6-64* 8] ; t48a + psubd m8, m0, m1 ; t39 + paddd m0, m1 ; t32 + psubd m1, m2, m3 ; t56 + paddd m2, m3 ; t63 + psubd m3, m5, m4 ; t40 + paddd m5, m4 ; t47 + psubd m4, m7, m6 ; t55 + paddd m7, m6 ; t48 + REPX {pmaxsd x, m14}, m8, m1, m3, m4 + REPX {pminsd x, m15}, m8, m1, m3, m4 + ITX_MULSUB_2D 1, 8, 6, 9, _, 13, 10, 11 ; t39a, t56a + ITX_MULSUB_2D 4, 3, 6, 9, _, 13, 10, 11, 2 ; t55a, t40a + REPX {pmaxsd x, m14}, m0, m2, m5, m7 + REPX {pminsd x, m15}, m0, m5, m2, m7 + psubd m6, m2, m7 ; t48a + paddd m2, m7 ; t63a + psubd m7, m0, m5 ; t47a + paddd m0, m5 ; t32a + psubd m5, m8, m4 ; t55 + paddd m8, m4 ; t56 + 
psubd m4, m1, m3 ; t40 + paddd m1, m3 ; t39 + REPX {pmaxsd x, m14}, m6, m7, m5, m4 + REPX {pminsd x, m15}, m6, m7, m5, m4 + REPX {pmulld x, m12}, m6, m7, m5, m4 + REPX {pmaxsd x, m14}, m2, m0, m8, m1 + REPX {pminsd x, m15}, m2, m0, m8, m1 + paddd m6, m13 + paddd m5, m13 + psubd m3, m6, m7 ; t47 + paddd m6, m7 ; t48 + psubd m7, m5, m4 ; t40a + paddd m5, m4 ; t55a + REPX {psrad x, 12}, m3, m6, m7, m5 + mova [r4-64* 8], m2 + mova [r6-64*32], m0 + mova [r6-64* 8], m8 + mova [r4-64*32], m1 + mova [r4-64*24], m3 + mova [r6-64*16], m6 + mova [r6-64*24], m7 + mova [r4-64*16], m5 + add r6, 64 + sub r4, 64 + cmp r6, r4 + jl .main_part2_loop + ret +.idct64_main_end: +%macro IDCT64_PASS1_END 9 + mova m%5, [%9+%1*128] ; t0+n [idct32] + idct64 rounding + psubd m%6, m%5, m%2 ; out31-n [idct32] = t31-n [idct64] + paddd m%5, m%2 ; out0+n [idct32] = t0+n [idct64] + REPX {pmaxsd x, m14}, m%6, m%5 + REPX {pminsd x, m15}, m%6, m%5 + REPX {paddd x, m11}, m%6, m%5 + mova m%2, [r3+%3*64] ; t32+n [idct64] + mova m%7, [r3+%4*64] ; t63-n [idct64] + psubd m%8, m%5, m%7 ; out63-n + paddd m%5, m%7 ; out0+n + psubd m%7, m%6, m%2 ; out32+n + paddd m%6, m%2 ; out31-n + REPX {vpsravd x, m11}, m%8, m%5, m%7, m%6 +%endmacro + +%macro IDCT64_PASS1_ENDx4 1 +%assign %%m1 %1 ; t32+n +%assign %%m2 (7-%1) ; t39-n +%assign %%m3 (8+%1) ; t40+n +%assign %%m4 (15-%1) ; t47-n +%assign %%m5 (16+%1) ; t48+n +%assign %%m6 (23-%1) ; t55-n +%assign %%m7 (24+%1) ; t56+n +%assign %%m8 (31-%1) ; t63-n + +%assign %%r1 %1 ; t16+n +%assign %%r2 (7-%1) ; t23-n +%assign %%r3 (16+%1) ; t24-n +%assign %%r4 (23-%1) ; t31-n + +%assign %%c1 (%1) ; t0/8+n +%assign %%c2 (7-%1) ; t7/15-n + + IDCT64_PASS1_END %%c1, %%r4, %%m1, %%m8, 24, 25, 26, 27, cq ; out0/31/32/63 + IDCT64_PASS1_END %%c1, %%r1, %%m4, %%m5, 28, 29, 30, 31, r4 ; out15/16/47/48 + packssdw m %+ %%r1, m24, m29 + packssdw m %+ %%r4, m28, m25 + packssdw m26, m31 + packssdw m30, m27 + mova [r3+%%m5*mmsize], m26 + mova [r3+%%m8*mmsize], m30 + IDCT64_PASS1_END %%c2, %%r3, 
%%m2, %%m7, 24, 25, 26, 27, cq ; out7/24/39/56 + IDCT64_PASS1_END %%c2, %%r2, %%m3, %%m6, 28, 29, 30, 31, r4 ; out8/23/40/55 + packssdw m %+ %%r2, m24, m29 + packssdw m %+ %%r3, m28, m25 + packssdw m26, m31 + packssdw m30, m27 + mova [r3+%%m6*mmsize], m26 + mova [r3+%%m7*mmsize], m30 +%endmacro + IDCT64_PASS1_ENDx4 0 + IDCT64_PASS1_ENDx4 1 + IDCT64_PASS1_ENDx4 2 + IDCT64_PASS1_ENDx4 3 + ret +.idct64_end: + vpbroadcastd m11, [o(pd_2)] + lea r4, [cq+64] + mov r3, rsp + lea r5, [o_base_8bpc] + call .idct64_main_end + + pxor m12, m12 +.zero_loop: + REPX {mova [cq+r6*8+64*x], m12}, 0, 1, 2, 3 + sub r6d, 8*4 + jge .zero_loop + + lea r3, [strideq*3] + mov r4, dstq + call .pass2 + mova m0, [rsp+16*mmsize] + mova m1, [rsp+17*mmsize] + mova m2, [rsp+18*mmsize] + mova m3, [rsp+19*mmsize] + mova m4, [rsp+20*mmsize] + mova m5, [rsp+21*mmsize] + mova m6, [rsp+22*mmsize] + mova m7, [rsp+23*mmsize] + mova m16, [rsp+24*mmsize] + mova m17, [rsp+25*mmsize] + mova m18, [rsp+26*mmsize] + mova m19, [rsp+27*mmsize] + mova m20, [rsp+28*mmsize] + mova m21, [rsp+29*mmsize] + mova m22, [rsp+30*mmsize] + mova m23, [rsp+31*mmsize] + lea dstq, [r4+64] + call .pass2 + RET +.pass2: + psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11 + psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15 + call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32 + + punpckhqdq m19, m5, m16 ; 11 + punpcklqdq m5, m16 ; 10 + punpckhqdq m16, m2, m1 ; 5 + punpcklqdq m2, m1 ; 4 + punpcklqdq m1, m15, m4 ; 2 + punpckhqdq m15, m4 ; 3 + punpcklqdq m4, m14, m18 ; 8 + punpckhqdq m18, m14, m18 ; 9 + punpckhqdq m14, m0, m20 ; 1 + punpcklqdq m0, m20 ; 0 + punpckhqdq m20, m6, m17 ; 13 + punpcklqdq m6, m17 ; 12 + punpckhqdq m17, m3, m21 ; 7 + punpcklqdq m3, m21 ; 6 + punpckhqdq m21, m7, m8 ; 15 + punpcklqdq m7, m8 ; 14 + + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf +.write: + vpbroadcastd m11, [pw_2048] + pxor m12, m12 + vpbroadcastd m13, [pixel_10bpc_max] + call 
m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8 + pmulhrsw m0, m11, m14 + pmulhrsw m1, m11, m15 + pmulhrsw m2, m11, m16 + pmulhrsw m3, m11, m17 + call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 + pmulhrsw m0, m11, m18 + pmulhrsw m1, m11, m19 + pmulhrsw m2, m11, m20 + pmulhrsw m3, m11, m21 + jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 +.fast: ; 8x8 packed + movshdup m7, [o(permB)] + mova ym0, [cq+64*1] + mova ym2, [cq+64*5] + mova ym3, [cq+64*3] + mova ym1, [cq+64*7] + vpermt2q m0, m7, m2 ; 1 5 + vpermt2q m1, m7, m3 ; 7 3 + call .main_oddhalf_packed + mova [rsp+ 0*mmsize], m0 + mova [rsp+ 1*mmsize], m1 + mova [rsp+ 2*mmsize], m2 + mova [rsp+ 3*mmsize], m3 + mova [rsp+ 4*mmsize], m4 + mova [rsp+ 5*mmsize], m5 + mova [rsp+ 6*mmsize], m6 + mova [rsp+ 7*mmsize], m7 + mova [rsp+ 8*mmsize], m16 + mova [rsp+ 9*mmsize], m17 + mova [rsp+10*mmsize], m18 + mova [rsp+11*mmsize], m19 + mova [rsp+12*mmsize], m20 + mova [rsp+13*mmsize], m21 + mova [rsp+14*mmsize], m22 + mova [rsp+15*mmsize], m23 + + movshdup m7, [o(permB)] + mova ym0, [cq+64*0] + mova ym4, [cq+64*4] + mova ym16, [cq+64*2] + mova ym5, [cq+64*6] + vpermt2q m16, m7, m5 ; 2 6 + vpermq m0, m7, m0 ; 0 0 + vpermq m4, m7, m4 ; 4 4 + call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3 + ; m0-7,9,16-22 contain un-sumsub'ed dct32 output data + + ; zero input coefs + pxor m12, m12 + REPX {mova [cq+x*64], ym12}, 0, 1, 2, 3, 4, 5, 6, 7 + + vpbroadcastd m11, [o(pd_2)] + call .main_end + lea r3, [strideq*3] + mov r4, dstq + call .pass2_fast + mova m0, m24 + mova m1, m25 + mova m2, m26 + mova m3, m27 + mova m4, m28 + mova m5, m29 + mova m6, m30 + mova m7, m31 + lea dstq, [r4+64] + lea r5, [o_base] + call .pass2_fast + RET +.pass2_fast: + call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32 + lea r5, [o_base_8bpc] + punpckhqdq m14, m0, m2 ; 1 + punpcklqdq m0, m2 ; 0 + punpcklqdq m1, m3, m4 ; 2 + punpckhqdq m15, m3, m4 ; 3 + punpcklqdq m2, m5, m7 ; 4 + punpckhqdq m16, m5, m7 ; 5 + punpcklqdq m3, m6, m8 ; 6 + punpckhqdq m17, 
m6, m8 ; 7 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast + jmp .write +.main_end: + +%macro IDCT64_PASS1_PACKED_END 7 + psubd m%5, m%1, m%2 ; out31-n [idct32] = t31-n [idct64] + paddd m%1, m%2 ; out0+n [idct32] = t0+n [idct64] + REPX {pmaxsd x, m14}, m%5, m%1 + REPX {pminsd x, m15}, m%5, m%1 + REPX {paddd x, m11}, m%5, m%1 + mova m%2, [rsp+%6*64+gprsize] ; t32+n [idct64] + mova m%3, [rsp+%7*64+gprsize] ; t63-n [idct64] + psubd m%4, m%1, m%3 ; out63-n + paddd m%1, m%3 ; out0+n + psubd m%3, m%5, m%2 ; out32+n + paddd m%2, m%5 ; out31-n + REPX {vpsravd x, m11}, m%4, m%1, m%3, m%2 +%endmacro + + IDCT64_PASS1_PACKED_END 0, 22, 24, 10, 12, 0, 15 ; out0/1,31/30,32/33,63/62 + IDCT64_PASS1_PACKED_END 7, 9, 31, 13, 12, 7, 8 ; out15/14,16/17,47/46,48/49 + packssdw m0, m9 + packssdw m7, m22 + packssdw m24, m13 + packssdw m31, m10 + IDCT64_PASS1_PACKED_END 1, 21, 25, 10, 12, 1, 14 ; out3/2,28/29,35/34,60/61 + IDCT64_PASS1_PACKED_END 6, 16, 30, 13, 12, 6, 9 ; out12/13,19/18,44/45,51/50 + packssdw m1, m16 + packssdw m6, m21 + packssdw m25, m13 + packssdw m30, m10 + IDCT64_PASS1_PACKED_END 2, 20, 26, 10, 12, 2, 13 ; out4/5,27/26,36/37,59/58 + IDCT64_PASS1_PACKED_END 5, 17, 29, 13, 12, 5, 10 ; out11/10,20/21,43/42,52/53 + packssdw m2, m17 + packssdw m5, m20 + packssdw m26, m13 + packssdw m29, m10 + IDCT64_PASS1_PACKED_END 3, 19, 27, 10, 12, 3, 12 ; out7/6,24/25,39/38,56/57 + IDCT64_PASS1_PACKED_END 4, 18, 28, 13, 12, 4, 11 ; out8/9,23/22,40/41,55/54 + packssdw m3, m18 + packssdw m4, m19 + packssdw m27, m13 + packssdw m28, m10 + ret +.main_oddhalf_packed_rect2: + REPX {paddd x, m13}, m0, m1 + REPX {psrad x, 12 }, m0, m1 +.main_oddhalf_packed: + ; m0=in1 in5, m1=in7 in3 + vbroadcasti32x4 m2, [o(pd_101_501)] + vbroadcasti32x4 m3, [o(pd_m700_m301)] + vbroadcasti32x4 m4, [o(pd_4095_4065)] + vbroadcasti32x4 m5, [o(pd_4036_4085)] + pmulld m2, m0 + pmulld m3, m1 + pmulld m0, m4 + pmulld m1, m5 + REPX {paddd x, m13}, m2, m3, m0, m1 + REPX {psrad x, 12 }, m2, m3, m0, m1 + + ; 
m2=t32a t40a -> t32/33 t40/41, m3=t39a t47a -> t38/39 t46/47 + ; m0=t63a t55a -> t62/63 t54/55, m1=t56a t48a -> t56/57 t48/49 + ; end of step 1-2 + + vbroadcasti32x4 m10, [o(pd_401_1931)] + vbroadcasti32x4 m11, [o(pd_4076_3612)] + mova m4, m0 + mova m5, m2 + ITX_MULSUB_2D 4, 5, 8, 9, _, 13, 10, 11 + vbroadcasti32x4 m10, [o(pd_3166_3920)] + vbroadcasti32x4 m11, [o(pd_2598_1189)] + mova m6, m3 + mova m7, m1 + ITX_MULSUB_2D 7, 6, 8, 9, _, 13, 10, 11, 2 + + ; m4=t33a t41a -> t41/42 t33/34, m5=t63a t54a -> t61/62 t53/54 + ; m6=t38a t46a -> t37/38 t45/46, m7=t57a t49a -> t57/58 t49/50 + ; and from earlier: + ; m0=t63 t55 -> t60/63a t52/55a, m1=t56 t48 -> t56/59a t48/51a + ; m2=t32 t40 -> t32/35a t40/43a, m3=t39 t47 -> t36/39a t44/47a + ; end of step 3-4 + + punpcklqdq m22, m2, m4 ; t32a/33 or t35a/34 + punpcklqdq m21, m3, m6 ; t36a/37 or t39a/38 + punpckhqdq m18, m2, m4 ; t40a/41 or t43a/42 + punpckhqdq m17, m3, m6 ; t44a/45 or t47a/46 + punpckhqdq m6, m1, m7 ; t48a/49 or t51a/50 + punpckhqdq m19, m0, m5 ; t52a/53 or t55a/54 + punpcklqdq m8, m1, m7 ; t56a/57 or t59a/58 + punpcklqdq m23, m0, m5 ; t60a/61 or t63a/62 + mova m0, m22 + mova m7, m21 + mova m3, m18 + mova m16, m17 + mova m5, m6 + mova m4, m19 + mova m2, m8 + mova m1, m23 + ; m0/22/7/21,18/3/17/16,6/5/19/4,2/8/1/23: t32-63[a] + + ; step5 + vpbroadcastd m10, [o(pd_799)] + vpbroadcastd m11, [o(pd_4017)] + ITX_MULSUB_2D 1, 22, 20, 9, _, 13, 10, 11 ; t35/34a, t60/61a + ITX_MULSUB_2D 8, 7, 20, 9, _, 13, 10, 11, 2 ; t59/58a, t36/37a + vpbroadcastd m10, [o(pd_3406)] + vpbroadcastd m11, [o(pd_2276)] + ITX_MULSUB_2D 19, 3, 20, 9, _, 13, 10, 11 ; t43/42a, t52/53a + ITX_MULSUB_2D 5, 17, 20, 9, _, 13, 10, 11, 2 ; t51/50a, t44/45a + ; m0-1/7/21: t32-39[a], m18-19/17-16: t40-47[a] + ; m6-5/3-4: t48-55[a], m2/8/22-23: t56-63[a] + + ; step6 + psubd m20, m0, m21 ; t39/38a + paddd m0, m21 ; t32/33a + psubd m21, m1, m7 ; t36a/37 + paddd m1, m7 ; t35a/34 + REPX {pmaxsd x, m14}, m20, m0, m21, m1 + psubd m7, m16, m18 ; t40/41a + 
paddd m16, m18 ; t47/46a + REPX {pminsd x, m15}, m20, m0, m21, m1 + psubd m18, m17, m19 ; t43a/42 + paddd m17, m19 ; t44a/45 + REPX {pmaxsd x, m14}, m7, m16, m18, m17 + psubd m19, m6, m4 ; t55/54a + paddd m6, m4 ; t48/49a + REPX {pminsd x, m15}, m7, m16, m18, m17 + psubd m4, m5, m3 ; t52a/53 + paddd m5, m3 ; t51a/50 + REPX {pmaxsd x, m14}, m19, m6, m4, m5 + psubd m3, m23, m2 ; t56/57a + paddd m23, m2 ; t63/62a + REPX {pminsd x, m15}, m19, m6, m4, m5 + psubd m2, m22, m8 ; t59a/58 + paddd m22, m8 ; t60a/61 + REPX {pmaxsd x, m14}, m3, m23, m2, m22 + REPX {pminsd x, m15}, m3, m23, m2, m22 + ; m0-1: t32-35[a], m17-16: t44-47[a], m6-5: t48-51[a], m22-23: t60-63[a] + ; m21-20: t36-39[a], m7/18: t40-43[a], m4/19: t52-55[a], m3-2: t56-59[a] + + ; step7 + vpbroadcastd m10, [o(pd_1567)] + vpbroadcastd m11, [o(pd_3784)] + ITX_MULSUB_2D 2, 21, 8, 9, _, 13, 10, 11 ; t36/37a, t59/58a + ITX_MULSUB_2D 3, 20, 8, 9, _, 13, 10, 11 ; t39a/38, t56a/57 + ITX_MULSUB_2D 19, 7, 8, 9, _, 13, 10, 11, 2 ; t55a/54, t40a/41 + ITX_MULSUB_2D 4, 18, 8, 9, _, 13, 10, 11, 2 ; t52/53a, t43/42a + ; m0-3: t32-39[a], m7,18-16: t40-47[a], m6-4,19: t48-55[a], m20-23: t56-63[a] + + ; step8 + psubd m8, m0, m16 ; t47a/46 + paddd m0, m16 ; t32a/33 + psubd m16, m1, m17 ; t44/45a + paddd m1, m17 ; t35/34a + REPX {pmaxsd x, m14}, m8, m0, m16, m1 + psubd m17, m2, m18 ; t43a/42 + paddd m2, m18 ; t36a/37 + REPX {pminsd x, m15}, m8, m0, m16, m1 + psubd m18, m3, m7 ; t40/41a + paddd m3, m7 ; t39/38a + REPX {pmaxsd x, m14}, m17, m2, m18, m3 + psubd m7, m23, m6 ; t48a/49 + paddd m23, m6 ; t63a/62 + REPX {pminsd x, m15}, m17, m2, m18, m3 + psubd m6, m22, m5 ; t51/50a + paddd m22, m5 ; t60/61a + REPX {pmaxsd x, m14}, m7, m23, m6, m22 + psubd m5, m21, m4 ; t52a/53 + paddd m21, m4 ; t59a/58 + REPX {pminsd x, m15}, m7, m23, m6, m22 + psubd m4, m20, m19 ; t55/54a + paddd m20, m19 ; t56/57a + REPX {pmaxsd x, m14}, m5, m21, m4, m20 + REPX {pminsd x, m15}, m5, m21, m4, m20 + ; m0-3=t32-39[a], m18-16,8: t40-47[a], m7-4=t48-55[a], 
m20-23=t56-63[a] + + ; step9 + REPX {pmulld x, m12}, m4, m18, m5, m17, m6, m16, m7, m8 + REPX {paddd x, m13}, m4, m5, m6, m7 + paddd m19, m4, m18 ; t55a/54 + psubd m4, m18 ; t40a/41 + paddd m18, m5, m17 ; t52/53a + psubd m5, m17 ; t43/42a + paddd m17, m6, m16 ; t51a/50 + psubd m6, m16 ; t44a/45 + paddd m16, m7, m8 ; t48/49a + psubd m7, m8 ; t47/46a + REPX {psrad x, 12 }, m19, m4, m18, m5, m17, m6, m16, m7 + ; m4-7=t40-47[a], m16-19=t48-55[a] + ret + +cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob + lea r5, [o_base] + test eobd, eobd + jz .dconly + + PROLOGUE 4, 8, 32, -64*32, dst, stride, c, eob +%undef cmp + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + cmp eobd, 136 + jl .fast + add cq, 64 + cmp eobd, 543 + jge .full + call .pass1_fast ; bottomright 16x16 zero + mov r7d, 16*12 + jmp .lefthalf +.full: + call .pass1 + mov r7d, 16*28 +.lefthalf: + mova [cq+128* 0], m0 + mova [cq+128* 1], m1 + mova [cq+128* 2], m2 + mova [cq+128* 3], m3 + mova [cq+128* 4], m14 + mova [cq+128* 5], m15 + mova [cq+128* 6], m16 + mova [cq+128* 7], m17 + mova [cq+128* 8], m22 + mova [cq+128* 9], m23 + mova [cq+128*10], m24 + mova [cq+128*11], m25 + mova [cq+128*12], m26 + mova [cq+128*13], m27 + mova [cq+128*14], m28 + mova [cq+128*15], m29 + sub cq, 64 + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + sub rsp, 16*64 + call .pass1 + add rsp, 16*64 + lea r5, [o_base_8bpc] + call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_start + mov r4, dstq + pxor m12, m12 + call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end + lea dstq, [r4+64] + mova m0, [rsp+16*mmsize] + mova m1, [rsp+17*mmsize] + mova m2, [rsp+18*mmsize] + mova m3, [rsp+19*mmsize] + mova m4, [rsp+20*mmsize] + mova m5, [rsp+21*mmsize] + mova m6, [rsp+22*mmsize] + mova m7, [rsp+23*mmsize] + mova m16, [rsp+24*mmsize] + mova m17, 
[rsp+25*mmsize] + mova m18, [rsp+26*mmsize] + mova m19, [rsp+27*mmsize] + mova m20, [rsp+28*mmsize] + mova m21, [rsp+29*mmsize] + mova m22, [rsp+30*mmsize] + mova m23, [rsp+31*mmsize] + call .transpose + mova [cq+128* 0+64], m0 + mova [cq+128* 1+64], m1 + mova [cq+128* 2+64], m2 + mova [cq+128* 3+64], m3 + mova [cq+128* 4+64], m14 + mova [cq+128* 5+64], m15 + mova [cq+128* 6+64], m16 + mova [cq+128* 7+64], m17 + mova [cq+128* 8+64], m22 + mova [cq+128* 9+64], m23 + mova [cq+128*10+64], m24 + mova [cq+128*11+64], m25 + mova [cq+128*12+64], m26 + mova [cq+128*13+64], m27 + mova [cq+128*14+64], m28 + mova [cq+128*15+64], m29 + mova m0, [rsp+ 0*mmsize] + mova m1, [rsp+ 1*mmsize] + mova m2, [rsp+ 2*mmsize] + mova m3, [rsp+ 3*mmsize] + mova m4, [rsp+ 4*mmsize] + mova m5, [rsp+ 5*mmsize] + mova m6, [rsp+ 6*mmsize] + mova m7, [rsp+ 7*mmsize] + mova m16, [rsp+ 8*mmsize] + mova m17, [rsp+ 9*mmsize] + mova m18, [rsp+10*mmsize] + mova m19, [rsp+11*mmsize] + mova m20, [rsp+12*mmsize] + mova m21, [rsp+13*mmsize] + mova m22, [rsp+14*mmsize] + mova m23, [rsp+15*mmsize] + call .transpose + call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_start + pxor m12, m12 +.right_zero_loop: + mova [cq+r7*8+64+128*3], m12 + mova [cq+r7*8+64+128*2], m12 + mova [cq+r7*8+64+128*1], m12 + mova [cq+r7*8+64+128*0], m12 + sub r7d, 16*4 + jge .right_zero_loop + mov r7d, 16*28 + jmp .end +.fast: ; topleft 16x16 nonzero + cmp eobd, 36 + jl .fast2 + call .pass1_fast + lea r5, [o_base_8bpc] + call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast_start + mov r4, dstq + pxor m12, m12 + call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end + lea dstq, [r4+64] + mova m0, [rsp+16*mmsize] + mova m1, [rsp+17*mmsize] + mova m2, [rsp+18*mmsize] + mova m3, [rsp+19*mmsize] + mova m4, [rsp+20*mmsize] + mova m5, [rsp+21*mmsize] + mova m6, [rsp+22*mmsize] + mova m7, [rsp+23*mmsize] + mova m16, [rsp+24*mmsize] + mova m17, [rsp+25*mmsize] + mova m18, [rsp+26*mmsize] + mova m19, [rsp+27*mmsize] + mova m20, [rsp+28*mmsize] + mova 
m21, [rsp+29*mmsize] + mova m22, [rsp+30*mmsize] + mova m23, [rsp+31*mmsize] + call .transpose + call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast_start + mov r7d, 16*12 + pxor m12, m12 + jmp .end +.fast2: ; topleft 8x8 nonzero + movshdup m7, [o(permB)] + mova ym0, [cq+128*1] + mova ym2, [cq+128*5] + mova ym3, [cq+128*3] + mova ym1, [cq+128*7] + vpermt2q m0, m7, m2 ; 1 5 + vpermt2q m1, m7, m3 ; 7 3 + REPX {pmulld x, m12}, m0, m1 + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_oddhalf_packed_rect2 + mova [rsp+ 0*mmsize], m0 + mova [rsp+ 1*mmsize], m1 + mova [rsp+ 2*mmsize], m2 + mova [rsp+ 3*mmsize], m3 + mova [rsp+ 4*mmsize], m4 + mova [rsp+ 5*mmsize], m5 + mova [rsp+ 6*mmsize], m6 + mova [rsp+ 7*mmsize], m7 + mova [rsp+ 8*mmsize], m16 + mova [rsp+ 9*mmsize], m17 + mova [rsp+10*mmsize], m18 + mova [rsp+11*mmsize], m19 + mova [rsp+12*mmsize], m20 + mova [rsp+13*mmsize], m21 + mova [rsp+14*mmsize], m22 + mova [rsp+15*mmsize], m23 + + movshdup m7, [o(permB)] + pmulld ym0, ym12, [cq+128*0] + pmulld ym4, ym12, [cq+128*4] + mova ym16, [cq+128*2] + mova ym5, [cq+128*6] + REPX {paddd x, ym13}, ym0, ym4 + REPX {psrad x, 12 }, ym0, ym4 + vpermt2q m16, m7, m5 ; 2 6 + vpermq m0, m7, m0 ; 0 0 + vpermq m4, m7, m4 ; 4 4 + pmulld m16, m12 + paddd m16, m13 + psrad m16, 12 + call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3 + + vpbroadcastd m11, [o(pd_1)] + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end + mova [rsp+16*mmsize], m24 + mova [rsp+17*mmsize], m25 + mova [rsp+18*mmsize], m26 + mova [rsp+19*mmsize], m27 + mova [rsp+20*mmsize], m28 + mova [rsp+21*mmsize], m29 + mova [rsp+22*mmsize], m30 + mova [rsp+23*mmsize], m31 + vpbroadcastd m13, [o(pd_2048)] + call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast2_start + mov r7d, 16*4 + mov r4, dstq + pxor m12, m12 + call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end + lea dstq, [r4+64] + mova m0, [rsp+16*mmsize] + mova m1, [rsp+17*mmsize] + mova m2, [rsp+18*mmsize] + mova m3, [rsp+19*mmsize] + mova m4, [rsp+20*mmsize] + mova m5, 
[rsp+21*mmsize] + mova m6, [rsp+22*mmsize] + mova m7, [rsp+23*mmsize] + lea r5, [o_base] + vpbroadcastd m13, [o(pd_2048)] + call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast2_start + pxor m12, m12 +.end: + call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end +.zero_loop: + mova [cq+r7*8+128*3], m12 + mova [cq+r7*8+128*2], m12 + mova [cq+r7*8+128*1], m12 + mova [cq+r7*8+128*0], m12 + sub r7d, 16*4 + jge .zero_loop + RET +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd + or r3d, 32 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 + jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2 +.pass1_fast: + lea r4, [idct64_mul_16bpc] + lea r6, [rsp+4*64+gprsize] + pmulld m0, m12, [cq+128* 1] + pmulld m3, m12, [cq+128*15] + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2 + pmulld m0, m12, [cq+128* 7] + pmulld m3, m12, [cq+128* 9] + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2 + pmulld m0, m12, [cq+128* 5] + pmulld m3, m12, [cq+128*11] + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2 + pmulld m0, m12, [cq+128* 3] + pmulld m3, m12, [cq+128*13] + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2 + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2 + pmulld m0, m12, [cq+128* 0] + pmulld m1, m12, [cq+128* 8] + pmulld m16, m12, [cq+128* 4] + pmulld m17, m12, [cq+128*12] + call m(idct_8x16_internal_10bpc).main_fast2_rect2 + call m(idct_16x16_internal_10bpc).main_fast2_rect2 + call .pass1_load_spill + call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2_rect2 + jmp .pass1_end +.pass1: + lea r4, [idct64_mul_16bpc] + lea r6, [rsp+4*64+gprsize] + pmulld m0, m12, [cq+128* 1] + pmulld m1, m12, [cq+128*31] + pmulld m2, m12, [cq+128*17] + pmulld m3, m12, [cq+128*15] + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2 + pmulld m0, m12, [cq+128* 7] + pmulld m1, m12, [cq+128*25] + pmulld m2, m12, [cq+128*23] + pmulld m3, m12, [cq+128* 9] + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2 + pmulld 
m0, m12, [cq+128* 5] + pmulld m1, m12, [cq+128*27] + pmulld m2, m12, [cq+128*21] + pmulld m3, m12, [cq+128*11] + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2 + pmulld m0, m12, [cq+128* 3] + pmulld m1, m12, [cq+128*29] + pmulld m2, m12, [cq+128*19] + pmulld m3, m12, [cq+128*13] + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2 + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2 + pmulld m0, m12, [cq+128* 0] + pmulld m1, m12, [cq+128* 8] + pmulld m2, m12, [cq+128*16] + pmulld m3, m12, [cq+128*24] + pmulld m16, m12, [cq+128* 4] + pmulld m17, m12, [cq+128*12] + pmulld m18, m12, [cq+128*20] + pmulld m19, m12, [cq+128*28] + call m(idct_8x16_internal_10bpc).main_fast_rect2 + call m(idct_16x16_internal_10bpc).main_fast_rect2 + call .pass1_load_spill + pmulld m4, m12, [cq+128*18] + pmulld m5, m12, [cq+128*22] + pmulld m6, m12, [cq+128*26] + pmulld m7, m12, [cq+128*30] + call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast_rect2 +.pass1_end: + vpbroadcastd m11, [o(pd_1)] + lea r3, [rsp+gprsize] + lea r4, [cq+8*128] + call m(inv_txfm_add_dct_dct_64x16_10bpc).idct64_main_end + ; transpose one half immediately, we can transpose lower half later +.transpose: + ; transpose m0-7,16-23 + psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11 + psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15 + call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32 + punpckhqdq m22, m0, m20 ; 1 + punpcklqdq m0, m20 ; 0 + punpckhqdq m24, m2, m1 ; 5 + punpcklqdq m1, m2, m1 ; 4 + punpcklqdq m2, m14, m18 ; 8 + punpckhqdq m26, m14, m18 ; 9 + punpcklqdq m14, m15, m4 ; 2 + punpckhqdq m23, m15, m4 ; 3 + punpckhqdq m25, m3, m21 ; 7 + punpcklqdq m15, m3, m21 ; 6 + punpckhqdq m28, m6, m17 ; 13 + punpcklqdq m3, m6, m17 ; 12 + punpckhqdq m27, m5, m16 ; 11 + punpcklqdq m16, m5, m16 ; 10 + punpckhqdq m29, m7, m8 ; 15 + punpcklqdq m17, m7, m8 ; 14 + ret +.pass1_load_spill: + call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub + mova [cq+128* 0], m0 + mova [cq+128* 1], m1 + pmulld m0, m12, [cq+128* 2] + 
pmulld m1, m12, [cq+128* 6] + mova [cq+128* 2], m2 + mova [cq+128* 3], m3 + pmulld m2, m12, [cq+128*10] + pmulld m3, m12, [cq+128*14] + mova [cq+128* 4], m4 + mova [cq+128* 5], m5 + mova [cq+128* 6], m6 + mova [cq+128* 7], m7 + mova [cq+128* 8], m23 + mova [cq+128* 9], m22 + mova [cq+128*10], m21 + mova [cq+128*11], m20 + mova [cq+128*12], m19 + mova [cq+128*13], m18 + mova [cq+128*14], m17 + mova [cq+128*15], m16 + ret + +cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob + lea r5, [o_base] + test eobd, eobd + jz .dconly + + PROLOGUE 4, 9, 32, -64*32, dst, stride, c, eob +%undef cmp + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + cmp eobd, 136 + jl .fast + add cq, 64 + cmp eobd, 543 + jge .full + call .pass1_fast ; bottomright 16x16 zero + mov r7d, 16*12 + jmp .lefthalf +.full: + call .pass1 + mov r7d, 16*28 +.lefthalf: + mova [cq+128* 0], m27 + mova [cq+128* 1], m14 + mova [cq+128* 2], m28 + mova [cq+128* 3], m15 + mova [cq+128* 4], m22 + mova [cq+128* 5], m23 + mova [cq+128* 6], m24 + mova [cq+128* 7], m25 + mova [cq+128* 8], m0 + mova [cq+128* 9], m26 + mova [cq+128*10], m20 + mova [cq+128*11], m21 + mova [cq+128*12], m18 + mova [cq+128*13], m16 + mova [cq+128*14], m17 + mova [cq+128*15], m3 + sub cq, 64 + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + sub rsp, 16*64 + call .pass1 + sub rsp, 24*64 + call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_start + mov r8, dstq + pxor m31, m31 + call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end + lea dstq, [r8+64] + mova m0, [rsp+56*mmsize] + mova m1, [rsp+57*mmsize] + mova m2, [rsp+58*mmsize] + mova m3, [rsp+59*mmsize] + mova m4, [rsp+60*mmsize] + mova m5, [rsp+61*mmsize] + mova m6, [rsp+62*mmsize] + mova m7, [rsp+63*mmsize] + mova m16, [rsp+64*mmsize] + mova m17, [rsp+65*mmsize] + mova m18, [rsp+66*mmsize] + mova 
m19, [rsp+67*mmsize] + mova m20, [rsp+68*mmsize] + mova m21, [rsp+69*mmsize] + mova m22, [rsp+70*mmsize] + mova m23, [rsp+71*mmsize] + call .transpose + mova [cq+128* 0+64], m27 + mova [cq+128* 1+64], m14 + mova [cq+128* 2+64], m28 + mova [cq+128* 3+64], m15 + mova [cq+128* 4+64], m22 + mova [cq+128* 5+64], m23 + mova [cq+128* 6+64], m24 + mova [cq+128* 7+64], m25 + mova [cq+128* 8+64], m0 + mova [cq+128* 9+64], m26 + mova [cq+128*10+64], m20 + mova [cq+128*11+64], m21 + mova [cq+128*12+64], m18 + mova [cq+128*13+64], m16 + mova [cq+128*14+64], m17 + mova [cq+128*15+64], m3 + mova m0, [rsp+40*mmsize] + mova m1, [rsp+41*mmsize] + mova m2, [rsp+42*mmsize] + mova m3, [rsp+43*mmsize] + mova m4, [rsp+44*mmsize] + mova m5, [rsp+45*mmsize] + mova m6, [rsp+46*mmsize] + mova m7, [rsp+47*mmsize] + mova m16, [rsp+48*mmsize] + mova m17, [rsp+49*mmsize] + mova m18, [rsp+50*mmsize] + mova m19, [rsp+51*mmsize] + mova m20, [rsp+52*mmsize] + mova m21, [rsp+53*mmsize] + mova m22, [rsp+54*mmsize] + mova m23, [rsp+55*mmsize] + add rsp, 32*64 + call .transpose + lea r5, [o_base] + call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_start +.right_zero_loop: + REPX {mova [cq+r7*8+64+128*x], m31}, 0, 1, 2, 3 + sub r7d, 16*4 + jge .right_zero_loop + mov r7d, 16*28 + jmp .end +.fast: ; topleft 16x16 nonzero + cmp eobd, 36 + jl .fast2 + call .pass1_fast + sub rsp, 24*64 + vpbroadcastd m10, [o(pd_2048)] + call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast_start + mov r8, dstq + pxor m31, m31 + call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end + lea dstq, [r8+64] + mova m0, [rsp+40*mmsize] + mova m1, [rsp+41*mmsize] + mova m2, [rsp+42*mmsize] + mova m3, [rsp+43*mmsize] + mova m4, [rsp+44*mmsize] + mova m5, [rsp+45*mmsize] + mova m6, [rsp+46*mmsize] + mova m7, [rsp+47*mmsize] + mova m16, [rsp+48*mmsize] + mova m17, [rsp+49*mmsize] + mova m18, [rsp+50*mmsize] + mova m19, [rsp+51*mmsize] + mova m20, [rsp+52*mmsize] + mova m21, [rsp+53*mmsize] + mova m22, [rsp+54*mmsize] + mova m23, [rsp+55*mmsize] + 
add rsp, 16*64 + call .transpose + lea r5, [o_base] + vpbroadcastd m10, [o(pd_2048)] + call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast_start + mov r7d, 16*12 + jmp .end +.fast2: ; topleft 8x8 nonzero + movshdup m7, [o(permB)] + mova ym0, [cq+128*1] + mova ym2, [cq+128*5] + mova ym3, [cq+128*3] + mova ym1, [cq+128*7] + vpermt2q m0, m7, m2 ; 1 5 + vpermt2q m1, m7, m3 ; 7 3 + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_oddhalf_packed + mova [rsp+ 0*mmsize], m0 + mova [rsp+ 1*mmsize], m1 + mova [rsp+ 2*mmsize], m2 + mova [rsp+ 3*mmsize], m3 + mova [rsp+ 4*mmsize], m4 + mova [rsp+ 5*mmsize], m5 + mova [rsp+ 6*mmsize], m6 + mova [rsp+ 7*mmsize], m7 + mova [rsp+ 8*mmsize], m16 + mova [rsp+ 9*mmsize], m17 + mova [rsp+10*mmsize], m18 + mova [rsp+11*mmsize], m19 + mova [rsp+12*mmsize], m20 + mova [rsp+13*mmsize], m21 + mova [rsp+14*mmsize], m22 + mova [rsp+15*mmsize], m23 + + movshdup m7, [o(permB)] + mova ym0, [cq+128*0] + mova ym4, [cq+128*4] + mova ym16, [cq+128*2] + mova ym5, [cq+128*6] + vpermt2q m16, m7, m5 ; 2 6 + vpermq m0, m7, m0 ; 0 0 + vpermq m4, m7, m4 ; 4 4 + call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3 + + vpbroadcastd m11, [o(pd_2)] + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end + sub rsp, 16*64 + mova [rsp+40*mmsize], m24 + mova [rsp+41*mmsize], m25 + mova [rsp+42*mmsize], m26 + mova [rsp+43*mmsize], m27 + mova [rsp+44*mmsize], m28 + mova [rsp+45*mmsize], m29 + mova [rsp+46*mmsize], m30 + mova [rsp+47*mmsize], m31 + call .pass2_fast2_start + mov r7d, 16*4 + mov r8, dstq + pxor m31, m31 + call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end + lea dstq, [r8+64] + mova m0, [rsp+40*mmsize] + mova m1, [rsp+41*mmsize] + mova m2, [rsp+42*mmsize] + mova m3, [rsp+43*mmsize] + mova m4, [rsp+44*mmsize] + mova m5, [rsp+45*mmsize] + mova m6, [rsp+46*mmsize] + mova m7, [rsp+47*mmsize] + add rsp, 8*64 + lea r5, [o_base] + call .pass2_fast2_start +.end: + pxor m31, m31 +.zero_loop: + REPX {mova [cq+r7*8+128*x], m31}, 0, 1, 2, 3 + sub r7d, 16*4 + jge .zero_loop 
+ call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end + add rsp, 8*64 ; FIXME adjust stack_size_padded instead? + RET +.pass2_fast2_start: + call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32 + punpcklqdq m27, m0, m2 ; 0 + punpckhqdq m0, m2 ; 1 + punpcklqdq m22, m3, m4 ; 2 + punpckhqdq m26, m3, m4 ; 3 + punpcklqdq m14, m5, m7 ; 4 + punpckhqdq m20, m5, m7 ; 5 + punpcklqdq m23, m6, m8 ; 6 + punpckhqdq m21, m6, m8 ; 7 + vpbroadcastd m10, [o(pd_2048)] + jmp m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast2_start +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd + or r3d, 64 + jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly1 +.pass1_fast: + lea r4, [idct64_mul_16bpc] + lea r6, [rsp+4*64+gprsize] + mova m0, [cq+128* 1] + mova m3, [cq+128*15] + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast + mova m0, [cq+128* 7] + mova m3, [cq+128* 9] + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast + mova m0, [cq+128* 5] + mova m3, [cq+128*11] + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast + mova m0, [cq+128* 3] + mova m3, [cq+128*13] + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2 + mova m0, [cq+128* 0] + mova m1, [cq+128* 8] + mova m16, [cq+128* 4] + mova m17, [cq+128*12] + call m(idct_8x16_internal_10bpc).main_fast2 + call m(idct_16x16_internal_10bpc).main_fast2 + call .pass1_load_spill + call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2 + jmp .pass1_end +.pass1: + lea r4, [idct64_mul_16bpc] + lea r6, [rsp+4*64+gprsize] + mova m0, [cq+128* 1] + mova m1, [cq+128*31] + mova m2, [cq+128*17] + mova m3, [cq+128*15] + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1 + mova m0, [cq+128* 7] + mova m1, [cq+128*25] + mova m2, [cq+128*23] + mova m3, [cq+128* 9] + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1 + mova m0, [cq+128* 5] + mova m1, [cq+128*27] + mova m2, [cq+128*21] + mova m3, [cq+128*11] + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1 + mova m0, [cq+128* 3] + mova m1, 
[cq+128*29] + mova m2, [cq+128*19] + mova m3, [cq+128*13] + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1 + call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2 + mova m0, [cq+128* 0] + mova m1, [cq+128* 8] + mova m2, [cq+128*16] + mova m3, [cq+128*24] + mova m16, [cq+128* 4] + mova m17, [cq+128*12] + mova m18, [cq+128*20] + mova m19, [cq+128*28] + call m(idct_8x16_internal_10bpc).main_fast + call m(idct_16x16_internal_10bpc).main_fast + call .pass1_load_spill + mova m4, [cq+128*18] + mova m5, [cq+128*22] + mova m6, [cq+128*26] + mova m7, [cq+128*30] + call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast +.pass1_end: + vpbroadcastd m11, [o(pd_2)] + lea r3, [rsp+gprsize] + lea r4, [cq+8*128] + call m(inv_txfm_add_dct_dct_64x16_10bpc).idct64_main_end + ; transpose one half immediately, we can transpose lower half later +.transpose: + ; transpose m0-7,16-23 + psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11 + psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15 + call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32 + punpcklqdq m27, m0, m20 ; 0 + punpckhqdq m0, m20 ; 1 + punpcklqdq m24, m5, m16 ; 10 + punpckhqdq m16, m5, m16 ; 11 + punpcklqdq m23, m3, m21 ; 6 + punpckhqdq m21, m3, m21 ; 7 + punpcklqdq m25, m7, m8 ; 14 + punpckhqdq m3, m7, m8 ; 15 + punpcklqdq m22, m15, m4 ; 2 + punpckhqdq m26, m15, m4 ; 3 + punpcklqdq m15, m6, m17 ; 12 + punpckhqdq m17, m6, m17 ; 13 + punpcklqdq m28, m14, m18 ; 8 + punpckhqdq m18, m14, m18 ; 9 + punpcklqdq m14, m2, m1 ; 4 + punpckhqdq m20, m2, m1 ; 5 + ret +.pass1_load_spill: + call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub + mova [cq+128* 0], m0 + mova [cq+128* 1], m1 + mova m0, [cq+128* 2] + mova m1, [cq+128* 6] + mova [cq+128* 2], m2 + mova [cq+128* 3], m3 + mova m2, [cq+128*10] + mova m3, [cq+128*14] + mova [cq+128* 4], m4 + mova [cq+128* 5], m5 + mova [cq+128* 6], m6 + mova [cq+128* 7], m7 + mova [cq+128* 8], m23 + mova [cq+128* 9], m22 + mova [cq+128*10], m21 + mova [cq+128*11], m20 + mova [cq+128*12], m19 + mova [cq+128*13], m18 + 
mova [cq+128*14], m17 + mova [cq+128*15], m16 + ret + %endif ; ARCH_X86_64 diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx_avx2.asm b/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx_avx2.asm index dd477a61b..a67f053a6 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx_avx2.asm +++ b/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx_avx2.asm @@ -1194,13 +1194,9 @@ cglobal iidentity_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] + mov [cq], eobd pmulhrsw xm0, xm1 - movd xm2, [o(pw_2048)] - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - vpbroadcastw m0, xm0 - mova m1, m0 - jmp m(iadst_8x4_internal_8bpc).end3 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2 %endif %endmacro @@ -1340,20 +1336,20 @@ cglobal iidentity_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd + or r3d, 8 +.dconly: pmulhrsw xm0, xm2 - psrlw xm2, 3 ; pw_2048 +.dconly2: + movd xm2, [pw_2048] pmulhrsw xm0, xm1 + lea r2, [strideq*3] pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 -.end: - mov r2d, 2 -.end2: - lea r3, [strideq*3] -.loop: - WRITE_8X4 0, 0, 1, 2 +.dconly_loop: + WRITE_8X4 0, 0, 1, 2, strideq*1, strideq*2, r2 lea dstq, [dstq+strideq*4] - dec r2d - jg .loop + sub r3d, 4 + jg .dconly_loop RET %endif %endmacro @@ -1543,13 +1539,8 @@ cglobal iidentity_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - psrlw xm2, 3 ; pw_2048 - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - vpbroadcastw m0, xm0 - mov r2d, 4 - jmp m(inv_txfm_add_dct_dct_8x8_8bpc).end2 + or r3d, 16 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly %endif %endmacro @@ -1902,7 +1893,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd - mov r2d, 2 + or r3d, 4 .dconly: pmulhrsw xm0, xm2 movd xm2, [pw_2048] ; intentionally rip-relative @@ -1911,17 
+1902,17 @@ cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 vpbroadcastw m0, xm0 pxor m3, m3 .dconly_loop: - mova xm1, [dstq] - vinserti128 m1, [dstq+strideq], 1 + mova xm1, [dstq+strideq*0] + vinserti128 m1, [dstq+strideq*1], 1 punpckhbw m2, m1, m3 punpcklbw m1, m3 paddw m2, m0 paddw m1, m0 packuswb m1, m2 - mova [dstq], xm1 - vextracti128 [dstq+strideq], m1, 1 + mova [dstq+strideq*0], xm1 + vextracti128 [dstq+strideq*1], m1, 1 lea dstq, [dstq+strideq*2] - dec r2d + sub r3d, 2 jg .dconly_loop RET %endif @@ -2162,7 +2153,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 - mov r2d, 4 + or r3d, 8 jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly %endif %endmacro @@ -2473,7 +2464,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - mov r2d, 8 + or r3d, 16 jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly %endif %endmacro @@ -3120,13 +3111,8 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - pmulhrsw xm0, xm2 - psrlw xm2, 2 ; pw_2048 - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 - vpbroadcastw m0, xm0 - mov r2d, 8 - jmp m(inv_txfm_add_dct_dct_8x8_8bpc).end2 + or r3d, 32 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly .full: REPX {pmulhrsw x, m9}, m12, m13, m14, m15 pmulhrsw m6, m9, [rsp+32*2] @@ -3290,7 +3276,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - mov r2d, 8 + or r3d, 8 .dconly: pmulhrsw xm0, xm2 movd xm2, [pw_2048] ; intentionally rip-relative @@ -3307,7 +3293,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob packuswb m1, m2 mova [dstq], m1 add dstq, strideq - dec r2d + dec r3d jg .dconly_loop RET .normal: @@ -3672,7 +3658,7 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 0, dst, 
stride, c, eob movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 - mov r2d, 16 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .full: mova [tmp1q-32*4], m1 @@ -3991,7 +3977,7 @@ cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst, stride, c, eob movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 - mov r2d, 16 + or r3d, 16 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly .normal: PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2 @@ -4222,7 +4208,7 @@ cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 4, 0, dst, stride, c, eob pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - mov r2d, 32 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly .normal: PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \ @@ -4486,7 +4472,7 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 4, 0, dst, stride, c, eob pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - mov r2d, 32 + or r3d, 64 jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .normal: PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 @@ -4832,7 +4818,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - mov r2d, 16 + or r3d, 16 .dconly: pmulhrsw xm0, xm2 movd xm2, [o(pw_2048)] @@ -4856,7 +4842,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob mova [dstq+32*0], m2 mova [dstq+32*1], m3 add dstq, strideq - dec r2d + dec r3d jg .dconly_loop RET .normal: @@ -4997,7 +4983,7 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 4, 0, dst, stride, c, eob movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 - mov r2d, 64 + or r3d, 64 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly .normal: PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2 @@ -5200,7 +5186,7 @@ cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 4, 0, dst, stride, c, eob movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 - mov r2d, 32 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly 
.normal: PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \ @@ -5381,7 +5367,7 @@ cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 4, 0, dst, stride, c, eob pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd - mov r2d, 64 + or r3d, 64 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly .normal: PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2 diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx_avx512.asm b/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx_avx512.asm index a3d4ebdfb..a3f25d37e 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx_avx512.asm +++ b/prog/3rdPartyLibs/codecs/dav1d/src/x86/itx_avx512.asm @@ -1,5 +1,5 @@ -; Copyright © 2020, VideoLAN and dav1d authors -; Copyright © 2020, Two Orioles, LLC +; Copyright © 2020-2023, VideoLAN and dav1d authors +; Copyright © 2020-2023, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without @@ -30,6 +30,11 @@ SECTION_RODATA 64 const \ +dup16_perm, db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 + db 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15 + db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23 + db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31 +const \ int8_permA, db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55 db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59 @@ -42,10 +47,6 @@ int16_perm: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39 db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47 db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55 db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63 -dup16_perm: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 - db 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15 - db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23 - db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 
31 idtx_16x4p: db 0, 1, 4, 5, 16, 17, 20, 21, 2, 3, 6, 7, 18, 19, 22, 23 db 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55 db 8, 9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31 @@ -85,7 +86,7 @@ pd_0to15: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 gather8a: dd 0, 2, 1, 3, 8, 10, 9, 11 gather8b: dd 0, 1, 4, 5, 8, 9, 12, 13 gather8c: dd 0, 4, 2, 6, 12, 8, 14, 10 -gather8d: dd 0, 3, 1, 2, 8, 11, 9, 10 +gather8d: dd 0, 19, 1, 18, 2, 17, 3, 16 int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 @@ -1500,8 +1501,8 @@ cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 %ifidn %1_%2, dct_dct movsx r6d, word [cq] mov [cq], eobd + or r3d, 16 imul r6d, 181 - mov r3d, 16 add r6d, 128 sar r6d, 8 jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly @@ -1609,6 +1610,53 @@ cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vpscatterdq [r3+ym8]{k2}, m2 RET ALIGN function_align +cglobal_label .main_fast2 ; bottom three-quarters are zero + vpbroadcastd ym10, [o(pd_2048)] + vpbroadcastq ym13, [o(int_mshift)] + vpbroadcastd ym3, [o(pw_401_4076x8)] + vpbroadcastd ym5, [o(pw_799_4017x8)] + vpbroadcastd ym4, [o(pw_m1189_3920x8)] + pxor ym6, ym6 + punpckhwd ym2, ym0, ym0 + pmulhrsw ym2, ym3 ; t8a t15a + punpcklwd ym7, ym1, ym1 + pmulhrsw ym7, ym5 ; t4a t7a + punpckhwd ym1, ym1 + pmulhrsw ym4, ym1 ; t11a t12a + vpcmpub k7, ym13, ym10, 6 + punpcklwd ym9, ym6, ym0 + psubsw ym0, ym2, ym4 ; t11a t12a + paddsw ym8, ym2, ym4 ; t8a t15a + mova ym1, ym7 + jmp .main5 +ALIGN function_align +cglobal_label .main_fast ; bottom half is zero + vpbroadcastd ym10, [o(pd_2048)] + vpbroadcastq ym13, [o(int_mshift)] + pxor ym6, ym6 + punpckhwd ym8, ym0, ym0 + punpckhwd ym4, ym3, ym3 + punpckhwd ym5, ym2, ym2 + punpcklwd ym7, ym1, ym1 + punpckhwd ym1, ym1 + punpcklwd ym3, ym3 + punpcklwd ym9, ym6, ym0 + punpcklwd ym6, ym2 + vpbroadcastd ym2, [o(pw_401_4076x8)] + 
vpbroadcastd ym0, [o(pw_m2598_3166x8)] + vpbroadcastd ym11, [o(pw_1931_3612x8)] + vpbroadcastd ym12, [o(pw_m1189_3920x8)] + pmulhrsw ym8, ym2 ; t8a t15a + vpbroadcastd ym2, [o(pw_799_4017x8)] + pmulhrsw ym0, ym4 ; t9a t14a + vpbroadcastd ym4, [o(pw_m2276_3406x8)] + pmulhrsw ym5, ym11 ; t10a t13a + pmulhrsw ym1, ym12 ; t11a t12a + pmulhrsw ym7, ym2 ; t4a t7a + pmulhrsw ym3, ym4 ; t5a t6a + vpcmpub k7, ym13, ym10, 6 + jmp .main4 +ALIGN function_align cglobal_label .main WRAP_YMM IDCT16_1D_PACKED ret @@ -1978,7 +2026,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 %ifidn %1_%2, dct_dct movsx r6d, word [cq] mov [cq], eobd - mov r3d, 8 + or r3d, 8 .dconly: imul r6d, 181 add r6d, 128 @@ -2279,8 +2327,8 @@ cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 %ifidn %1_%2, dct_dct movsx r6d, word [cq] mov [cq], eobd + or r3d, 16 imul r6d, 181 - mov r3d, 16 add r6d, 128+512 sar r6d, 8+2 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 @@ -2422,7 +2470,7 @@ cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vextracti32x4 [r6+r3 ], m3, 3 RET ALIGN function_align -.main_fast2: ; bottom three-quarters are zero +cglobal_label .main_fast2 ; bottom three-quarters are zero vpbroadcastd m10, [o(pd_2048)] vpbroadcastq m13, [o(int_mshift)] vpcmpub k7, m13, m10, 6 @@ -2439,7 +2487,7 @@ ALIGN function_align mova m1, m7 jmp .main5 ALIGN function_align -.main_fast: ; bottom half is zero +cglobal_label .main_fast ; bottom half is zero vpbroadcastd m10, [o(pd_2048)] .main_fast3: vpbroadcastq m13, [o(int_mshift)] @@ -2774,13 +2822,13 @@ ALIGN function_align vpermt2q m9, m12, m7 jmp m(idct_16x16_internal_8bpc).end -%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4] - vpbroadcastd m%3, [o(pw_%4_%5x8)] - punpcklwd m%1, m%2, m%2 - pmulhrsw m%1, m%3 - vpbroadcastd m%3, [o(pw_%6_%7x8)] - punpckhwd m%2, m%2 - pmulhrsw m%2, m%3 +%macro ITX_UNPACK_MULHRSW 8 ; dst[1-2], src, tmp, coef[1-4] + vpbroadcastd m%4, [o(pw_%5_%6x8)] + 
punpcklwd m%1, m%3, m%3 + pmulhrsw m%1, m%4 + vpbroadcastd m%4, [o(pw_%7_%8x8)] + punpckhwd m%2, m%3, m%3 + pmulhrsw m%2, m%4 %endmacro cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob @@ -2868,82 +2916,86 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob vshufi32x4 ym1, ym2, ym6, 0x03 ; 4 6 vinserti32x4 ym14, ym16, xm17, 1 ; 1 3 vshufi32x4 ym15, ym16, ym17, 0x03 ; 5 7 - pxor ym4, ym4 vpermt2q m2, m5, m6 ; 8 10 vpermt2q m16, m5, m17 ; 9 11 - mova ym5, ym4 - mova ym6, ym4 - mova ym7, ym4 vextracti32x8 ym3, m2, 1 ; 12 14 vextracti32x8 ym17, m16, 1 ; 13 15 - call m(idct_8x16_internal_8bpc).main + call m(idct_8x16_internal_8bpc).main_fast call .main_fast .end: - vpbroadcastd ym12, strided - vpbroadcastd m13, [o(pw_2048)] - pmulld ym7, ym12, [o(gather8d)] - REPX {pmulhrsw x, m13}, m0, m1, m2, m3, m8, m9, m10, m11 + vpbroadcastd ym8, strided + pmulld ym8, [o(gather8d)] + call .main_end lea r3, [dstq+strideq*4] - shl strideq, 4 - lea r4, [dstq+strideq] - add r1, r3 kxnorb k1, k1, k1 - pxor m6, m6 + lea r4, [dstq+strideq*8] + pxor m9, m9 + lea r1, [r3+strideq*8] kmovb k2, k1 - vpgatherdq m12{k1}, [r0+ym7] + vpgatherdq m12{k1}, [r0+ym8] kmovb k1, k2 - vpgatherdq m13{k2}, [r3+ym7] + vpgatherdq m13{k2}, [r3+ym8] kmovb k2, k1 - vpgatherdq m14{k1}, [r4+ym7] + vpgatherdq m14{k1}, [r4+ym8] kmovb k1, k2 - vpgatherdq m15{k2}, [r1+ym7] - REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 - punpcklbw m4, m12, m6 - punpckhbw m12, m6 - paddw m0, m4 + vpgatherdq m15{k2}, [r1+ym8] + REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {mova [cq+64*x], m9}, 0, 1, 2, 3, 4, 5, 6, 7 + punpcklbw m11, m12, m9 + punpckhbw m12, m9 + paddw m0, m11 paddw m1, m12 packuswb m0, m1 kmovb k2, k1 - vpscatterdq [r0+ym7]{k1}, m0 - punpcklbw m4, m13, m6 - punpckhbw m13, m6 - paddw m2, m4 + vpscatterdq [r0+ym8]{k1}, m0 + punpcklbw m12, m13, m9 + punpckhbw m13, m9 + paddw m2, m12 paddw m3, m13 packuswb m2, m3 kmovb k1, k2 - vpscatterdq [r3+ym7]{k2}, m2 - 
punpcklbw m4, m14, m6 - punpckhbw m14, m6 - paddw m8, m4 - paddw m9, m14 - packuswb m8, m9 + vpscatterdq [r3+ym8]{k2}, m2 + punpcklbw m13, m14, m9 + punpckhbw m14, m9 + paddw m4, m13 + paddw m5, m14 + packuswb m4, m5 kmovb k2, k1 - vpscatterdq [r4+ym7]{k1}, m8 - punpcklbw m4, m15, m6 - punpckhbw m15, m6 - paddw m10, m4 - paddw m11, m15 - packuswb m10, m11 - vpscatterdq [r1+ym7]{k2}, m10 + vpscatterdq [r4+ym8]{k1}, m4 + punpcklbw m14, m15, m9 + punpckhbw m15, m9 + paddw m6, m14 + paddw m7, m15 + packuswb m6, m7 + vpscatterdq [r1+ym8]{k2}, m6 RET .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 32 + or r3d, 32 imul r6d, 181 add r6d, 128+512 sar r6d, 8+2 jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2 INIT_YMM avx512icl ALIGN function_align -.main_fast: ; bottom half is zero - ITX_UNPACK_MULHRSW 12, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a - ITX_UNPACK_MULHRSW 21, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a - ITX_UNPACK_MULHRSW 20, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a - ITX_UNPACK_MULHRSW 19, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a +cglobal_label .main_fast2 ; bottom three-quarters are zero + ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a + ITX_UNPACK_MULHRSW 21, 20, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a + mova m11, m12 + mova m17, m20 + mova m15, m21 + mova m16, m14 + jmp .main4 +ALIGN function_align +cglobal_label .main_fast ; bottom half is zero + ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a + ITX_UNPACK_MULHRSW 21, 15, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a + ITX_UNPACK_MULHRSW 20, 16, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a + ITX_UNPACK_MULHRSW 19, 17, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a jmp .main3 ALIGN function_align -.main: +cglobal_label .main punpcklwd m12, m21, m14 ; in31 in1 punpckhwd m14, m21 ; in3 in29 punpcklwd m21, m20, m15 ; in27 in5 @@ -2970,6 +3022,7 
@@ ALIGN function_align paddsw m21, m16 ; t20 t27 psubsw m16, m14, m19 ; t22 t25 paddsw m14, m19 ; t23 t24 +.main4: ITX_MUL2X_PACK 11, 18, 19, 10, 799, 4017, 5 ; t17a t30a ITX_MUL2X_PACK 17, 18, 19, 10, m4017, 799, 5 ; t18a t29a ITX_MUL2X_PACK 15, 18, 19, 10, 3406, 2276, 5 ; t21a t26a @@ -3001,8 +3054,8 @@ ALIGN function_align REPX {pshufb x, m18}, m20, m11, m21, m19 ITX_MUL2X_PACK 15, 18, 12, 10, 8, 9, 8 ; t23a t22a ITX_MUL2X_PACK 14, 13, 15, 10, 8, 9, 8 ; t22 t25 - packssdw m18, m13 ; t23a t22 - packssdw m12, m15 ; t24a t25 + packssdw m18, m13 ; t23a t22 + packssdw m12, m15 ; t24a t25 ITX_MUL2X_PACK 16, 13, 15, 10, 8, 9, 8 ; t21a t26a ITX_MUL2X_PACK 17, 16, 14, 10, 8, 9, 8 ; t20 t27 packssdw m16, m13 ; t20 t21a @@ -3011,32 +3064,27 @@ ALIGN function_align punpckhqdq m19, m21 ; t28a t29 punpcklqdq m21, m20, m11 ; t16 t17a punpckhqdq m20, m11 ; t31 t30a - psubsw m15, m1, m19 ; out28 out29 - paddsw m1, m19 ; out3 out2 - psubsw m9, m6, m13 ; out19 out18 - paddsw m6, m13 ; out12 out13 - psubsw m10, m5, m16 ; out20 out21 - paddsw m5, m16 ; out11 out10 - psubsw m19, m3, m12 ; out24 out25 - paddsw m3, m12 ; out7 out6 - psubsw m8, m7, m21 ; out16 out17 - paddsw m7, m21 ; out15 out14 - psubsw m21, m0, m20 ; out31 out30 - paddsw m0, m20 ; out0 out1 - psubsw m11, m4, m18 ; out23 out22 - paddsw m4, m18 ; out8 out9 - psubsw m18, m2, m14 ; out27 out26 - paddsw m2, m14 ; out4 out5 INIT_ZMM avx512icl - movu m16, [o(permD+3)] - vpermt2q m0, m16, m4 ; 0 1 8 9 - vpermt2q m8, m16, m19 ; 16 17 24 25 - vpermt2q m1, m16, m5 ; 3 2 11 10 - vpermt2q m9, m16, m18 ; 19 18 27 26 - vpermt2q m2, m16, m6 ; 4 5 12 13 - vpermt2q m10, m16, m15 ; 20 21 28 29 - vpermt2q m3, m16, m7 ; 7 6 15 14 - vpermt2q m11, m16, m21 ; 23 22 31 30 + mova m15, [o(permA)] + ret +cglobal_label .main_end + vpbroadcastd m10, [o(pw_2048)] + vpermt2q m0, m15, m1 ; t0 t1 t2 t3 + vpermt2q m20, m15, m19 ; t31 t30a t29 t28a + vpermt2q m2, m15, m3 ; t4 t5 t6 t7 + vpermt2q m14, m15, m12 ; t27 t26a t25 t24a + vpermt2q m4, m15, m5 
; t8 t9 t10 t11 + vpermt2q m18, m15, m16 ; t23a t22 t21a t20 + vpermt2q m6, m15, m7 ; t12 t13 t14 t15 + vpermt2q m13, m15, m21 ; t19a t18 t17a t16 + psubsw m7, m0, m20 ; out31 out30 out29 out28 + paddsw m0, m20 ; out0 out1 out2 out3 + psubsw m5, m2, m14 ; out27 out26 out25 out24 + paddsw m2, m14 ; out4 out5 out6 out7 + psubsw m3, m4, m18 ; out23 out22 out21 out20 + paddsw m4, m18 ; out8 out9 out10 out11 + psubsw m1, m6, m13 ; out19 out18 out17 out16 + paddsw m6, m13 ; out12 out13 out14 out15 vzeroupper ret @@ -3083,16 +3131,33 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob call m(idct_8x16_internal_8bpc).main call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast .pass2: - vpbroadcastd m12, [o(pw_8192)] - vshufi32x4 m7, m3, m11, q2020 ; 7 15 23 31 - vshufi32x4 m6, m3, m11, q3131 ; 6 14 22 30 - vshufi32x4 m5, m2, m10, q3131 ; 5 13 21 29 - vshufi32x4 m4, m2, m10, q2020 ; 4 12 20 28 - vshufi32x4 m3, m1, m9, q2020 ; 3 11 19 27 - vshufi32x4 m2, m1, m9, q3131 ; 2 10 18 26 - vshufi32x4 m1, m0, m8, q3131 ; 1 9 17 15 - vshufi32x4 m0, m8, q2020 ; 0 8 16 24 - REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m10, [o(pw_8192)] + vpermt2q m0, m15, m4 ; t0 t1 t9 t8 + vpermt2q m20, m15, m18 ; t31 t30a t23a t22 + vpermt2q m3, m15, m7 ; t7 t6 t14 t15 + vpermt2q m12, m15, m21 ; t25 t24a t17a t16 + vpermt2q m2, m15, m6 ; t4 t5 t13 t12 + vpermt2q m14, m15, m13 ; t23a t22 t21a t20 + vpermt2q m1, m15, m5 ; t3 t2 t10 t11 + vpermt2q m19, m15, m16 ; t27 t26a t19a t18 + psubsw m8, m0, m20 ; out31 out30 out22 out23 + paddsw m0, m20 ; out0 out1 out9 out8 + paddsw m6, m3, m12 ; out7 out6 out14 out15 + psubsw m3, m12 ; out24 out25 out17 out16 + psubsw m5, m2, m14 ; out27 out26 out18 out19 + paddsw m4, m2, m14 ; out4 out5 out13 out12 + psubsw m7, m1, m19 ; out28 out29 out21 out20 + paddsw m2, m1, m19 ; out3 out2 out10 out11 + vzeroupper + vshufi32x4 m1, m0, m3, q1221 ; out1 out9 out17 out25 + vshufi32x4 m0, m3, q0330 ; out0 out8 out16 out24 + vshufi32x4 m3, 
m2, m5, q0330 ; out3 out11 out19 out27 + vshufi32x4 m2, m5, q1221 ; out2 out10 out18 out26 + vshufi32x4 m5, m4, m7, q1221 ; out5 out13 out21 out29 + vshufi32x4 m4, m7, q0330 ; out4 out12 out20 out28 + vshufi32x4 m7, m6, m8, q0330 ; out7 out15 out23 out31 + vshufi32x4 m6, m8, q1221 ; out6 out14 out22 out30 + REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 call .main vpbroadcastd m8, [o(pw_2048)] @@ -3136,7 +3201,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 8 + or r3d, 8 .dconly2: imul r6d, 181 add r6d, 128+512 @@ -3162,7 +3227,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob jg .dconly_loop RET ALIGN function_align -.main: +cglobal_label .main vpbroadcastd m10, [o(pd_2048)] .main2: ITX_MULSUB_2W 5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a @@ -3539,10 +3604,10 @@ ALIGN function_align .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 32 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly ALIGN function_align -.main_oddhalf_fast2: ; bottom three-quarters are zero +cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero vpbroadcastd m8, [o(pw_201_4091x8)] vpbroadcastd m20, [o(pw_m1380_3857x8)] vpbroadcastd m9, [o(pw_995_3973x8)] @@ -3557,7 +3622,7 @@ ALIGN function_align mova m16, m14 jmp .main3 ALIGN function_align -.main_oddhalf_fast: ; bottom half is zero +cglobal_label .main_oddhalf_fast ; bottom half is zero vpbroadcastd m8, [o(pw_201_4091x8)] vpbroadcastd m9, [o(pw_m2751_3035x8)] vpbroadcastd m11, [o(pw_1751_3703x8)] @@ -3576,7 +3641,7 @@ ALIGN function_align pmulhrsw m14, m12 ; t23a, t24a jmp .main2 ALIGN function_align -.main_oddhalf: +cglobal_label .main_oddhalf ITX_MUL2X_PACK 21, 8, 9, 10, 201, 4091, 5 ; t16a, t31a ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a @@ -3825,8 +3890,8 @@ ALIGN function_align .dconly: 
movsx r6d, word [cq] mov [cq], eobd + or r3d, 16 imul r6d, 181 - mov r3d, 16 add r6d, 128 sar r6d, 8 imul r6d, 181 @@ -3834,7 +3899,48 @@ ALIGN function_align sar r6d, 8+1 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3 ALIGN function_align -.main_oddhalf_fast2: ; bottom three-quarters are zero +cglobal_label .main_oddhalf_fast3 ; bottom seven-eights are zero + vpbroadcastd m8, [o(pw_2896x8)] + vpbroadcastd m4, [o(pw_4076x8)] + vpbroadcastd m3, [o(pw_401x8)] + pmulhrsw m8, m0 ; t0 + pmulhrsw m4, m14 ; t15a + pmulhrsw m3, m14 ; t8a + punpcklwd m9, m3, m4 + punpckhwd m5, m3, m4 + mova m2, m10 + vpdpwssd m2, m9, [o(pw_m3784_1567)] {bcstd} + mova m1, m10 + vpdpwssd m1, m5, [o(pw_m3784_1567)] {bcstd} + mova m6, m10 + vpdpwssd m6, m5, [o(pw_1567_3784)] {bcstd} + mova m5, m10 + vpdpwssd m5, m9, [o(pw_1567_3784)] {bcstd} + vpbroadcastd m11, [o(pw_2896_2896)] + vpbroadcastd m12, [o(pw_m2896_2896)] + psubsw m21, m8, m4 ; out15 + paddsw m0, m8, m4 ; out0 + psubsw m14, m8, m3 ; out8 + paddsw m7, m8, m3 ; out7 + REPX {psrad x, 12}, m2, m1, m6, m5 + packssdw m2, m1 ; t9a + packssdw m5, m6 ; t14a + ITX_MULSUB_2W 4, 3, 16, 17, 10, 11, 12 ; t11, t12 + psubsw m20, m8, m5 ; out14 + paddsw m1, m8, m5 ; out1 + psubsw m15, m8, m2 ; out9 + paddsw m6, m8, m2 ; out6 + ITX_MULSUB_2W 5, 2, 16, 17, 10, 11, 12 ; t10a, t13a + psubsw m18, m8, m3 ; out12 + paddsw m3, m8 ; out3 + psubsw m17, m8, m4 ; out11 + paddsw m4, m8 ; out4 + psubsw m19, m8, m2 ; out13 + paddsw m2, m8 ; out2 + psubsw m16, m8, m5 ; out10 + paddsw m5, m8 ; out5 + ret +cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero vpbroadcastd m9, [o(pw_2896x8)] vpbroadcastd m2, [o(pw_4017x8)] vpbroadcastd m3, [o(pw_799x8)] @@ -3864,7 +3970,7 @@ ALIGN function_align paddsw m2, m9 ; idct8 out2 jmp .main3 ALIGN function_align -.main_oddhalf_fast: ; bottom half is zero +cglobal_label .main_oddhalf_fast ; bottom half is zero vpbroadcastd m5, [o(pw_m2276x8)] vpbroadcastd m11, [o(pw_3406x8)] vpbroadcastd m7, [o(pw_4017x8)] @@ 
-3902,7 +4008,7 @@ ALIGN function_align pmulhrsw m15, m12 ; t12a jmp .main2 ALIGN function_align -.main_oddhalf: +cglobal_label .main_oddhalf ITX_MULSUB_2W 14, 21, 8, 9, 10, 401, 4076 ; t8a, t15a ITX_MULSUB_2W 18, 17, 8, 9, 10, 3166, 2598 ; t9a, t14a ITX_MULSUB_2W 16, 19, 8, 9, 10, 1931, 3612 ; t10a, t13a @@ -4607,10 +4713,59 @@ cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 32 + or r3d, 32 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2 ALIGN function_align -.main_oddhalf_fast2: ; bottom three-quarters are zero +cglobal_label .main_oddhalf_fast3 ; bottom seven-eights are zero + vpbroadcastd m21, [o(pw_4091x8)] + vpbroadcastd m8, [o(pw_201x8)] + vpbroadcastd m24, [o(pw_m601x8)] + vpbroadcastd m12, [o(pw_4052x8)] + pmulhrsw m21, m22 ; t31a + pmulhrsw m22, m8 ; t16a + pmulhrsw m24, m23 ; t23a + pmulhrsw m23, m12 ; t24a + + punpcklwd m9, m22, m21 + punpckhwd m8, m22, m21 + mova m15, m10 + vpdpwssd m15, m9, [o(pw_m4017_799)] {bcstd} + mova m17, m10 + vpdpwssd m17, m8, [o(pw_m4017_799)] {bcstd} + REPX {psrad x, 12}, m15, m17 + packssdw m15, m17 + mova m17, m10 + vpdpwssd m17, m8, [o(pw_799_4017)] {bcstd} + mova m8, m10 + vpdpwssd m8, m9, [o(pw_799_4017)] {bcstd} + REPX {psrad x, 12}, m17, m8 + packssdw m8, m17 + + punpcklwd m9, m24, m23 + punpckhwd m16, m24, m23 + mova m20, m10 + vpdpwssd m20, m9, [o(pw_m3406_m2276)] {bcstd} + mova m17, m10 + vpdpwssd m17, m16, [o(pw_m3406_m2276)] {bcstd} + REPX {psrad x, 12}, m20, m17 + packssdw m20, m17 + mova m17, m10 + vpdpwssd m17, m16, [o(pw_m2276_3406)] {bcstd} + mova m16, m10 + vpdpwssd m16, m9, [o(pw_m2276_3406)] {bcstd} + REPX {psrad x, 12}, m17, m16 + packssdw m16, m17 + + mova m17, m21 + mova m27, m15 + mova m25, m20 + mova m29, m8 + mova m18, m22 + mova m14, m24 + mova m28, m16 + mova m26, m23 + jmp .main4 +cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero vpbroadcastd m21, [o(pw_4091x8)] vpbroadcastd m8, [o(pw_201x8)] vpbroadcastd 
m18, [o(pw_m1380x8)] @@ -4637,7 +4792,7 @@ ALIGN function_align mova m20, m23 jmp .main3 ALIGN function_align -.main_oddhalf_fast: ; bottom half is zero +cglobal_label .main_oddhalf_fast ; bottom half is zero vpbroadcastd m21, [o(pw_4091x8)] vpbroadcastd m8, [o(pw_201x8)] vpbroadcastd m14, [o(pw_m2751x8)] @@ -4672,7 +4827,7 @@ ALIGN function_align pmulhrsw m23, m12 ; t24a jmp .main2 ALIGN function_align -.main_oddhalf: +cglobal_label .main_oddhalf ITX_MULSUB_2W 22, 21, 8, 9, 10, 201, 4091 ; t16a, t31a ITX_MULSUB_2W 14, 29, 8, 9, 10, 3035, 2751 ; t17a, t30a ITX_MULSUB_2W 26, 17, 8, 9, 10, 1751, 3703 ; t18a, t29a @@ -4703,8 +4858,6 @@ ALIGN function_align ITX_MULSUB_2W 25, 18, 9, 17, 10, m4017, 799 ; t18a, t29a ITX_MULSUB_2W 29, 26, 9, 17, 10, 3406, 2276 ; t21a, t26a ITX_MULSUB_2W 20, 16, 9, 17, 10, m2276, 3406 ; t22a, t25a - vpbroadcastd m12, [o(pw_m3784_1567)] - vpbroadcastd m11, [o(pw_1567_3784)] psubsw m17, m21, m27 ; t28a paddsw m21, m27 ; t31a psubsw m27, m15, m25 ; t18 @@ -4721,6 +4874,9 @@ ALIGN function_align psubsw m16, m26 ; t26 psubsw m26, m23, m19 ; t27a paddsw m23, m19 ; t24a +.main4: + vpbroadcastd m12, [o(pw_m3784_1567)] + vpbroadcastd m11, [o(pw_1567_3784)] ITX_MULSUB_2W 29, 27, 9, 19, 10, 11, 12 ; t18a, t29a ITX_MULSUB_2W 17, 18, 9, 19, 10, 11, 12 ; t19, t28 vpbroadcastd m11, [o(pw_m1567_m3784)] @@ -5072,13 +5228,13 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd + or r3d, 64 imul r6d, 181 - mov r3d, 64 add r6d, 128+512 sar r6d, 8+2 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 ALIGN function_align -.main_oddhalf_fast: ; bottom three-quarters are zero +cglobal_label .main_oddhalf_fast ; bottom three-quarters are zero vpbroadcastd m8, [o(pw_101_4095x8)] vpbroadcastd m21, [o(pw_m1474_3822x8)] vpbroadcastd m14, [o(pw_897_3996x8)] @@ -5105,7 +5261,7 @@ ALIGN function_align mova m20, m15 jmp .main_oddhalf2 ALIGN function_align -.main_oddhalf: +cglobal_label .main_oddhalf vpbroadcastd 
m8, [o(pw_101_4095x8)] vpbroadcastd m9, [o(pw_m2824_2967x8)] vpbroadcastd m11, [o(pw_1660_3745x8)] @@ -5286,7 +5442,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob jnz .normal movsx r6d, word [cq] mov [cq], eobd - mov r3d, 16 + or r3d, 16 .dconly: imul r6d, 181 add r6d, 128+512 @@ -6016,8 +6172,8 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd + or r3d, 64 imul r6d, 181 - mov r3d, 64 add r6d, 128 sar r6d, 8 imul r6d, 181 @@ -6025,7 +6181,33 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob sar r6d, 8+1 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3 ALIGN function_align ; bottom three-quarters are zero -.main_part1_fast: +cglobal_label .main_part1_fast2 + vpbroadcastd m7, [o(idct64_mul+4*0)] + vpbroadcastd m8, [o(idct64_mul+4*1)] + pmulhrsw m7, m0 ; t63a + pmulhrsw m0, m8 ; t32a + + punpcklwd m4, m0, m7 + punpckhwd m6, m0, m7 + mova m1, m10 + vpdpwssd m1, m4, [o(idct64_mul+4*9)] {bcstd} + mova m9, m10 + vpdpwssd m9, m6, [o(idct64_mul+4*9)] {bcstd} + REPX {psrad x, 12}, m1, m9 + packssdw m1, m9 + mova m9, m10 + vpdpwssd m9, m6, [o(idct64_mul+4*8)] {bcstd} + mova m6, m10 + vpdpwssd m6, m4, [o(idct64_mul+4*8)] {bcstd} + REPX {psrad x, 12}, m9, m6 + packssdw m6, m9 + + mova m4, m0 + mova m3, m7 + mova m5, m1 + mova m2, m6 + jmp .main_part1c +cglobal_label .main_part1_fast vpbroadcastd m1, [o(idct64_mul+4*0)] vpbroadcastd m8, [o(idct64_mul+4*1)] vpbroadcastd m2, [o(idct64_mul+4*6)] @@ -6039,7 +6221,7 @@ ALIGN function_align ; bottom three-quarters are zero mova m6, m3 mova m5, m2 jmp .main_part1b -.main_part1: +cglobal_label .main_part1 ; idct64 steps 1-5: ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a @@ -6075,8 +6257,6 @@ ALIGN function_align ; bottom three-quarters are zero ITX_MULSUB_2W 1, 8, 4, 9, 10, 11, 12 ; t33a, t62a vpbroadcastd m11, [o(idct64_mul+4*10)] ITX_MULSUB_2W 2, 6, 4, 9, 10, 12, 11 ; t34a, 
t61a - vpbroadcastd m11, [o(idct64_mul+4*11)] - vpbroadcastd m12, [o(idct64_mul+4*12)] psubsw m4, m0, m3 ; t35a paddsw m0, m3 ; t32a psubsw m3, m7, m5 ; t60a @@ -6085,6 +6265,9 @@ ALIGN function_align ; bottom three-quarters are zero paddsw m1, m2 ; t33 psubsw m2, m8, m6 ; t61 paddsw m6, m8 ; t62 +.main_part1c: + vpbroadcastd m11, [o(idct64_mul+4*11)] + vpbroadcastd m12, [o(idct64_mul+4*12)] add r5, 4*13 ITX_MULSUB_2W 3, 4, 8, 9, 10, 11, 12 ; t35, t60 ITX_MULSUB_2W 2, 5, 8, 9, 10, 11, 12 ; t34a, t61a @@ -6098,7 +6281,7 @@ ALIGN function_align ; bottom three-quarters are zero mova [r4+64*5], m5 add r4, 64*8 ret -.main_part2: +cglobal_label .main_part2 vpbroadcastd m11, [o(pw_1567_3784 -16*13)] vpbroadcastd m12, [o(pw_m3784_1567 -16*13)] lea r6, [r4+64*7] @@ -6678,8 +6861,8 @@ ALIGN function_align .dconly: movsx r6d, word [cq] mov [cq], eobd + or r3d, 32 imul r6d, 181 - mov r3d, 32 add r6d, 128 sar r6d, 8 imul r6d, 181 @@ -7121,7 +7304,7 @@ cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob .dconly: movsx r6d, word [cq] mov [cq], eobd - mov r3d, 64 + or r3d, 64 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly ALIGN function_align .pass2_end: diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/x86/mc16_avx2.asm b/prog/3rdPartyLibs/codecs/dav1d/src/x86/mc16_avx2.asm index 8b2ec4fa9..61eeaa100 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/x86/mc16_avx2.asm +++ b/prog/3rdPartyLibs/codecs/dav1d/src/x86/mc16_avx2.asm @@ -2650,23 +2650,14 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my %ifidn %1, put %assign isput 1 %assign isprep 0 - %if required_stack_alignment <= STACK_ALIGNMENT -cglobal put_8tap_scaled_16bpc, 4, 15, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax - %else cglobal put_8tap_scaled_16bpc, 4, 14, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax - %endif %xdefine base_reg r12 mov r7d, pxmaxm %else %assign isput 0 %assign isprep 1 - %if required_stack_alignment <= STACK_ALIGNMENT -cglobal prep_8tap_scaled_16bpc, 
4, 15, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax - %xdefine tmp_stridem r14q - %else cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax %define tmp_stridem qword [rsp+0xd0] - %endif %xdefine base_reg r11 %endif lea base_reg, [%1_8tap_scaled_16bpc_avx2] @@ -2698,15 +2689,9 @@ cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 %define hm r6m %endif - %if required_stack_alignment > STACK_ALIGNMENT - %define dsm [rsp+0x98] - %define rX r1 - %define rXd r1d - %else - %define dsm dsq - %define rX r14 - %define rXd r14d - %endif + %define dsm [rsp+0x98] + %define rX r1 + %define rXd r1d %else ; prep %if WIN64 mov r7d, hm @@ -3580,9 +3565,7 @@ cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free mov myd, mym %if isput - %if required_stack_alignment > STACK_ALIGNMENT - %define dsm [rsp+0xb8] - %endif + %define dsm [rsp+0xb8] movifnidn dsm, dsq mova [rsp+0xc0], xm7 %else diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/x86/mc_avx2.asm b/prog/3rdPartyLibs/codecs/dav1d/src/x86/mc_avx2.asm index 2719ef361..3b208033b 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/x86/mc_avx2.asm +++ b/prog/3rdPartyLibs/codecs/dav1d/src/x86/mc_avx2.asm @@ -2721,22 +2721,13 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 %macro MC_8TAP_SCALED 1 %ifidn %1, put %assign isprep 0 - %if required_stack_alignment <= STACK_ALIGNMENT -cglobal put_8tap_scaled_8bpc, 4, 15, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy - %else cglobal put_8tap_scaled_8bpc, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy - %endif %xdefine base_reg r12 %define rndshift 10 %else %assign isprep 1 - %if required_stack_alignment <= STACK_ALIGNMENT -cglobal prep_8tap_scaled_8bpc, 4, 15, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy - %xdefine tmp_stridem r14q - %else cglobal 
prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy - %define tmp_stridem qword [rsp+120] - %endif + %define tmp_stridem qword [rsp+120] %xdefine base_reg r11 %define rndshift 6 %endif @@ -2763,15 +2754,9 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, d DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 %define hm r6m %endif - %if required_stack_alignment > STACK_ALIGNMENT - %define dsm [rsp+112] - %define rX r1 - %define rXd r1d - %else - %define dsm dsq - %define rX r14 - %define rXd r14d - %endif + %define dsm [rsp+112] + %define rX r1 + %define rXd r1d %else ; prep %if WIN64 mov r7d, hm diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/x86/msac.asm b/prog/3rdPartyLibs/codecs/dav1d/src/x86/msac.asm index 92a3a731d..9f05c921a 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/x86/msac.asm +++ b/prog/3rdPartyLibs/codecs/dav1d/src/x86/msac.asm @@ -619,7 +619,6 @@ cglobal msac_decode_symbol_adapt16, 3, 6, 6 mov t3d, [t0+msac.update_cdf] mov t4d, t2d not t2 -%if STACK_ALIGNMENT < 32 mov r5, rsp %if WIN64 and rsp, ~31 @@ -627,11 +626,6 @@ cglobal msac_decode_symbol_adapt16, 3, 6, 6 %else and r5, ~31 %define buf r5-32 -%endif -%elif WIN64 - sub rsp, 64 -%else - %define buf rsp-56 %endif psrlw m1, m0, 6 movd [buf-4], xm2 @@ -666,11 +660,7 @@ cglobal msac_decode_symbol_adapt16, 3, 6, 6 movzx t2d, word [buf+rax-2] shr eax, 1 %if WIN64 -%if STACK_ALIGNMENT < 32 mov rsp, r5 -%else - add rsp, 64 -%endif %endif vzeroupper jmp m(msac_decode_symbol_adapt4, _sse2).renorm2 diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/x86/pal.asm b/prog/3rdPartyLibs/codecs/dav1d/src/x86/pal.asm new file mode 100644 index 000000000..27187d11f --- /dev/null +++ b/prog/3rdPartyLibs/codecs/dav1d/src/x86/pal.asm @@ -0,0 +1,641 @@ +; Copyright © 2023, VideoLAN and dav1d authors +; Copyright © 2023, Two Orioles, LLC +; All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +SECTION_RODATA 64 + +pb_0to63: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +%if ARCH_X86_64 + db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47 + db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 +%endif +pal_idx_w8_padh: db 0, 1, 2, 3, 3, 3, 3, 3, 8, 9, 10, 11, 11, 11, 11, 11 + +pb_1_16: times 4 db 1, 16 +%if ARCH_X86_64 +pb_32: times 4 db 32 +%endif + +%macro JMP_TABLE 2-* + %xdefine %1_table (%%table - 2*4) + %xdefine %%base mangle(private_prefix %+ _%1) + %%table: + %rep %0 - 1 + dd %%base %+ .w%2 - (%%table - 2*4) + %rotate 1 + %endrep +%endmacro + +JMP_TABLE pal_idx_finish_ssse3, 4, 8, 16, 32, 64 +%if ARCH_X86_64 +JMP_TABLE pal_idx_finish_avx2, 4, 8, 16, 32, 64 +JMP_TABLE pal_idx_finish_avx512icl, 4, 8, 16, 32, 64 +%endif + +SECTION .text + +INIT_XMM ssse3 +cglobal pal_idx_finish, 2, 7, 6, dst, src, bw, bh, w, h +%define base r6-pal_idx_finish_ssse3_table + LEA r6, pal_idx_finish_ssse3_table + tzcnt bwd, bwm + movifnidn bhd, bhm + movifnidn wd, wm + movifnidn hd, hm + movsxd bwq, [r6+bwq*4] + movddup m3, [base+pb_1_16] + add bwq, r6 + sub bhd, hd + jmp bwq +.w4: + mova m0, [srcq] + add srcq, 16 + pmaddubsw m0, m3 + packuswb m0, m0 + movq [dstq], m0 + add dstq, 8 + sub hd, 4 + jg .w4 + test bhd, bhd + jz .w4_end + pshuflw m0, m0, q3333 +.w4_padv: + movq [dstq], m0 + add dstq, 8 + sub bhd, 4 + jg .w4_padv +.w4_end: + RET +.w8_padh: + pshufb m0, m2 + pshufb m1, m2 + jmp .w8_main +.w8: + mova m2, [base+pal_idx_w8_padh] +.w8_loop: + mova m0, [srcq+16*0] + mova m1, [srcq+16*1] + cmp wd, 8 + jl .w8_padh +.w8_main: + pmaddubsw m0, m3 + pmaddubsw m1, m3 + add srcq, 16*2 + packuswb m0, m1 + movu [dstq], m0 + add dstq, 16 + sub hd, 4 + jg .w8_loop + test bhd, bhd + jz .w8_end + pshufd m0, m0, q3333 +.w8_padv: + movu [dstq], m0 + add dstq, 16 + sub bhd, 4 + jg .w8_padv +.w8_end: + RET +.w16_padh: + 
pshufb m0, m4 + pshufb m1, m4 + jmp .w16_main +.w16: + cmp wd, 16 + je .w16_loop + call .setup_padh +.w16_loop: + mova m0, [srcq+16*0] + mova m1, [srcq+16*1] + cmp wd, 16 + jl .w16_padh +.w16_main: + pmaddubsw m0, m3 + pmaddubsw m1, m3 + add srcq, 16*2 + packuswb m0, m1 + movu [dstq], m0 + add dstq, 16 + sub hd, 2 + jg .w16_loop + test bhd, bhd + jz .w16_end + punpckhqdq m0, m0 +.w16_padv: + movu [dstq+16*0], m0 + movu [dstq+16*1], m0 + add dstq, 16*2 + sub bhd, 4 + jg .w16_padv +.w16_end: + RET +.w32_padh: + cmp wd, 16 + jg .w32_padh2 + pshufb m1, m0, m5 + pshufb m0, m4 + jmp .w32_main +.w32_padh2: + pshufb m1, m4 + jmp .w32_main +.w32: + cmp wd, 32 + je .w32_loop + call .setup_padh +.w32_loop: + mova m0, [srcq+16*0] + mova m1, [srcq+16*1] + cmp wd, 32 + jl .w32_padh +.w32_main: + pmaddubsw m0, m3 + pmaddubsw m1, m3 + add srcq, 16*2 + packuswb m0, m1 + movu [dstq], m0 + add dstq, 16 + dec hd + jg .w32_loop + test bhd, bhd + jz .w32_end +.w32_padv: + movu [dstq+16*0], m0 + movu [dstq+16*1], m0 + movu [dstq+16*2], m0 + movu [dstq+16*3], m0 + add dstq, 16*4 + sub bhd, 4 + jg .w32_padv +.w32_end: + RET +.w64_padh: + cmp wd, 16 + jg .w64_padh2 + pshufb m1, m0, m5 + pshufb m0, m4 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + packuswb m0, m1 + packuswb m1, m1 + jmp .w64_main +.w64_padh2: + pshufb m1, m4 + pmaddubsw m0, m3 + pmaddubsw m2, m1, m3 + pshufb m1, m5 + pmaddubsw m1, m3 + packuswb m0, m2 + packuswb m1, m1 + jmp .w64_main +.w64_padh3: + cmp wd, 48 + jg .w64_padh4 + pshufb m2, m1, m5 + pshufb m1, m4 + jmp .w64_main2 +.w64_padh4: + pshufb m2, m4 + jmp .w64_main2 +.w64: + cmp wd, 64 + je .w64_loop + call .setup_padh +.w64_loop: + mova m0, [srcq+16*0] + mova m1, [srcq+16*1] + cmp wd, 32 + jle .w64_padh + pmaddubsw m0, m3 + pmaddubsw m1, m3 + packuswb m0, m1 + mova m1, [srcq+16*2] + mova m2, [srcq+16*3] + cmp wd, 64 + jl .w64_padh3 +.w64_main2: + pmaddubsw m1, m3 + pmaddubsw m2, m3 + packuswb m1, m2 +.w64_main: + add srcq, 16*4 + movu [dstq+16*0], m0 + movu [dstq+16*1], m1 
+ add dstq, 16*2 + dec hd + jg .w64_loop + test bhd, bhd + jz .w64_end +.w64_padv: + movu [dstq+16*0], m0 + movu [dstq+16*1], m1 + movu [dstq+16*2], m0 + movu [dstq+16*3], m1 + add dstq, 16*4 + sub bhd, 2 + jg .w64_padv +.w64_end: + RET +.setup_padh: + mova m4, [base+pb_0to63] + lea r6d, [wq-1] + and r6d, 15 + movd m5, r6d + pxor m0, m0 + pshufb m5, m0 + pminub m4, m5 + ret + +%if ARCH_X86_64 + +INIT_YMM avx2 +cglobal pal_idx_finish, 4, 7, 5, dst, src, bw, bh, w, h +%define base r6-pal_idx_finish_avx2_table + lea r6, [pal_idx_finish_avx2_table] + tzcnt bwd, bwd + movifnidn wd, wm + movifnidn hd, hm + movsxd bwq, [r6+bwq*4] + vpbroadcastd m2, [base+pb_1_16] + dec wd + add bwq, r6 + sub bhd, hd + jmp bwq +.w4: + mova xm0, [srcq] + add srcq, 16 + pmaddubsw xm0, xm2 + packuswb xm0, xm0 + movq [dstq], xm0 + add dstq, 8 + sub hd, 4 + jg .w4 + test bhd, bhd + jz .w4_end + pshuflw xm0, xm0, q3333 +.w4_padv: + movq [dstq], xm0 + add dstq, 8 + sub bhd, 4 + jg .w4_padv +.w4_end: + RET +.w8_padh: + pshufb xm0, xm3 + pshufb xm1, xm3 + jmp .w8_main +.w8: + mova xm3, [base+pal_idx_w8_padh] +.w8_loop: + mova xm0, [srcq+16*0] + mova xm1, [srcq+16*1] + cmp wd, 7 + jl .w8_padh +.w8_main: + pmaddubsw xm0, xm2 + pmaddubsw xm1, xm2 + add srcq, 16*2 + packuswb xm0, xm1 + movu [dstq], xm0 + add dstq, 16 + sub hd, 4 + jg .w8_loop + test bhd, bhd + jz .w8_end + pshufd xm0, xm0, q3333 +.w8_padv: + movu [dstq], xm0 + add dstq, 16 + sub bhd, 4 + jg .w8_padv +.w8_end: + RET +.w16_padh: + pshufb m0, m3 + pshufb m1, m3 + jmp .w16_main +.w16: + cmp wd, 15 + je .w16_loop + vbroadcasti128 m0, [base+pb_0to63] + movd xm3, wd + vpbroadcastb m3, xm3 + pminub m3, m0 +.w16_loop: + mova m0, [srcq+32*0] + mova m1, [srcq+32*1] + cmp wd, 15 + jl .w16_padh +.w16_main: + pmaddubsw m0, m2 + pmaddubsw m1, m2 + add srcq, 32*2 + packuswb m0, m1 + vpermq m1, m0, q3120 + movu [dstq], m1 + add dstq, 32 + sub hd, 4 + jg .w16_loop + test bhd, bhd + jz .w16_end + vpermq m0, m0, q3333 +.w16_padv: + movu [dstq], m0 + add 
dstq, 32 + sub bhd, 4 + jg .w16_padv +.w16_end: + RET +.w32_padh: + cmp wd, 15 + jg .w32_padh2 + vinserti128 m0, xm0, 1 + vinserti128 m1, xm1, 1 +.w32_padh2: + pshufb m0, m3 + pshufb m1, m3 + jmp .w32_main +.w32: + cmp wd, 31 + je .w32_loop + movd xm3, wd + vpbroadcastb m3, xm3 + pminub m3, [base+pb_0to63] +.w32_loop: + mova m0, [srcq+32*0] + mova m1, [srcq+32*1] + cmp wd, 31 + jl .w32_padh +.w32_main: + pmaddubsw m0, m2 + pmaddubsw m1, m2 + add srcq, 32*2 + packuswb m0, m1 + vpermq m1, m0, q3120 + movu [dstq], m1 + add dstq, 32 + sub hd, 2 + jg .w32_loop + test bhd, bhd + jz .w32_end + vpermq m0, m0, q3131 +.w32_padv: + movu [dstq+32*0], m0 + movu [dstq+32*1], m0 + add dstq, 32*2 + sub bhd, 4 + jg .w32_padv +.w32_end: + RET +.w64_padh: + cmp wd, 15 + jg .w64_padh2 + vinserti128 m1, m0, xm0, 1 + pshufb m0, m1, m3 + pshufb m1, m4 + jmp .w64_main +.w64_padh2: + cmp wd, 31 + jg .w64_padh3 + vperm2i128 m1, m0, m0, 0x11 + pshufb m0, m3 + pshufb m1, m4 + jmp .w64_main +.w64_padh3: + cmp wd, 47 + jg .w64_padh4 + vinserti128 m1, xm1, 1 +.w64_padh4: + pshufb m1, m3 + jmp .w64_main +.w64: + cmp wd, 63 + je .w64_loop + mov r6d, wd + and r6d, 31 + movd xm4, r6d + vpbroadcastb m4, xm4 + pminub m3, m4, [pb_0to63] +.w64_loop: + mova m0, [srcq+32*0] + mova m1, [srcq+32*1] + cmp wd, 63 + jl .w64_padh +.w64_main: + pmaddubsw m0, m2 + pmaddubsw m1, m2 + add srcq, 32*2 + packuswb m0, m1 + vpermq m0, m0, q3120 + movu [dstq], m0 + add dstq, 32 + dec hd + jg .w64_loop + test bhd, bhd + jz .w64_end +.w64_padv: + movu [dstq+32*0], m0 + movu [dstq+32*1], m0 + movu [dstq+32*2], m0 + movu [dstq+32*3], m0 + add dstq, 32*4 + sub bhd, 4 + jg .w64_padv +.w64_end: + RET + +INIT_ZMM avx512icl +cglobal pal_idx_finish, 4, 7, 7, dst, src, bw, bh, w, h +%define base r6-pal_idx_finish_avx512icl_table + lea r6, [pal_idx_finish_avx512icl_table] + tzcnt bwd, bwd + movifnidn wd, wm + movifnidn hd, hm + movsxd bwq, [r6+bwq*4] + vpbroadcastd m4, [base+pb_1_16] + dec wd + add bwq, r6 + sub bhd, hd + jmp bwq 
+.w4: + mova xmm0, [srcq] + add srcq, 16 + pmaddubsw xmm0, xm4 + packuswb xmm0, xmm0 + movq [dstq], xmm0 + add dstq, 8 + sub hd, 4 + jg .w4 + test bhd, bhd + jz .w4_end + pshuflw xmm0, xmm0, q3333 +.w4_padv: + movq [dstq], xmm0 + add dstq, 8 + sub bhd, 4 + jg .w4_padv +.w4_end: + RET +.w8_padh: + pshufb xmm0, xmm2 + pshufb xmm1, xmm2 + jmp .w8_main +.w8: + mova xmm2, [base+pal_idx_w8_padh] +.w8_loop: + mova xmm0, [srcq+16*0] + mova xmm1, [srcq+16*1] + cmp wd, 7 + jl .w8_padh +.w8_main: + pmaddubsw xmm0, xm4 + pmaddubsw xmm1, xm4 + add srcq, 16*2 + packuswb xmm0, xmm1 + movu [dstq], xmm0 + add dstq, 16 + sub hd, 4 + jg .w8_loop + test bhd, bhd + jz .w8_end + pshufd xmm0, xmm0, q3333 +.w8_padv: + movu [dstq], xmm0 + add dstq, 16 + sub bhd, 4 + jg .w8_padv +.w8_end: + RET +.w16_padh: + pshufb m0, m2 + jmp .w16_main +.w16: + cmp wd, 15 + je .w16_loop + vbroadcasti32x4 m2, [base+pb_0to63] + vpbroadcastb m0, wd + pminub m2, m0 +.w16_loop: + mova m0, [srcq] + cmp wd, 15 + jl .w16_padh +.w16_main: + pmaddubsw m0, m4 + add srcq, 64 + vpmovwb ym0, m0 + movu [dstq], ym0 + add dstq, 32 + sub hd, 4 + jg .w16_loop + test bhd, bhd + jz .w16_end + vpermq ym0, ym0, q3333 +.w16_padv: + movu [dstq], ym0 + add dstq, 32 + sub bhd, 4 + jg .w16_padv +.w16_end: + RET +.w32_padh: + vpermb m0, m2, m0 + vpermb m1, m2, m1 + jmp .w32_main +.w32: + mova m2, [base+pb_0to63] + paddb m3, m2, m2 + cmp wd, 31 + je .w32_loop + vpbroadcastb m0, wd + mov r6d, 0xff00 + kmovw k1, r6d + vpaddd m0{k1}, [pb_32] {1to16} + pminub m2, m0 +.w32_loop: + mova m0, [srcq+64*0] + mova m1, [srcq+64*1] + cmp wd, 31 + jl .w32_padh +.w32_main: + pmaddubsw m0, m4 + pmaddubsw m1, m4 + add srcq, 64*2 + vpermt2b m0, m3, m1 + movu [dstq], m0 + add dstq, 64 + sub hd, 4 + jg .w32_loop + test bhd, bhd + jz .w32_end + vshufi32x4 m0, m0, q3333 +.w32_padv: + movu [dstq], m0 + add dstq, 64 + sub bhd, 4 + jg .w32_padv +.w32_end: + RET +.w64_padh: + REPX {vpermb x, m5, x}, m0, m1, m2, m3 + jmp .w64_main +.w64: + mova m5, 
[base+pb_0to63] + paddb m6, m5, m5 + cmp wd, 63 + je .w64_loop + vpbroadcastb m0, wd + pminub m5, m0 +.w64_loop: + mova m0, [srcq+64*0] + mova m1, [srcq+64*1] + mova m2, [srcq+64*2] + mova m3, [srcq+64*3] + cmp wd, 63 + jl .w64_padh +.w64_main: + REPX {pmaddubsw x, m4}, m0, m1, m2, m3 + add srcq, 64*4 + vpermt2b m0, m6, m1 + vpermt2b m2, m6, m3 + movu [dstq+64*0], m0 + movu [dstq+64*1], m2 + add dstq, 64*2 + sub hd, 4 + jg .w64_loop + test bhd, bhd + jz .w64_end + vshufi32x4 m2, m2, q3232 +.w64_padv: + movu [dstq+64*0], m2 + movu [dstq+64*1], m2 + add dstq, 64*2 + sub bhd, 4 + jg .w64_padv +.w64_end: + RET + +%endif ; ARCH_X86_64 diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/x86/pal.h b/prog/3rdPartyLibs/codecs/dav1d/src/x86/pal.h new file mode 100644 index 000000000..7cd2e68d5 --- /dev/null +++ b/prog/3rdPartyLibs/codecs/dav1d/src/x86/pal.h @@ -0,0 +1,50 @@ +/* + * Copyright © 2023, VideoLAN and dav1d authors + * Copyright © 2023, Two Orioles, LLC + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/cpu.h" + +decl_pal_idx_finish_fn(dav1d_pal_idx_finish_ssse3); +decl_pal_idx_finish_fn(dav1d_pal_idx_finish_avx2); +decl_pal_idx_finish_fn(dav1d_pal_idx_finish_avx512icl); + +static ALWAYS_INLINE void pal_dsp_init_x86(Dav1dPalDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + + c->pal_idx_finish = dav1d_pal_idx_finish_ssse3; + +#if ARCH_X86_64 + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + + c->pal_idx_finish = dav1d_pal_idx_finish_avx2; + + if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + + c->pal_idx_finish = dav1d_pal_idx_finish_avx512icl; +#endif +} diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/x86/refmvs.asm b/prog/3rdPartyLibs/codecs/dav1d/src/x86/refmvs.asm index fb4ca1033..d95861fa1 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/x86/refmvs.asm +++ b/prog/3rdPartyLibs/codecs/dav1d/src/x86/refmvs.asm @@ -38,19 +38,238 @@ SECTION_RODATA 64 %endrep %endmacro +%macro SAVE_TMVS_TABLE 3 ; num_entries, w, suffix + %rep %1 + db %2*3 + db mangle(private_prefix %+ _save_tmvs_%3).write%2 - \ + mangle(private_prefix %+ _save_tmvs_%3).write1 + %endrep +%endmacro + %if ARCH_X86_64 +mv_proj: dw 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340 + dw 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092 + dw 1024, 963, 910, 862, 819, 780, 744, 712 + dw 682, 655, 630, 606, 585, 564, 546, 528 splat_mv_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 
2, 3 db 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7 db 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3 +%endif +save_pack0: db 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0 + db 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1 +save_pack1: db 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2 + db 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3 +save_ref_shuf: db 0, -1, -1, -1, 1, -1, -1, -1, 8, -1, -1, -1, 9, -1, -1, -1 +cond_shuf512: db 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3 +save_cond0: db 0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00 +save_cond1: db 0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00 +pb_128: times 16 db 128 +pq_8192: dq 8192 + +save_tmvs_ssse3_table: SAVE_TMVS_TABLE 2, 16, ssse3 + SAVE_TMVS_TABLE 4, 8, ssse3 + SAVE_TMVS_TABLE 4, 4, ssse3 + SAVE_TMVS_TABLE 5, 2, ssse3 + SAVE_TMVS_TABLE 7, 1, ssse3 + +%if ARCH_X86_64 +save_tmvs_avx2_table: SAVE_TMVS_TABLE 2, 16, avx2 + SAVE_TMVS_TABLE 4, 8, avx2 + SAVE_TMVS_TABLE 4, 4, avx2 + SAVE_TMVS_TABLE 5, 2, avx2 + SAVE_TMVS_TABLE 7, 1, avx2 + +save_tmvs_avx512icl_table: SAVE_TMVS_TABLE 2, 16, avx512icl + SAVE_TMVS_TABLE 4, 8, avx512icl + SAVE_TMVS_TABLE 4, 4, avx512icl + SAVE_TMVS_TABLE 5, 2, avx512icl + SAVE_TMVS_TABLE 7, 1, avx512icl JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32 JMP_TABLE splat_mv_avx2, 1, 2, 4, 8, 16, 32 %endif + JMP_TABLE splat_mv_sse2, 1, 2, 4, 8, 16, 32 SECTION .text +%macro movif32 2 +%if ARCH_X86_32 + mov %1, %2 +%endif +%endmacro + +INIT_XMM ssse3 +; refmvs_temporal_block *rp, ptrdiff_t stride, +; refmvs_block **rr, uint8_t *ref_sign, +; int col_end8, int row_end8, int col_start8, int row_start8 +%if ARCH_X86_64 +cglobal save_tmvs, 4, 13, 11, rp, stride, rr, ref_sign, \ + xend, yend, xstart, ystart +%define base_reg r12 +%else +cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, \ + xend, yend, xstart, ystart + movq m5, [ref_signq] + lea strided, [strided*5] + mov stridem, strided + mov r3, xstartm + mov r1, ystartm + 
DEFINE_ARGS b, ystart, rr, cand, xend, x +%define stridemp r1m +%define m8 [base+pb_128] +%define m9 [base+save_pack0+ 0] +%define m10 [base+save_pack0+16] +%define base_reg r6 +%endif +%define base base_reg-.write1 + LEA base_reg, .write1 +%if ARCH_X86_64 + movifnidn xendd, xendm + movifnidn yendd, yendm + mov xstartd, xstartm + mov ystartd, ystartm + movq m5, [ref_signq] +%endif + movu m4, [base+save_ref_shuf] + movddup m6, [base+save_cond0] + movddup m7, [base+save_cond1] +%if ARCH_X86_64 + mova m8, [base+pb_128] + mova m9, [base+save_pack0+ 0] + mova m10, [base+save_pack0+16] +%endif + psllq m5, 8 +%if ARCH_X86_64 + lea r9d, [xendq*5] + lea xstartd, [xstartq*5] + sub yendd, ystartd + add ystartd, ystartd + lea strideq, [strideq*5] + sub xstartq, r9 + add xendd, r9d + add rpq, r9 + DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand +%else + lea r0, [xendd*5] ; xend5 + lea r3, [r3*5] ; xstart5 + sub r3, r0 ; -w5 + mov r6m, r3 +%define xstartq r6m + add xendd, r0 ; xend6 + add r0m, r0 ; rp+xend5 + mov xendm, xendd + sub r5, r1 ; h + add r1, r1 + mov r7m, r1 + mov r5m, r5 +%define hd r5mp + jmp .loop_y_noload +%endif +.loop_y: + movif32 ystartd, r7m + movif32 xendd, xendm +.loop_y_noload: + and ystartd, 30 + mov xq, xstartq + mov bq, [rrq+ystartq*gprsize] + add ystartd, 2 + movif32 r7m, ystartd + lea bq, [bq+xendq*4] +.loop_x: +%if ARCH_X86_32 +%define rpq r3 +%define r10 r1 +%define r10d r1 +%define r11 r4 +%define r11d r4 +%endif + imul candq, xq, 0x9999 ; x / 5 * 3 + sar candq, 16 + movzx r10d, byte [bq+candq*8+22] ; cand_b->bs + movu m0, [bq+candq*8+12] ; cand_b + movzx r11d, byte [base+save_tmvs_ssse3_table+r10*2+0] + movzx r10d, byte [base+save_tmvs_ssse3_table+r10*2+1] + add r10, base_reg + add candq, r11 + jge .calc + movu m1, [bq+candq*8+12] + movzx r11d, byte [bq+candq*8+22] + movzx r11d, byte [base+save_tmvs_ssse3_table+r11*2+1] + add r11, base_reg +.calc: + movif32 rpq, r0m + ; ref check + punpckhqdq m2, m0, m1 + pshufb m2, m4 ; b0.ref0 
b0.ref1 b1.ref0 b1.ref1 | ... + pshufb m3, m5, m2 ; ref > 0 && res_sign[ref - 1] + ; mv check + punpcklqdq m2, m0, m1 ; b0.mv0 b0.mv1 b1.mv0 b1.mv1 | ... + pabsw m2, m2 + psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096 + ; res + pcmpgtd m3, m2 + pshufd m2, m3, q2301 + pand m3, m6 ; b0c0 b0c1 b1c0 b1c1 | ... + pand m2, m7 ; b0c1 b0c0 b1c1 b1c0 | ... + por m3, m2 ; b0.shuf b1.shuf | ... + pxor m3, m8 ; if cond0|cond1 == 0 => zero out + pshufb m0, m3 + pshufb m1, m3 + call r10 + jge .next_line + pshufd m0, m1, q3232 + call r11 + jl .loop_x +.next_line: + add rpq, stridemp + movif32 r0m, rpq + dec hd + jg .loop_y + RET +.write1: + movd [rpq+xq+0], m0 + psrlq m0, 8 + movd [rpq+xq+1], m0 + add xq, 5*1 + ret +.write2: + movq [rpq+xq+0], m0 + psrlq m0, 8 + movd [rpq+xq+6], m0 + add xq, 5*2 + ret +.write4: + pshufb m0, m9 + movu [rpq+xq+ 0], m0 + psrlq m0, 8 + movd [rpq+xq+16], m0 + add xq, 5*4 + ret +.write8: + pshufb m2, m0, m9 + movu [rpq+xq+ 0], m2 + pshufb m0, m10 + movu [rpq+xq+16], m0 + psrldq m2, 2 + movq [rpq+xq+32], m2 + add xq, 5*8 + ret +.write16: + pshufb m2, m0, m9 + movu [rpq+xq+ 0], m2 + pshufb m0, m10 + movu [rpq+xq+16], m0 + shufps m2, m0, q1032 + movu [rpq+xq+48], m2 + shufps m2, m0, q2121 + movu [rpq+xq+32], m2 + shufps m0, m2, q1032 + movu [rpq+xq+64], m0 + add xq, 5*16 + ret + INIT_XMM sse2 ; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4 cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4 @@ -115,7 +334,333 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4 RET %if ARCH_X86_64 +INIT_XMM sse4 +; refmvs_frame *rf, int tile_row_idx, +; int col_start8, int col_end8, int row_start8, int row_end8 +cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \ + stride, rp_proj, roff, troff, \ + xendi, xstarti, iw8, ih8, dst + xor r14d, r14d + cmp dword [rfq+212], 1 ; n_tile_threads + mov ih8d, [rfq+20] ; rf->ih8 + mov iw8d, [rfq+16] ; rf->iw8 + mov xstartd, xstartd + mov xendd, xendd + cmove tridxd, r14d + lea xstartid, [xstartq-8] + 
lea xendid, [xendq+8] + mov strideq, [rfq+184] + mov rp_projq, [rfq+176] + cmp ih8d, yendd + mov [rsp+0x30], strideq + cmovs yendd, ih8d + test xstartid, xstartid + cmovs xstartid, r14d + cmp iw8d, xendid + cmovs xendid, iw8d + mov troffq, strideq + shl troffq, 4 + imul troffq, tridxq + mov dstd, ystartd + and dstd, 15 + imul dstq, strideq + add dstq, troffq ; (16 * tridx + (ystart & 15)) * stride + lea dstq, [dstq*5] + add dstq, rp_projq + lea troffq, [troffq*5] ; 16 * tridx * stride * 5 + lea r13d, [xendq*5] + lea r12, [strideq*5] + DEFINE_ARGS rf, w5, xstart, xend, ystart, yend, h, x5, \ + _, troff, xendi, xstarti, stride5, _, dst + lea w5d, [xstartq*5] + add r7, troffq ; rp_proj + tile_row_offset + mov hd, yendd + mov [rsp+0x28], r7 + add dstq, r13 + sub w5q, r13 + sub hd, ystartd +.init_xloop_start: + mov x5q, w5q + test w5b, 1 + jz .init_2blk + mov dword [dstq+x5q], 0x80008000 + add x5q, 5 + jz .init_next_row +.init_2blk: + mov dword [dstq+x5q+0], 0x80008000 + mov dword [dstq+x5q+5], 0x80008000 + add x5q, 10 + jl .init_2blk +.init_next_row: + add dstq, stride5q + dec hd + jg .init_xloop_start + DEFINE_ARGS rf, _, xstart, xend, ystart, yend, n7, stride, \ + _, _, xendi, xstarti, stride5, _, n + mov r13d, [rfq+152] ; rf->n_mfmvs + test r13d, r13d + jz .ret + mov [rsp+0x0c], r13d + mov strideq, [rsp+0x30] + movddup m3, [pq_8192] + mov r9d, ystartd + mov [rsp+0x38], yendd + mov [rsp+0x20], xstartid + xor nd, nd + xor n7d, n7d + imul r9, strideq ; ystart * stride + mov [rsp+0x48], rfq + mov [rsp+0x18], stride5q + lea r7, [r9*5] + mov [rsp+0x24], ystartd + mov [rsp+0x00], r7 +.nloop: + DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \ + ref, rp_ref, xendi, xstarti, _, _, n + mov rfq, [rsp+0x48] + mov refd, [rfq+56+nq*4] ; ref2cur + cmp refd, 0x80000000 + je .next_n + mov [rsp+0x40], refd + mov offq, [rsp+0x00] ; ystart * stride * 5 + movzx refd, byte [rfq+53+nq] ; rf->mfmv_ref[n] + lea refsignq, [refq-4] + mov rp_refq, [rfq+168] + movq m2, refsignq + add 
offq, [rp_refq+refq*8] ; r = rp_ref[ref] + row_offset + mov [rsp+0x14], nd + mov yd, ystartd +.yloop: + mov r11d, [rsp+0x24] ; ystart + mov r12d, [rsp+0x38] ; yend + mov r14d, yd + and r14d, ~7 ; y_sb_align + cmp r11d, r14d + cmovs r11d, r14d ; imax(y_sb_align, ystart) + mov [rsp+0x44], r11d ; y_proj_start + add r14d, 8 + cmp r12d, r14d + cmovs r14d, r12d ; imin(y_sb_align + 8, yend) + mov [rsp+0x3c], r14d ; y_proj_end + DEFINE_ARGS y, src, xstart, xend, frac, rf, n7, mv, \ + ref, x, xendi, mvx, mvy, rb, ref2ref + mov xd, [rsp+0x20] ; xstarti +.xloop: + lea rbd, [xq*5] + add rbq, srcq + movsx refd, byte [rbq+4] + test refd, refd + jz .next_x_bad_ref + mov rfq, [rsp+0x48] + lea r14d, [16+n7q+refq] + mov ref2refd, [rfq+r14*4] ; rf->mfmv_ref2ref[n][b_ref-1] + test ref2refd, ref2refd + jz .next_x_bad_ref + lea fracq, [mv_proj] + movzx fracd, word [fracq+ref2refq*2] + mov mvd, [rbq] + imul fracd, [rsp+0x40] ; ref2cur + pmovsxwq m0, [rbq] + movd m1, fracd + punpcklqdq m1, m1 + pmuldq m0, m1 ; mv * frac + pshufd m1, m0, q3311 + paddd m0, m3 + paddd m0, m1 + psrad m0, 14 ; offset = (xy + (xy >> 31) + 8192) >> 14 + pabsd m1, m0 + packssdw m0, m0 + psrld m1, 6 + packuswb m1, m1 + pxor m0, m2 ; offset ^ ref_sign + psignd m1, m0 ; apply_sign(abs(offset) >> 6, offset ^ refsign) + movq mvxq, m1 + lea mvyd, [mvxq+yq] ; ypos + sar mvxq, 32 + DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, \ + ref, x, xendi, mvx, ypos, rb, ref2ref + cmp yposd, [rsp+0x44] ; y_proj_start + jl .next_x_bad_pos_y + cmp yposd, [rsp+0x3c] ; y_proj_end + jge .next_x_bad_pos_y + and yposd, 15 + add mvxq, xq ; xpos + imul yposq, [rsp+0x30] ; pos = (ypos & 15) * stride + DEFINE_ARGS y, src, xstart, xend, dst, _, n7, mv, \ + ref, x, xendi, xpos, pos, rb, ref2ref + mov dstq, [rsp+0x28] ; dst = rp_proj + tile_row_offset + add posq, xposq ; pos += xpos + lea posq, [posq*5] + add dstq, posq ; dst += pos5 + jmp .write_loop_entry +.write_loop: + add rbq, 5 + cmp refb, byte [rbq+4] + jne .xloop + cmp mvd, [rbq] + jne 
.xloop + add dstq, 5 + inc xposd +.write_loop_entry: + mov r12d, xd + and r12d, ~7 + lea r5d, [r12-8] + cmp r5d, xstartd + cmovs r5d, xstartd ; x_proj_start + cmp xposd, r5d + jl .next_xpos + add r12d, 16 + cmp xendd, r12d + cmovs r12d, xendd ; x_proj_end + cmp xposd, r12d + jge .next_xpos + mov [dstq+0], mvd + mov byte [dstq+4], ref2refb +.next_xpos: + inc xd + cmp xd, xendid + jl .write_loop +.next_y: + DEFINE_ARGS y, src, xstart, xend, ystart, _, n7, _, _, x, xendi, _, _, _, n + add srcq, [rsp+0x18] ; stride5 + inc yd + cmp yd, [rsp+0x38] ; yend + jne .yloop + mov nd, [rsp+0x14] + mov ystartd, [rsp+0x24] +.next_n: + add n7d, 7 + inc nd + cmp nd, [rsp+0x0c] ; n_mfmvs + jne .nloop +.ret: + RET +.next_x: + DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, ref, x, xendi, _, _, rb, _ + add rbq, 5 + cmp refb, byte [rbq+4] + jne .xloop + cmp mvd, [rbq] + jne .xloop +.next_x_bad_pos_y: + inc xd + cmp xd, xendid + jl .next_x + jmp .next_y +.next_x_bad_ref: + inc xd + cmp xd, xendid + jl .xloop + jmp .next_y + INIT_YMM avx2 +; refmvs_temporal_block *rp, ptrdiff_t stride, +; refmvs_block **rr, uint8_t *ref_sign, +; int col_end8, int row_end8, int col_start8, int row_start8 +cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign, \ + xend, yend, xstart, ystart +%define base r12-.write1 + lea r12, [.write1] + movifnidn xendd, xendm + movifnidn yendd, yendm + mov xstartd, xstartm + mov ystartd, ystartm + vpbroadcastq m4, [ref_signq] + vpbroadcastq m3, [base+save_ref_shuf+8] + vpbroadcastq m5, [base+save_cond0] + vpbroadcastq m6, [base+save_cond1] + vpbroadcastd m7, [base+pb_128] + mova m8, [base+save_pack0] + mova m9, [base+save_pack1] + psllq m4, 8 + lea r9d, [xendq*5] + lea xstartd, [xstartq*5] + sub yendd, ystartd + add ystartd, ystartd + lea strideq, [strideq*5] + sub xstartq, r9 + add xendd, r9d + add rpq, r9 + DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand +.loop_y: + and ystartd, 30 + mov xq, xstartq + mov bq, [rrq+ystartq*8] + add ystartd, 2 + lea bq, 
[bq+xendq*4] +.loop_x: + imul candq, xq, 0x9999 + sar candq, 16 ; x / 5 * 3 + movzx r10d, byte [bq+candq*8+22] ; cand_b->bs + movu xm0, [bq+candq*8+12] ; cand_b + movzx r11d, byte [base+save_tmvs_avx2_table+r10*2+0] + movzx r10d, byte [base+save_tmvs_avx2_table+r10*2+1] + add r10, r12 + add candq, r11 + jge .calc + vinserti128 m0, [bq+candq*8+12], 1 + movzx r11d, byte [bq+candq*8+22] + movzx r11d, byte [base+save_tmvs_avx2_table+r11*2+1] + add r11, r12 +.calc: + pshufb m1, m0, m3 + pabsw m2, m0 + pshufb m1, m4, m1 ; ref > 0 && res_sign[ref - 1] + psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096 + pcmpgtd m1, m2 + pshufd m2, m1, q2301 + pand m1, m5 ; b0.cond0 b1.cond0 + pand m2, m6 ; b0.cond1 b1.cond1 + por m1, m2 ; b0.shuf b1.shuf + pxor m1, m7 ; if cond0|cond1 == 0 => zero out + pshufb m0, m1 + call r10 + jge .next_line + vextracti128 xm0, m0, 1 + call r11 + jl .loop_x +.next_line: + add rpq, strideq + dec hd + jg .loop_y + RET +.write1: + movd [rpq+xq+ 0], xm0 + pextrb [rpq+xq+ 4], xm0, 4 + add xq, 5*1 + ret +.write2: + movq [rpq+xq+0], xm0 + psrlq xm1, xm0, 8 + movd [rpq+xq+6], xm1 + add xq, 5*2 + ret +.write4: + pshufb xm1, xm0, xm8 + movu [rpq+xq+ 0], xm1 + psrlq xm1, 8 + movd [rpq+xq+16], xm1 + add xq, 5*4 + ret +.write8: + vinserti128 m1, m0, xm0, 1 + pshufb m1, m8 + movu [rpq+xq+ 0], m1 + psrldq xm1, 2 + movq [rpq+xq+32], xm1 + add xq, 5*8 + ret +.write16: + vinserti128 m1, m0, xm0, 1 + pshufb m2, m1, m8 + movu [rpq+xq+ 0], m2 + pshufb m1, m9 + movu [rpq+xq+32], m1 + shufps xm2, xm1, q1021 + movu [rpq+xq+64], xm2 + add xq, 5*16 + ret + cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4 add bx4d, bw4d tzcnt bw4d, bw4d @@ -170,6 +715,125 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4 jg .loop RET +INIT_ZMM avx512icl +; refmvs_temporal_block *rp, ptrdiff_t stride, +; refmvs_block **rr, uint8_t *ref_sign, +; int col_end8, int row_end8, int col_start8, int row_start8 +cglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign, \ + xend, yend, xstart, ystart +%define 
base r14-.write1 + lea r14, [.write1] + movifnidn xendd, xendm + movifnidn yendd, yendm + mov xstartd, xstartm + mov ystartd, ystartm + psllq m4, [ref_signq]{bcstq}, 8 + vpbroadcastq m3, [base+save_ref_shuf+8] + vbroadcasti32x4 m5, [base+cond_shuf512] + vbroadcasti32x4 m6, [base+save_cond0] + vpbroadcastd m7, [base+pb_128] + mova m8, [base+save_pack0] + movu xm9, [base+save_pack0+4] + lea r9d, [xendq*5] + lea xstartd, [xstartq*5] + sub yendd, ystartd + add ystartd, ystartd + lea strideq, [strideq*5] + sub xstartq, r9 + add xendd, r9d + add rpq, r9 + mov r10d, 0x1f + kmovb k2, r10d + DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand +.loop_y: + and ystartd, 30 + mov xq, xstartq + mov bq, [rrq+ystartq*8] + add ystartd, 2 + lea bq, [bq+xendq*4] +.loop_x: + imul candq, xq, 0x9999 + sar candq, 16 ; x / 5 * 3 + movzx r10d, byte [bq+candq*8+22] ; cand_b->bs + movu xm0, [bq+candq*8+12] ; cand_b + movzx r11d, byte [base+save_tmvs_avx512icl_table+r10*2+0] + movzx r10d, byte [base+save_tmvs_avx512icl_table+r10*2+1] + add r10, r14 + add candq, r11 + jge .calc + movzx r11d, byte [bq+candq*8+22] + vinserti32x4 ym0, [bq+candq*8+12], 1 + movzx r12d, byte [base+save_tmvs_avx512icl_table+r11*2+0] + movzx r11d, byte [base+save_tmvs_avx512icl_table+r11*2+1] + add r11, r14 + add candq, r12 + jge .calc + movzx r12d, byte [bq+candq*8+22] + vinserti32x4 m0, [bq+candq*8+12], 2 + movzx r13d, byte [base+save_tmvs_avx512icl_table+r12*2+0] + movzx r12d, byte [base+save_tmvs_avx512icl_table+r12*2+1] + add r12, r14 + add candq, r13 + jge .calc + vinserti32x4 m0, [bq+candq*8+12], 3 + movzx r13d, byte [bq+candq*8+22] + movzx r13d, byte [base+save_tmvs_avx512icl_table+r13*2+1] + add r13, r14 +.calc: + pshufb m1, m0, m3 + pabsw m2, m0 + pshufb m1, m4, m1 ; ref > 0 && res_sign[ref - 1] + psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096 + psubd m2, m1 + pshufb m2, m5 ; c0 c1 c1 c0 + pand m2, m6 + punpckhqdq m1, m2, m2 + vpternlogd m1, m2, m7, 0x56 ; (c0shuf | c1shuf) ^ 0x80 + pshufb m2, m0, 
m1 + mova xm0, xm2 + call r10 + jge .next_line + vextracti32x4 xm0, m2, 1 + call r11 + jge .next_line + vextracti32x4 xm0, m2, 2 + call r12 + jge .next_line + vextracti32x4 xm0, m2, 3 + call r13 + jl .loop_x +.next_line: + add rpq, strideq + dec hd + jg .loop_y + RET +.write1: + vmovdqu8 [rpq+xq]{k2}, xm0 + add xq, 5*1 + ret +.write2: + pshufb xm0, xm8 + vmovdqu16 [rpq+xq]{k2}, xm0 + add xq, 5*2 + ret +.write4: + vpermb ym0, ym8, ym0 + vmovdqu32 [rpq+xq]{k2}, ym0 + add xq, 5*4 + ret +.write8: + vpermb m0, m8, m0 + vmovdqu64 [rpq+xq]{k2}, m0 + add xq, 5*8 + ret +.write16: + vpermb m1, m8, m0 + movu [rpq+xq+ 0], m1 + pshufb xm0, xm9 + movu [rpq+xq+64], xm0 + add xq, 5*16 + ret + INIT_ZMM avx512icl cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4 vbroadcasti32x4 m0, [aq] diff --git a/prog/3rdPartyLibs/codecs/dav1d/src/x86/refmvs.h b/prog/3rdPartyLibs/codecs/dav1d/src/x86/refmvs.h index de4124c43..c9978561e 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/src/x86/refmvs.h +++ b/prog/3rdPartyLibs/codecs/dav1d/src/x86/refmvs.h @@ -28,6 +28,12 @@ #include "src/cpu.h" #include "src/refmvs.h" +decl_load_tmvs_fn(dav1d_load_tmvs_sse4); + +decl_save_tmvs_fn(dav1d_save_tmvs_ssse3); +decl_save_tmvs_fn(dav1d_save_tmvs_avx2); +decl_save_tmvs_fn(dav1d_save_tmvs_avx512icl); + decl_splat_mv_fn(dav1d_splat_mv_sse2); decl_splat_mv_fn(dav1d_splat_mv_avx2); decl_splat_mv_fn(dav1d_splat_mv_avx512icl); @@ -39,13 +45,22 @@ static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) { c->splat_mv = dav1d_splat_mv_sse2; + if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; + + c->save_tmvs = dav1d_save_tmvs_ssse3; + + if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return; #if ARCH_X86_64 + c->load_tmvs = dav1d_load_tmvs_sse4; + if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; + c->save_tmvs = dav1d_save_tmvs_avx2; c->splat_mv = dav1d_splat_mv_avx2; if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + c->save_tmvs = dav1d_save_tmvs_avx512icl; c->splat_mv = dav1d_splat_mv_avx512icl; #endif 
} diff --git a/prog/3rdPartyLibs/codecs/dav1d/tools/dav1d.c b/prog/3rdPartyLibs/codecs/dav1d/tools/dav1d.c index 67937b433..897b7c01f 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/tools/dav1d.c +++ b/prog/3rdPartyLibs/codecs/dav1d/tools/dav1d.c @@ -195,11 +195,18 @@ int main(const int argc, char *const *const argv) { uint64_t nspf, tfirst, elapsed; double i_fps; FILE *frametimes = NULL; - const char *version = dav1d_version(); - - if (strcmp(version, DAV1D_VERSION)) { - fprintf(stderr, "Version mismatch (library: %s, executable: %s)\n", - version, DAV1D_VERSION); + const unsigned version = dav1d_version_api(); + const int major = DAV1D_API_MAJOR(version); + const int minor = DAV1D_API_MINOR(version); + const int patch = DAV1D_API_PATCH(version); + + if (DAV1D_API_VERSION_MAJOR != major || + DAV1D_API_VERSION_MINOR > minor) { + fprintf(stderr, "Version mismatch (library: %d.%d.%d, executable: %d.%d.%d)\n", + major, minor, patch, + DAV1D_API_VERSION_MAJOR, + DAV1D_API_VERSION_MINOR, + DAV1D_API_VERSION_PATCH); return EXIT_FAILURE; } diff --git a/prog/3rdPartyLibs/codecs/dav1d/tools/dav1d_cli_parse.c b/prog/3rdPartyLibs/codecs/dav1d/tools/dav1d_cli_parse.c index 3f68c4ccc..4d747c032 100644 --- a/prog/3rdPartyLibs/codecs/dav1d/tools/dav1d_cli_parse.c +++ b/prog/3rdPartyLibs/codecs/dav1d/tools/dav1d_cli_parse.c @@ -62,6 +62,7 @@ enum { ARG_NEG_STRIDE, ARG_OUTPUT_INVISIBLE, ARG_INLOOP_FILTERS, + ARG_DECODE_FRAME_TYPE, }; static const struct option long_opts[] = { @@ -88,6 +89,7 @@ static const struct option long_opts[] = { { "negstride", 0, NULL, ARG_NEG_STRIDE }, { "outputinvisible", 1, NULL, ARG_OUTPUT_INVISIBLE }, { "inloopfilters", 1, NULL, ARG_INLOOP_FILTERS }, + { "decodeframetype", 1, NULL, ARG_DECODE_FRAME_TYPE }, { NULL, 0, NULL, 0 }, }; @@ -145,7 +147,9 @@ static void usage(const char *const app, const char *const reason, ...) 
{ " --negstride: use negative picture strides\n" " this is mostly meant as a developer option\n" " --outputinvisible $num: whether to output invisible (alt-ref) frames (default: 0)\n" - " --inloopfilters $str: which in-loop filters to enable (none, (no)deblock, (no)cdef, (no)restoration or all; default: all)\n"); + " --inloopfilters $str: which in-loop filters to enable (none, (no)deblock, (no)cdef, (no)restoration or all; default: all)\n" + " --decodeframetype $str: which frame types to decode (reference, intra, key or all; default: all)\n" + ); exit(1); } @@ -233,7 +237,13 @@ static const EnumParseTable inloop_filters_tbl[] = { { "restoration", DAV1D_INLOOPFILTER_RESTORATION }, { "norestoration", DAV1D_INLOOPFILTER_ALL - DAV1D_INLOOPFILTER_RESTORATION }, { "all", DAV1D_INLOOPFILTER_ALL }, - { 0 }, +}; + +static const EnumParseTable decode_frame_type_tbl[] = { + { "all", DAV1D_DECODEFRAMETYPE_ALL }, + { "reference", DAV1D_DECODEFRAMETYPE_REFERENCE }, + { "intra", DAV1D_DECODEFRAMETYPE_INTRA }, + { "key", DAV1D_DECODEFRAMETYPE_KEY }, }; #define ARRAY_SIZE(n) (sizeof(n)/sizeof(*(n))) @@ -382,6 +392,11 @@ void parse(const int argc, char *const *const argv, parse_enum(optarg, inloop_filters_tbl, ARRAY_SIZE(inloop_filters_tbl),ARG_INLOOP_FILTERS, argv[0]); break; + case ARG_DECODE_FRAME_TYPE: + lib_settings->decode_frame_type = + parse_enum(optarg, decode_frame_type_tbl, + ARRAY_SIZE(decode_frame_type_tbl), ARG_DECODE_FRAME_TYPE, argv[0]); + break; default: usage(argv[0], NULL); } diff --git a/prog/3rdPartyLibs/lottie/jamfile b/prog/3rdPartyLibs/lottie/jamfile index 8142a0ceb..276ae6e3f 100644 --- a/prog/3rdPartyLibs/lottie/jamfile +++ b/prog/3rdPartyLibs/lottie/jamfile @@ -7,6 +7,7 @@ Target = 3rdPartyLibs/lottie.lib ; include $(Root)/prog/_jBuild/defaults.jam ; _INCLUDE = ; +LOTTIE_USE_INTERNAL_STBI ?= yes ; AddIncludes = $(Root)/$(Location)/inc @@ -46,9 +47,12 @@ Sources = src/vector/freetype/v_ft_stroker.cpp src/vector/freetype/v_ft_raster.cpp 
src/vector/freetype/v_ft_math.cpp - src/vector/stb/stb_image.cpp ; +if $(LOTTIE_USE_INTERNAL_STBI) = yes { + Sources += src/vector/stb/stb_image.cpp ; +} + if $(Platform) in win32 win64 linux64 xboxOne scarlett ps4 ps5 || ( $(Platform) = macosx && $(MacOSXArch) = x86_64 ) { Sources += src/vector/vdrawhelper_sse2.cpp ; } diff --git a/prog/commonFx/commonFxGame/modfx/modfx_part_data.hlsli b/prog/commonFx/commonFxGame/modfx/modfx_part_data.hlsli index 182b4860a..d6bc41822 100644 --- a/prog/commonFx/commonFxGame/modfx/modfx_part_data.hlsli +++ b/prog/commonFx/commonFxGame/modfx/modfx_part_data.hlsli @@ -75,7 +75,7 @@ void modfx_load_ren_data(BufferData_cref buf, uint ofs, uint decls, DAFX_OREF(Mo o.radius = dafx_load_1f(buf, ofs); if (MODFX_RDECL_COLOR_ENABLED(decls)) - o.color = unpack_uint_to_n4f(dafx_load_1ui(buf, ofs)); + o.color = pow2(unpack_uint_to_n4f(dafx_load_1ui(buf, ofs))); // gamma correction applied if (MODFX_RDECL_ANGLE_ENABLED(decls)) o.angle = dafx_load_1f(buf, ofs); @@ -129,7 +129,7 @@ void modfx_save_ren_data(BufferData_ref buf, uint ofs, uint decls, uint sflags, dafx_store_1f(v.radius, buf, ofs); if (MODFX_RDECL_COLOR_ENABLED(decls)) - dafx_store_1ui(pack_n4f_to_uint(saturate(v.color)), buf, ofs); + dafx_store_1ui(pack_n4f_to_uint(sqrt(saturate(v.color))), buf, ofs); // gamma correction applied if (MODFX_RDECL_ANGLE_ENABLED(decls)) dafx_store_1f(v.angle, buf, ofs); diff --git a/prog/commonFx/commonFxGame/modfx/modfx_velocity.hlsli b/prog/commonFx/commonFxGame/modfx/modfx_velocity.hlsli index 3ae5e7d2b..95d52e264 100644 --- a/prog/commonFx/commonFxGame/modfx/modfx_velocity.hlsli +++ b/prog/commonFx/commonFxGame/modfx/modfx_velocity.hlsli @@ -21,6 +21,10 @@ bool modfx_scene_collision_sample( float3 wpos, GlobalData_cref gdata, float4 spos = mul( float4( wpos, 1 ), gdata.globtm ); spos.xyz /= spos.w; + o_tci = 0; + o_proj_depth = 0; + o_scene_depth = 0; + o_stc = float2( spos.xy * float2( 0.5, -0.5 ) + float2( 0.5, 0.5 ) ); if ( o_stc.x < 0 || o_stc.y 
< 0 || o_stc.x >= 1.f || o_stc.y >= 1 ) return false; diff --git a/prog/commonFx/commonFxGame/modfx_bboard_render.sh b/prog/commonFx/commonFxGame/modfx_bboard_render.sh index 86856acf4..c8838e34e 100644 --- a/prog/commonFx/commonFxGame/modfx_bboard_render.sh +++ b/prog/commonFx/commonFxGame/modfx_bboard_render.sh @@ -312,6 +312,7 @@ shader dafx_modfx_bboard_render, dafx_modfx_bboard_render_atest, dafx_modfx_bboa #undef MODFX_USE_GI #undef MODFX_USE_PACK_HDR #undef MODFX_USE_FOG + #undef MODFX_USE_DEPTH_MASK } } diff --git a/prog/commonFx/commonFxGame/params.gen.nut b/prog/commonFx/commonFxGame/params.gen.nut index df9f9c7be..7118fe013 100644 --- a/prog/commonFx/commonFxGame/params.gen.nut +++ b/prog/commonFx/commonFxGame/params.gen.nut @@ -83,7 +83,7 @@ let class BaseParam { -let class InvalidParam extends BaseParam { +let class InvalidParam (BaseParam) { typeName=null constructor(name, type) @@ -107,7 +107,7 @@ let class InvalidParam extends BaseParam { -let class RefSlotParam extends BaseParam { +let class RefSlotParam (BaseParam) { slotType="Unknown" constructor(name, decl) { @@ -145,7 +145,7 @@ glob_types.ref_slot <- RefSlotParam(null, { slotType="Unknown" }); -let class CubicCurveParam extends BaseParam { +let class CubicCurveParam (BaseParam) { color = Color3(255, 255, 0) constructor(name, decl) { @@ -174,7 +174,7 @@ let class CubicCurveParam extends BaseParam { glob_types.cubic_curve <- CubicCurveParam(null, {}) -let class GradientBoxParam extends BaseParam { +let class GradientBoxParam (BaseParam) { constructor(name, decl) { this.paramName=name; } @@ -200,7 +200,7 @@ glob_types.gradient_box <- GradientBoxParam(null, {}) -let class SimpleTypeParam extends BaseParam { +let class SimpleTypeParam (BaseParam) { typeName = null function generateDeclText(text) { @@ -222,7 +222,7 @@ let class SimpleTypeParam extends BaseParam { -let class E3dcolorParam extends SimpleTypeParam { +let class E3dcolorParam (SimpleTypeParam) { defVal = Color3(255, 255, 255) 
constructor(name, decl) { @@ -244,7 +244,7 @@ let class E3dcolorParam extends SimpleTypeParam { glob_types.E3DCOLOR <- E3dcolorParam(null, {}) -let class IntParam extends SimpleTypeParam { +let class IntParam (SimpleTypeParam) { defVal = 0 constructor(name, decl) { @@ -267,7 +267,7 @@ glob_types.int <- IntParam(null, {}) -let class RealParam extends SimpleTypeParam { +let class RealParam (SimpleTypeParam) { defVal = 0.0 constructor(name, decl) { @@ -290,7 +290,7 @@ glob_types.real <- RealParam(null, {}) -let class BoolParam extends SimpleTypeParam { +let class BoolParam (SimpleTypeParam) { defVal = false; constructor(name, decl) { @@ -315,7 +315,7 @@ let class BoolParam extends SimpleTypeParam { glob_types.bool <- BoolParam(null, {}) -let class Point2Param extends SimpleTypeParam { +let class Point2Param (SimpleTypeParam) { defVal = Point2(0, 0) constructor(name, decl) { @@ -338,7 +338,7 @@ let class Point2Param extends SimpleTypeParam { glob_types.Point2 <- Point2Param(null, {}) -let class Point3Param extends SimpleTypeParam { +let class Point3Param (SimpleTypeParam) { defVal = Point3(0, 0, 0) constructor(name, decl) { @@ -361,7 +361,7 @@ let class Point3Param extends SimpleTypeParam { glob_types.Point3 <- Point3Param(null, {}) -let class TypeRefParam extends BaseParam { +let class TypeRefParam (BaseParam) { typeRef = null constructor(name, type) { @@ -383,7 +383,7 @@ let class TypeRefParam extends BaseParam { } -let class DynArrayParam extends BaseParam { +let class DynArrayParam (BaseParam) { typeRef = null memberToShowInCaption = null @@ -435,7 +435,7 @@ glob_types.dyn_array <- DynArrayParam(null, {elemType=null}) -let class EnumParam extends BaseParam { +let class EnumParam (BaseParam) { entries=null constructor(name, decl) { @@ -489,7 +489,7 @@ glob_types.list <- EnumParam(null, {list=[]}) -let class ExternStruct extends BaseParam { +let class ExternStruct (BaseParam) { constructor(name) { this.paramName=name } @@ -501,7 +501,7 @@ let class ExternStruct 
extends BaseParam { -let class ParamStruct extends BaseParam { +let class ParamStruct (BaseParam) { members = null version = 0 @@ -624,8 +624,8 @@ local module_name = null let function include_decl_h(name) { local fname = name.slice(0,1).tolower()+name.slice(1) if (name!=module_name) - glob_decl_text.append("#include <"+fname+"_decl.h>\n") - glob_tools_text.append("#include <"+fname+"_decl.h>\n") + glob_decl_text.append($"#include <{fname}_decl.h>\n") + glob_tools_text.append($"#include <{fname}_decl.h>\n") } @@ -636,8 +636,8 @@ let function begin_declare_params(name) { let function end_module() { - write_declarations(module_name+"_decl.h") - write_tools_code("../commonFxTools/"+module_name+"_tools.cpp") + write_declarations($"{module_name}_decl.h") + write_tools_code($"../commonFxTools/{module_name}_tools.cpp") module_name=null } diff --git a/prog/dagorInclude/3d/dag_drv3d.h b/prog/dagorInclude/3d/dag_drv3d.h index c7317d0f9..48105b6bb 100644 --- a/prog/dagorInclude/3d/dag_drv3d.h +++ b/prog/dagorInclude/3d/dag_drv3d.h @@ -857,14 +857,6 @@ float get_display_scale(); // Override profile SLI settings, must be called before device creation. void disable_sli(); -inline void validate_sbuffer_flags(unsigned flags, const char *name) -{ - if (!((flags & SBCF_BIND_CONSTANT) != 0 || !(flags & SBCF_DYNAMIC) || (flags & SBCF_MAYBELOST) != 0)) - logerr("Buffer \"%s\" was created with SBCF_DYNAMIC flag SBCF_MAYBELOST flag is missed!", name); - if (!(flags & (SBCF_BIND_VERTEX | SBCF_BIND_INDEX | SBCF_MAYBELOST))) - logerr("Buffer \"%s\" was created without SBCF_MAYBELOST flag. 
It is currently allowed only for vertex and index buffers.", name); -} - static constexpr int RENDER_TO_WHOLE_ARRAY = 1023; #if !_TARGET_D3D_MULTI // Driver initialization API @@ -952,6 +944,9 @@ bool should_use_compute_for_image_processing(std::initializer_list for /// returns false if texture of the specified format can't be created bool check_texformat(int cflg); +/// returns the maximum sample count for the given texture format +int get_max_sample_count(int cflg); + unsigned get_texformat_usage(int cflg, int restype = RES3D_TEX); /// check whether specified texture creation flags result in the same format bool issame_texformat(int cflg1, int cflg2); @@ -1119,7 +1114,7 @@ PROGRAM create_program(VPROG vprog, FSHADER fsh, VDECL vdecl, unsigned *strides PROGRAM create_program(const uint32_t *vpr_native, const uint32_t *fsh_native, VDECL vdecl, unsigned *strides = 0, unsigned streams = 0); // if strides & streams are unset, will get them from VDECL -PROGRAM create_program_cs(const uint32_t *cs_native); +PROGRAM create_program_cs(const uint32_t *cs_native, CSPreloaded preloaded); bool set_program(PROGRAM); // sets both pixel and vertex shader and vertex declaration void delete_program(PROGRAM); // deletes vprog and fshader. 
VDECL should be deleted independently diff --git a/prog/dagorInclude/3d/dag_drv3dCmd.h b/prog/dagorInclude/3d/dag_drv3dCmd.h index e47da8438..a5de24f49 100644 --- a/prog/dagorInclude/3d/dag_drv3dCmd.h +++ b/prog/dagorInclude/3d/dag_drv3dCmd.h @@ -393,6 +393,10 @@ enum // par1: CompilePipelineSet* DRV3D_COMMAND_COMPILE_PIPELINE_SET, + // par1: Sbuffer* + // par2: uint64_t* + DRV3D_COMMAND_GET_BUFFER_GPU_ADDRESS, + DRV3D_COMMAND_USER = 1000, }; diff --git a/prog/dagorInclude/3d/dag_drv3dConsts.h b/prog/dagorInclude/3d/dag_drv3dConsts.h index e85f65f03..f56606462 100644 --- a/prog/dagorInclude/3d/dag_drv3dConsts.h +++ b/prog/dagorInclude/3d/dag_drv3dConsts.h @@ -111,10 +111,9 @@ enum enum { - SBCF_DYNAMIC = 0x0004, // Create dynamic buffer - SBCF_MAYBELOST = 0x0008, // Buffer contents may be safely lost - SBCF_ZEROMEM = 0x0010, // Make sure driver has cleared the buffer (PS4, PS5) - SBCF_INDEX32 = 0x0020, // Use 32-bit indices + SBCF_DYNAMIC = 0x0004, // Create dynamic buffer + SBCF_ZEROMEM = 0x0010, // Make sure driver has cleared the buffer (PS4, PS5) + SBCF_INDEX32 = 0x0020, // Use 32-bit indices // Uses fast discard codepath, allowing to more efficiently utilize mem management // there's few limitation - it can only be locked with discard flag, there has to be cpu write access flag @@ -156,33 +155,33 @@ enum // Buffer flag sets. // Const buffers // Such buffers could be updated from time to time. - SBCF_CB_PERSISTENT = SBCF_BIND_CONSTANT | SBCF_DYNAMIC | SBCF_MAYBELOST, + SBCF_CB_PERSISTENT = SBCF_BIND_CONSTANT | SBCF_DYNAMIC, // Such buffers must be updated every frame. Because of that we don't care about its content on device reset. - SBCF_CB_ONE_FRAME = SBCF_BIND_CONSTANT | SBCF_DYNAMIC | SBCF_MAYBELOST | SBCF_FRAMEMEM, + SBCF_CB_ONE_FRAME = SBCF_BIND_CONSTANT | SBCF_DYNAMIC | SBCF_FRAMEMEM, // UAV buffers // (RW)ByteAddressBuffer in shader. 
- SBCF_UA_SR_BYTE_ADDRESS = SBCF_BIND_UNORDERED | SBCF_MAYBELOST | SBCF_MISC_ALLOW_RAW | SBCF_BIND_SHADER_RES, + SBCF_UA_SR_BYTE_ADDRESS = SBCF_BIND_UNORDERED | SBCF_MISC_ALLOW_RAW | SBCF_BIND_SHADER_RES, // (RW)StructuredBuffer in shader. - SBCF_UA_SR_STRUCTURED = SBCF_BIND_UNORDERED | SBCF_MAYBELOST | SBCF_MISC_STRUCTURED | SBCF_BIND_SHADER_RES, + SBCF_UA_SR_STRUCTURED = SBCF_BIND_UNORDERED | SBCF_MISC_STRUCTURED | SBCF_BIND_SHADER_RES, // RWByteAddressBuffer in shader. - SBCF_UA_BYTE_ADDRESS = SBCF_BIND_UNORDERED | SBCF_MAYBELOST | SBCF_MISC_ALLOW_RAW, + SBCF_UA_BYTE_ADDRESS = SBCF_BIND_UNORDERED | SBCF_MISC_ALLOW_RAW, // RWStructuredBuffer in shader. - SBCF_UA_STRUCTURED = SBCF_BIND_UNORDERED | SBCF_MAYBELOST | SBCF_MISC_STRUCTURED, + SBCF_UA_STRUCTURED = SBCF_BIND_UNORDERED | SBCF_MISC_STRUCTURED, // The same as SBCF_UA_BYTE_ADDRESS but its content can be read on CPU - SBCF_UA_BYTE_ADDRESS_READBACK = SBCF_BIND_UNORDERED | SBCF_MAYBELOST | SBCF_MISC_ALLOW_RAW | SBCF_USAGE_READ_BACK, + SBCF_UA_BYTE_ADDRESS_READBACK = SBCF_BIND_UNORDERED | SBCF_MISC_ALLOW_RAW | SBCF_USAGE_READ_BACK, // The same as SBCF_UA_STRUCTURED but its content can be read on CPU - SBCF_UA_STRUCTURED_READBACK = SBCF_BIND_UNORDERED | SBCF_MAYBELOST | SBCF_MISC_STRUCTURED | SBCF_USAGE_READ_BACK, + SBCF_UA_STRUCTURED_READBACK = SBCF_BIND_UNORDERED | SBCF_MISC_STRUCTURED | SBCF_USAGE_READ_BACK, // Indirect buffer filled on GPU - SBCF_UA_INDIRECT = SBCF_BIND_UNORDERED | SBCF_MAYBELOST | SBCF_MISC_ALLOW_RAW | SBCF_MISC_DRAWINDIRECT, + SBCF_UA_INDIRECT = SBCF_BIND_UNORDERED | SBCF_MISC_ALLOW_RAW | SBCF_MISC_DRAWINDIRECT, // GPU RO buffers // Indirect buffer filled on CPU - SBCF_INDIRECT = SBCF_MAYBELOST | SBCF_MISC_DRAWINDIRECT, + SBCF_INDIRECT = SBCF_MISC_DRAWINDIRECT, // Staging buffer // A buffer for data transfer on GPU - SBCF_STAGING_BUFFER = SBCF_CPU_ACCESS_READ | SBCF_CPU_ACCESS_WRITE | SBCF_MAYBELOST + SBCF_STAGING_BUFFER = SBCF_CPU_ACCESS_READ | SBCF_CPU_ACCESS_WRITE }; //--- Render states 
--------- @@ -298,6 +297,12 @@ enum class HdrOutputMode // corresponding values in hdr_ps_output.sh HDR_ONLY = 3 }; +enum class CSPreloaded +{ + No, + Yes +}; + /** * A boolean bitfield that describes which optional features that are available with the used device / driver combination. @@ -762,7 +767,7 @@ struct DeviceDriverCapabilitiesXboxOne : DeviceDriverCapabilitiesBase //! \briefconstcap{true, DeviceDriverCapabilitiesBase::hasCompareSampler} static constexpr bool hasCompareSampler = true; //! \briefconstcap{false, DeviceDriverCapabilitiesBase::hasShaderFloat16Support} - static constexpr bool hasShaderFloat16Support = false; + static constexpr bool hasShaderFloat16Support = true; }; /** * \brief Optimized capabilities structure, hiding bitfield entries with static const values of known platform features for \scarlett @@ -795,8 +800,6 @@ struct DeviceDriverCapabilitiesScarlett : DeviceDriverCapabilitiesXboxOne //! \warning Documentation is contradicting it self about proper support of indirect dispatch with pipelines that use amplification //! shaders. static constexpr bool hasMeshShader = true; - //! \briefconstcap{false, DeviceDriverCapabilitiesBase::hasShaderFloat16Support} - static constexpr bool hasShaderFloat16Support = false; }; /** * \brief Optimized capabilities structure, hiding bitfield entries with static const values of known platform features for diff --git a/prog/dagorInclude/3d/dag_drv3d_buffers.h b/prog/dagorInclude/3d/dag_drv3d_buffers.h index 4cb1915a9..d53ecc6a8 100644 --- a/prog/dagorInclude/3d/dag_drv3d_buffers.h +++ b/prog/dagorInclude/3d/dag_drv3d_buffers.h @@ -117,10 +117,9 @@ inline Sbuffer *create_persistent_cb(uint32_t registers_count, const char *name) * \param buffer_init The initialization option for the buffer. * \return A pointer to the created buffer. 
*/ -inline Sbuffer *create_one_frame_cb(uint32_t registers_count, const char *name, Init buffer_init = Init::No) +inline Sbuffer *create_one_frame_cb(uint32_t registers_count, const char *name) { - return d3d::create_sbuffer(CBUFFER_REGISTER_SIZE, registers_count, - SBCF_CB_ONE_FRAME | (buffer_init == Init::Zero ? SBCF_ZEROMEM : 0), 0, name); + return d3d::create_sbuffer(CBUFFER_REGISTER_SIZE, registers_count, SBCF_CB_ONE_FRAME, 0, name); } /*! @@ -304,7 +303,7 @@ inline Sbuffer *create_staging(uint32_t size_in_bytes, const char *name) inline Sbuffer *create_persistent_sr_tbuf(uint32_t elements_count, uint32_t format, const char *name, Init buffer_init = Init::No) { return d3d::create_sbuffer(get_tex_format_desc(format).bytesPerElement, elements_count, - SBCF_BIND_SHADER_RES | SBCF_CPU_ACCESS_WRITE | SBCF_MAYBELOST | (buffer_init == Init::Zero ? SBCF_ZEROMEM : 0), format, name); + SBCF_BIND_SHADER_RES | SBCF_CPU_ACCESS_WRITE | (buffer_init == Init::Zero ? SBCF_ZEROMEM : 0), format, name); } @@ -322,9 +321,7 @@ inline Sbuffer *create_persistent_sr_tbuf(uint32_t elements_count, uint32_t form inline Sbuffer *create_persistent_sr_byte_address(uint32_t size_in_dwords, const char *name, Init buffer_init = Init::No) { return d3d::create_sbuffer(BYTE_ADDRESS_ELEMENT_SIZE, size_in_dwords, - SBCF_BIND_SHADER_RES | SBCF_CPU_ACCESS_WRITE | SBCF_MISC_ALLOW_RAW | SBCF_MAYBELOST | - (buffer_init == Init::Zero ? SBCF_ZEROMEM : 0), - 0, name); + SBCF_BIND_SHADER_RES | SBCF_CPU_ACCESS_WRITE | SBCF_MISC_ALLOW_RAW | (buffer_init == Init::Zero ? SBCF_ZEROMEM : 0), 0, name); } @@ -346,9 +343,7 @@ inline Sbuffer *create_persistent_sr_structured(uint32_t structure_size, uint32_ Init buffer_init = Init::No) { return d3d::create_sbuffer(structure_size, elements_count, - SBCF_BIND_SHADER_RES | SBCF_CPU_ACCESS_WRITE | SBCF_MISC_STRUCTURED | SBCF_MAYBELOST | - (buffer_init == Init::Zero ? 
SBCF_ZEROMEM : 0), - 0, name); + SBCF_BIND_SHADER_RES | SBCF_CPU_ACCESS_WRITE | SBCF_MISC_STRUCTURED | (buffer_init == Init::Zero ? SBCF_ZEROMEM : 0), 0, name); } @@ -370,7 +365,7 @@ inline Sbuffer *create_persistent_sr_structured(uint32_t structure_size, uint32_ inline Sbuffer *create_one_frame_sr_tbuf(uint32_t elements_count, uint32_t format, const char *name) { return d3d::create_sbuffer(get_tex_format_desc(format).bytesPerElement, elements_count, - SBCF_BIND_SHADER_RES | SBCF_CPU_ACCESS_WRITE | SBCF_DYNAMIC | SBCF_FRAMEMEM | SBCF_MAYBELOST, format, name); + SBCF_BIND_SHADER_RES | SBCF_CPU_ACCESS_WRITE | SBCF_DYNAMIC | SBCF_FRAMEMEM, format, name); } @@ -388,7 +383,7 @@ inline Sbuffer *create_one_frame_sr_tbuf(uint32_t elements_count, uint32_t forma inline Sbuffer *create_one_frame_sr_byte_address(uint32_t size_in_dwords, const char *name) { return d3d::create_sbuffer(BYTE_ADDRESS_ELEMENT_SIZE, size_in_dwords, - SBCF_BIND_SHADER_RES | SBCF_CPU_ACCESS_WRITE | SBCF_MISC_ALLOW_RAW | SBCF_DYNAMIC | SBCF_FRAMEMEM | SBCF_MAYBELOST, 0, name); + SBCF_BIND_SHADER_RES | SBCF_CPU_ACCESS_WRITE | SBCF_MISC_ALLOW_RAW | SBCF_DYNAMIC | SBCF_FRAMEMEM, 0, name); } @@ -409,7 +404,7 @@ inline Sbuffer *create_one_frame_sr_byte_address(uint32_t size_in_dwords, const inline Sbuffer *create_one_frame_sr_structured(uint32_t structure_size, uint32_t elements_count, const char *name) { return d3d::create_sbuffer(structure_size, elements_count, - SBCF_BIND_SHADER_RES | SBCF_CPU_ACCESS_WRITE | SBCF_MISC_STRUCTURED | SBCF_DYNAMIC | SBCF_FRAMEMEM | SBCF_MAYBELOST, 0, name); + SBCF_BIND_SHADER_RES | SBCF_CPU_ACCESS_WRITE | SBCF_MISC_STRUCTURED | SBCF_DYNAMIC | SBCF_FRAMEMEM, 0, name); } } // namespace d3d::buffers diff --git a/prog/dagorInclude/3d/dag_drv3d_multi.h b/prog/dagorInclude/3d/dag_drv3d_multi.h index 865f4def2..71d3786c5 100644 --- a/prog/dagorInclude/3d/dag_drv3d_multi.h +++ b/prog/dagorInclude/3d/dag_drv3d_multi.h @@ -54,6 +54,7 @@ static inline bool 
should_use_compute_for_image_processing(std::initializer_list } static inline bool check_texformat(int cflg) { return d3di.check_texformat(cflg); } +static inline int get_max_sample_count(int cflg) { return d3di.get_max_sample_count(cflg); } static inline unsigned get_texformat_usage(int cflg, int restype = RES3D_TEX) { return d3di.get_texformat_usage(cflg, restype); } static inline bool issame_texformat(int cflg1, int cflg2) { return d3di.issame_texformat(cflg1, cflg2); } static inline bool check_cubetexformat(int cflg) { return d3di.check_cubetexformat(cflg); } @@ -159,7 +160,10 @@ static inline PROGRAM create_program(const uint32_t *vpr_native, const uint32_t return d3di.create_program_1(vpr_native, fsh_native, vdecl, strides, streams); } -static inline PROGRAM create_program_cs(const uint32_t *cs_native) { return d3di.create_program_cs(cs_native); } +static inline PROGRAM create_program_cs(const uint32_t *cs_native, CSPreloaded preloaded) +{ + return d3di.create_program_cs(cs_native, preloaded); +} static inline bool set_program(PROGRAM p) { return d3di.set_program(p); } static inline void delete_program(PROGRAM p) { return d3di.delete_program(p); } @@ -272,20 +276,14 @@ static inline bool set_cb0_data(unsigned stage, const float *data, unsigned num_ } static inline void release_cb0_data(unsigned /*stage*/) {} -static inline Sbuffer *create_vb(int sz, int f, const char *name = "") -{ - d3d::validate_sbuffer_flags(f | SBCF_BIND_VERTEX, name); - return d3di.create_vb(sz, f, name); -} +static inline Sbuffer *create_vb(int sz, int f, const char *name = "") { return d3di.create_vb(sz, f, name); } static inline Sbuffer *create_ib(int size_bytes, int flags, const char *stat_name = "ib") { - d3d::validate_sbuffer_flags(flags | SBCF_BIND_INDEX, stat_name); return d3di.create_ib(size_bytes, flags, stat_name); } static inline Sbuffer *create_sbuffer(int struct_size, int elements, unsigned flags, unsigned texfmt, const char *name = "") { - 
d3d::validate_sbuffer_flags(flags, name); return d3di.create_sbuffer(struct_size, elements, flags, texfmt, name); } diff --git a/prog/dagorInclude/3d/dag_drv3di.h b/prog/dagorInclude/3d/dag_drv3di.h index 99eb5bcca..4a2c152de 100644 --- a/prog/dagorInclude/3d/dag_drv3di.h +++ b/prog/dagorInclude/3d/dag_drv3di.h @@ -40,6 +40,7 @@ struct D3dInterfaceTable unsigned (*get_texformat_usage)(int cflg, int restype); bool (*check_texformat)(int cflg); + int (*get_max_sample_count)(int cflg); bool (*issame_texformat)(int cflg1, int cflg2); bool (*check_cubetexformat)(int cflg); bool (*issame_cubetexformat)(int cflg1, int cflg2); @@ -77,7 +78,7 @@ struct D3dInterfaceTable PROGRAM (*create_program_0)(VPROG, FSHADER, VDECL, unsigned *, unsigned); PROGRAM (*create_program_1)(const uint32_t *, const uint32_t *, VDECL, unsigned *, unsigned); - PROGRAM (*create_program_cs)(const uint32_t *cs_native); + PROGRAM (*create_program_cs)(const uint32_t *cs_native, CSPreloaded preloaded); bool (*set_program)(PROGRAM); void (*delete_program)(PROGRAM); diff --git a/prog/dagorInclude/3d/dag_indirectDrawcallsBuffer.h b/prog/dagorInclude/3d/dag_indirectDrawcallsBuffer.h index 03eefab68..d37147185 100644 --- a/prog/dagorInclude/3d/dag_indirectDrawcallsBuffer.h +++ b/prog/dagorInclude/3d/dag_indirectDrawcallsBuffer.h @@ -28,6 +28,8 @@ class IndirectDrawcallsBuffer static bool packDrawCallIdAsOffset() { return d3d::get_driver_code().is(d3d::ps4 || d3d::ps5 || d3d::vulkan); } + static bool useDiscardOnFill() { return d3d::get_driver_code().is(d3d::vulkan); } + static uint32_t dwordsCountPerDrawcall() { G_STATIC_ASSERT(sizeof(BaseArgsType) % sizeof(uint32_t) == 0); @@ -59,7 +61,7 @@ class IndirectDrawcallsBuffer fill_on_gpu == FillOnGPU::No ? SBCF_INDIRECT : SBCF_UA_INDIRECT, 0, bufferName.c_str()); } - const uint32_t LOCK_FLAGS = VBLOCK_WRITEONLY; + const uint32_t LOCK_FLAGS = VBLOCK_WRITEONLY | (useDiscardOnFill() ? 
VBLOCK_DISCARD : 0); if (usesPackedDrawCallID()) { if (auto data = lock_sbuffer(buffer.getBuf(), 0, drawcalls_data.size(), LOCK_FLAGS)) diff --git a/prog/dagorInclude/3d/dag_render.h b/prog/dagorInclude/3d/dag_render.h index ba4bcd673..2c3e14de2 100644 --- a/prog/dagorInclude/3d/dag_render.h +++ b/prog/dagorInclude/3d/dag_render.h @@ -5,28 +5,16 @@ // #pragma once -#include -#include - #include -struct DagorCurView -{ - TMatrix tm, itm; - Point3 pos; // current scene view position -}; - // global render states extern bool grs_draw_wire; -extern DagorCurView grs_cur_view; - - // compute and set gamma correction -void set_gamma(real p); -void set_gamma_shadervar(real p); +void set_gamma(float p); +void set_gamma_shadervar(float p); // returns current gamma -real get_current_gamma(); +float get_current_gamma(); #include diff --git a/prog/dagorInclude/3d/dag_resPtr.h b/prog/dagorInclude/3d/dag_resPtr.h index c45c8984e..fecc1293a 100644 --- a/prog/dagorInclude/3d/dag_resPtr.h +++ b/prog/dagorInclude/3d/dag_resPtr.h @@ -597,9 +597,9 @@ inline ResPtr create_persistent_cb(uint32_t registers_count, const char return resptr_detail::ResPtrFactory(d3d::buffers::create_persistent_cb(registers_count, name)); } // Such buffers must be updated every frame. Because of that we don't care about its content on device reset. -inline ResPtr create_one_frame_cb(uint32_t registers_count, const char *name, Init buffer_init = Init::No) +inline ResPtr create_one_frame_cb(uint32_t registers_count, const char *name) { - return resptr_detail::ResPtrFactory(d3d::buffers::create_one_frame_cb(registers_count, name, buffer_init)); + return resptr_detail::ResPtrFactory(d3d::buffers::create_one_frame_cb(registers_count, name)); } // (RW)ByteAddressBuffer in shader. 
diff --git a/prog/dagorInclude/3d/dag_ringDynBuf.h b/prog/dagorInclude/3d/dag_ringDynBuf.h index f7e53296e..534ea62e0 100644 --- a/prog/dagorInclude/3d/dag_ringDynBuf.h +++ b/prog/dagorInclude/3d/dag_ringDynBuf.h @@ -104,7 +104,7 @@ class RingDynamicVB : public RingDynamicBuffer void init(int v_count, int v_stride, const char *stat_name = __FILE__) { close(); - buf = d3d::create_vb(v_count * v_stride, SBCF_MAYBELOST | SBCF_DYNAMIC, stat_name); + buf = d3d::create_vb(v_count * v_stride, SBCF_DYNAMIC, stat_name); d3d_err(buf); stride = v_stride; size = v_count; @@ -119,7 +119,6 @@ class RingDynamicSB : public RingDynamicBuffer { close(); G_ASSERT(v_stride % elem_size == 0); - flags |= SBCF_MAYBELOST; Sbuffer *stagingBuf = 0; if (d3d::get_driver_desc().caps.hasNoOverwriteOnShaderResourceBuffers && !(d3d::get_driver_code().is(d3d::dx11) && (flags & SBCF_MISC_DRAWINDIRECT))) @@ -127,8 +126,8 @@ class RingDynamicSB : public RingDynamicBuffer else // not optimal, since we allocate in gpu memory too much. 
todo: optimize { stagingBuf = d3d::create_sbuffer(elem_size, v_count * (v_stride / elem_size), - SBCF_DYNAMIC | SBCF_BIND_VERTEX | SBCF_CPU_ACCESS_WRITE | SBCF_MAYBELOST, 0, stat_name); // we don't need SBCF_BIND_VERTEX, but - // DX driver demands it + SBCF_DYNAMIC | SBCF_BIND_VERTEX | SBCF_CPU_ACCESS_WRITE, 0, stat_name); // we don't need SBCF_BIND_VERTEX, but + // DX driver demands it d3d_err(stagingBuf); } buf = d3d::create_sbuffer(elem_size, v_count * (v_stride / elem_size), flags, format, stat_name); @@ -174,7 +173,7 @@ class RingDynamicIB : public RingDynamicBuffer void init(int i_count) { close(); - buf = d3d::create_ib(i_count * 2, SBCF_MAYBELOST | SBCF_DYNAMIC); + buf = d3d::create_ib(i_count * 2, SBCF_DYNAMIC); d3d_err(buf); stride = 2; size = i_count; diff --git a/prog/dagorInclude/3d/dag_texFlags.h b/prog/dagorInclude/3d/dag_texFlags.h index d00e99394..50a1cdd62 100644 --- a/prog/dagorInclude/3d/dag_texFlags.h +++ b/prog/dagorInclude/3d/dag_texFlags.h @@ -5,6 +5,8 @@ // #pragma once +#include + // texture creation flags enum { @@ -43,7 +45,7 @@ enum // Uses of the texture methods updateSubRegion and update require this, TEXCF_RTARGET or // TEXCF_UNORDERED usage flag to be set. 
- TEXCF_UPDATE_DESTINATION = 0x00080000U, + TEXCF_UPDATE_DESTINATION = 0x00004000U, TEXCF_SYSTEXCOPY = 0x00000010U, // make copy in system memory @@ -52,16 +54,19 @@ enum TEXCF_DYNAMIC = 0x00000100U, // changes frequently //D3DUSAGE_CPU_CACHED_MEMORY, XALLOC_MEMPROTECT_READWRITE TEXCF_READABLE = 0x00000200U, // can only be read, D3DLOCK_READONLY TEXCF_READONLY = 0, - TEXCF_WRITEONLY = 0x00000008U, // cpu can write (TEXLOCK_WRITE) - TEXCF_LOADONCE = 0x00000400U, // texture will be loaded only once - don't use with dynamic - TEXCF_MAYBELOST = 0x00000800U, // contents of the texture may be safely lost - they will be regenerated before using it - TEXCF_STREAMING = 0x00000000U, // should be deleted shortly, obsolete - TEXCF_SYSMEM = 0x00010000U, // texture is allocated in system memory and used only as staging texture - // TEXCF_SYSMEM|TEXCF_WRITEONLY is allocated in WC memory on PS4 - TEXCF_MULTISAMPLED = 0x00020000U, // multisampled render target format - TEXCF_MSAATARGET = 0x00008000U, // multisampled render target format. should be used with TEXCF_MULTISAMPLED. 
- // Texture with TEXCF_MSAATARGET will be not auto resolved to non-msaa,target, - // so it has to be resolved with update method + TEXCF_WRITEONLY = 0x00000008U, // cpu can write (TEXLOCK_WRITE) + TEXCF_LOADONCE = 0x00000400U, // texture will be loaded only once - don't use with dynamic + TEXCF_MAYBELOST = 0x00000800U, // contents of the texture may be safely lost - they will be regenerated before using it + TEXCF_STREAMING = 0x00000000U, // should be deleted shortly, obsolete + TEXCF_SYSMEM = 0x00010000U, // texture is allocated in system memory and used only as staging texture + // TEXCF_SYSMEM|TEXCF_WRITEONLY is allocated in WC memory on PS4 + TEXCF_SAMPLECOUNT_2 = 0x00020000U, // = + TEXCF_SAMPLECOUNT_4 = 0x00040000U, // | multisampled render target formats + TEXCF_SAMPLECOUNT_8 = 0x00060000U, // = + TEXCF_SAMPLECOUNT_MAX = TEXCF_SAMPLECOUNT_8, + TEXCF_SAMPLECOUNT_MASK = 0x00060000U, + TEXCF_SAMPLECOUNT_OFFSET = 17, + #if _TARGET_C1 | _TARGET_C2 @@ -70,7 +75,7 @@ enum #elif _TARGET_XBOX - TEXCF_CPU_CACHED_MEMORY = 0x00040000U, // todo: implement allocation in onion instead of garlic mem + TEXCF_CPU_CACHED_MEMORY = 0x00008000U, // todo: implement allocation in onion instead of garlic mem TEXCF_LINEAR_LAYOUT = 0x00100000U, // todo: implement without tiling TEXCF_ESRAM_ONLY = 0x00000020U, // always reside in ESRAM TEXCF_MOVABLE_ESRAM = 0x00000040U, // Create copy in DDR @@ -167,11 +172,13 @@ enum TEXFMT_FIRST_DEPTH = TEXFMT_DEPTH24, TEXFMT_LAST_DEPTH = TEXFMT_DEPTH32_S8, - // this is fake MSAA target format, to emulate forced sample count - // format has as low bit's per sample as possible, while having as much samples as possible - // typically it is 4-8 samples of R8 - // this is rtarget format, not depth format (as we hope to get R8) - TEXFMT_MSAA_MAX_SAMPLES = 0x40000000U, // unknown format, usually R8 - // TEXFMT_ =0x__000000, - TEXFMT_MASK = 0xFF000000U, + + TEXFMT_MASK = 0xFF000000U, // TEXFMT_ =0x__000000, }; + +__forceinline int get_sample_count(int 
flags) { return 1 << ((flags & TEXCF_SAMPLECOUNT_MASK) >> TEXCF_SAMPLECOUNT_OFFSET); } + +__forceinline int make_sample_count_flag(int sample_count) +{ + return (get_log2i(sample_count) << TEXCF_SAMPLECOUNT_OFFSET) & TEXCF_SAMPLECOUNT_MASK; +} \ No newline at end of file diff --git a/prog/dagorInclude/anim/dag_animPostBlendCtrl.h b/prog/dagorInclude/anim/dag_animPostBlendCtrl.h index fdaa56440..ec59bf79f 100644 --- a/prog/dagorInclude/anim/dag_animPostBlendCtrl.h +++ b/prog/dagorInclude/anim/dag_animPostBlendCtrl.h @@ -952,7 +952,8 @@ class AnimPostBlendEffFromAttachement : public AnimPostBlendCtrl }; int varId = -1; - int slotId = -1; + int namedSlotId = -1; + int varSlotId = -1; bool ignoreZeroWt = false; Tab destVarId; Tab nodes; @@ -1055,7 +1056,8 @@ class AnimPostBlendNodesFromAttachement : public AnimPostBlendCtrl }; int varId = -1; - int slotId = -1; + int namedSlotId = -1; + int varSlotId = -1; bool copyWtm = false; bool wScaleInverted = false; int wScaleVarId = -1; diff --git a/prog/dagorInclude/debug/dag_debug3d.h b/prog/dagorInclude/debug/dag_debug3d.h index da7cdf71a..c31fbb3f9 100644 --- a/prog/dagorInclude/debug/dag_debug3d.h +++ b/prog/dagorInclude/debug/dag_debug3d.h @@ -236,7 +236,7 @@ void draw_debug_tetrapod_buffered(const Point3 &c, real radius, E3DCOLOR col = E size_t frames = DEBUG3D_DEFAULT_FRAMES_TO_BUFFER); void draw_debug_tehedron_buffered(const Point3 &c, real radius, E3DCOLOR col = E3DCOLOR_MAKE(255, 255, 64, 255), size_t frames = DEBUG3D_DEFAULT_FRAMES_TO_BUFFER); -void flush_buffered_debug_lines(bool decriment_buffer_frames = true); +void flush_buffered_debug_lines(bool game_is_paused = false); void clear_buffered_debug_lines(); void draw_debug_capsule_buffered(const Point3 &p0, const Point3 &p1, real rad, E3DCOLOR col, int segs = 24, diff --git a/prog/dagorInclude/gameRes/dag_collisionResource.h b/prog/dagorInclude/gameRes/dag_collisionResource.h index 5644b0f1b..20db54974 100644 --- a/prog/dagorInclude/gameRes/dag_collisionResource.h 
+++ b/prog/dagorInclude/gameRes/dag_collisionResource.h @@ -308,7 +308,7 @@ decl_dclass_and_id(CollisionResource, DObject, CollisionGameResClassId) Point3 *out_normal, int &out_mat_id, const CollisionNodeFilter &filter, int ray_mat_id = -1, uint8_t behavior_filter = CollisionNode::TRACEABLE) const; - bool traceRay(const mat44f &tm, const Point3 &from, const Point3 &dir, float &in_out_t, Point3 *normal, int &out_mat_id, + bool traceRay(const mat44f &tm, const Point3 &from, const Point3 &dir, float &in_out_t, Point3 *out_normal, int &out_mat_id, int ray_mat_id = -1, uint8_t behavior_filter = CollisionNode::TRACEABLE) const; bool traceRay(const TMatrix &instance_tm, const GeomNodeTree *geom_node_tree, const Point3 &from, const Point3 &dir, float in_t, @@ -352,7 +352,7 @@ decl_dclass_and_id(CollisionResource, DObject, CollisionGameResClassId) VECTORCALL bool traceRayMeshNodeLocal(const CollisionNode &node, vec4f v_local_from, vec4f v_local_dir, float &in_out_t, vec4f *out_norm) const; - VECTORCALL bool rayHit(const mat44f &tm, vec3f v_from, vec3f v_dir, float t, int &out_mat_id) const; + VECTORCALL bool rayHit(const mat44f &tm, const Point3 &from, const Point3 &dir, float in_t, int ray_mat_id, int &out_mat_id) const; VECTORCALL bool rayHit(const TMatrix &instance_tm, const GeomNodeTree *geom_node_tree, const Point3 &from, const Point3 &dir, float in_t, float bsphere_scale = 1.f, const CollisionNodeMask *collision_node_mask = nullptr, int *out_mat_id = nullptr) const; diff --git a/prog/dagorInclude/generic/dag_relocatableFixedVector.h b/prog/dagorInclude/generic/dag_relocatableFixedVector.h index e1681a9c5..049c20154 100644 --- a/prog/dagorInclude/generic/dag_relocatableFixedVector.h +++ b/prog/dagorInclude/generic/dag_relocatableFixedVector.h @@ -45,9 +45,9 @@ #endif -#define CHECK_RELOCATABLE() // static_assert(dag::is_type_relocatable::value, "currently non-relocatable types are not working"); +#define CHECK_RELOCATABLE() 
static_assert(dag::is_type_relocatable::value, "currently non-relocatable types are not working"); #define CHECK_RELOCATABLE_OR_INPLACE() \ - // static_assert(!canOverflow || dag::is_type_relocatable::value, "currently non-relocatable types are not working"); + static_assert(!canOverflow || dag::is_type_relocatable::value, "currently non-relocatable types are not working"); #ifndef IF_CONSTEXPR @@ -83,6 +83,10 @@ class alignas(max_alignof()) RelocatableFixedData // base class, allo public: typedef Allocator allocator_type; typedef T value_type; + typedef T *pointer; + typedef const T *const_pointer; + typedef T &reference; + typedef const T &const_reference; typedef value_type *iterator; typedef const value_type *const_iterator; typedef eastl::reverse_iterator reverse_iterator; @@ -164,6 +168,9 @@ class alignas(max_alignof()) RelocatableFixedData // base class, allo size_type heapCapacity() const { return heap.capacity; } static size_type inplaceCapacity() { return (size_type)inplace_count; } + const allocator_type &get_allocator() const { return mAllocatorAndCount; } + allocator_type &get_allocator() { return mAllocatorAndCount; } + value_type *inplaceData() { return (value_type *)inplaceStor.data; } const value_type *inplaceData() const { return (const value_type *)inplaceStor.data; } @@ -174,13 +181,13 @@ class alignas(max_alignof()) RelocatableFixedData // base class, allo void doInsertAt(size_t at, size_t add_cnt, value_type *dst, const value_type *src) { - memmove(dst + at + add_cnt, src + at, sizeof(value_type) * (size() - at)); + memmove((void *)&dst[at + add_cnt], (const void *)&src[at], sizeof(value_type) * (size() - at)); } DAGOR_NOINLINE value_type *insertAt(size_t at, size_t add_cnt, value_type *dst, const value_type *src) { - memmove(dst, src, sizeof(value_type) * at); + memmove((void *)dst, (const void *)src, sizeof(value_type) * at); doInsertAt(at, add_cnt, dst, src); return dst; } @@ -219,6 +226,10 @@ class RelocatableFixedVector : public 
RelocatableFixedData #include -static Point3 local_solve_2bones_ik(float A, float B, const Point3 &P, const Point3 &D) +VECTORCALL VECMATH_FINLINE vec4f local_solve_2bones_ik(float A, float B, vec3f P, vec3f D) { - TMatrix Minv; - Minv.setcol(0, normalize(P)); - Minv.setcol(1, normalize(D - (D * Minv.getcol(0)) * Minv.getcol(0))); - Minv.setcol(2, Minv.getcol(0) % Minv.getcol(1)); - Minv.setcol(3, 0.f, 0.f, 0.f); - - TMatrix Mfwd = inverse(Minv); - - Point3 R = Mfwd * P; - - float c = length(R); - float d = max(0.f, min(A, (c + (A * A - B * B) / c) / 2.f)); - float e = sqrt(fabsf(A * A - d * d)); - - Point3 S = Point3(d, e, 0.f); - - return Minv * S; + mat44f Minv; + Minv.col0 = v_norm3(P); + Minv.col1 = v_norm3(v_sub(D, v_mul(v_dot3(D, Minv.col0), Minv.col0))); + Minv.col2 = v_cross3(Minv.col0, Minv.col1); + Minv.col3 = v_zero(); + + mat44f Mfwd; + v_mat44_orthonormal_inverse43(Mfwd, Minv); + + vec3f R = v_mat44_mul_vec3p(Mfwd, P); + float c = v_extract_x(v_length3_x(R)); + float d = clamp(A, 0.f, (c + (A * A - B * B) / c) / 2.f); + float e = sqrtf(fabsf(A * A - d * d)); + + vec4f S = v_make_vec4f(d, e, 0.f, 0.f); + return v_mat44_mul_vec3p(Minv, S); } static void solve_2bones_ik(mat44f &n0_wtm, mat44f &n1_wtm, mat44f &n2_wtm, const mat44f &target_tm, float length01, float length12, - const Point3 &flexion_point, float max_reach_scale = 1.0f) + vec3f flexion_point, float max_reach_scale = 1.0f) { // Set target tm to hand. - n2_wtm = target_tm; - // Solve IK in local coordinates. 
- - Point3 ikNode2; - v_stu_p3(&ikNode2.x, v_sub(n2_wtm.col3, n0_wtm.col3)); + vec3f ikNode2 = v_sub(n2_wtm.col3, n0_wtm.col3); float maxReachLen = max_reach_scale * (length01 + length12); - float targetLenSq = lengthSq(ikNode2); + float targetLenSq = v_extract_x(v_length3_sq_x(ikNode2)); if (SQR(maxReachLen) < targetLenSq) { - n2_wtm.col3 = v_add(n0_wtm.col3, v_mul(v_splats(maxReachLen / sqrt(targetLenSq)), v_sub(n2_wtm.col3, n0_wtm.col3))); - v_stu_p3(&ikNode2.x, v_sub(n2_wtm.col3, n0_wtm.col3)); + n2_wtm.col3 = v_add(n0_wtm.col3, v_mul(v_splats(maxReachLen / sqrtf(targetLenSq)), v_sub(n2_wtm.col3, n0_wtm.col3))); + ikNode2 = v_sub(n2_wtm.col3, n0_wtm.col3); } - Point3_vec4 ikNode1 = local_solve_2bones_ik(length01, length12, ikNode2, flexion_point); - ikNode1.resv = 0; - + vec3f ikNode1 = local_solve_2bones_ik(length01, length12, ikNode2, flexion_point); // Set forearm matrix. mat44f m0, m1; - - m1.col3 = v_add(n0_wtm.col3, v_ld(&ikNode1.x)); + m1.col3 = v_add(n0_wtm.col3, ikNode1); m1.col0 = v_norm3(v_sub(n2_wtm.col3, m1.col3)); m1.col1 = v_norm3(v_cross3(m1.col0, n1_wtm.col2)); m1.col2 = v_norm3(v_cross3(m1.col1, m1.col0)); n1_wtm = m1; - // Set upper arm matrix. 
m0.col3 = n0_wtm.col3; m0.col0 = v_norm3(v_sub(m1.col3, m0.col3)); diff --git a/prog/dagorInclude/math/dag_bits.h b/prog/dagorInclude/math/dag_bits.h index c33d9cba3..1c942eb01 100644 --- a/prog/dagorInclude/math/dag_bits.h +++ b/prog/dagorInclude/math/dag_bits.h @@ -13,26 +13,12 @@ extern "C" unsigned int __popcnt(unsigned int); unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask); unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask); -#if _TARGET_64BIT - unsigned char _BitScanForward64(unsigned long *_Index, unsigned long long _Mask); -#endif -#if !defined(__clang__) - extern unsigned int _lzcnt_u32(unsigned int); - extern unsigned int _tzcnt_u32(unsigned int); -#if _TARGET_64BIT - extern unsigned long long _lzcnt_u64(unsigned long long); - extern unsigned long long _tzcnt_u64(unsigned long long); -#endif -#endif #ifdef __cplusplus } #endif #pragma intrinsic(__popcnt) #pragma intrinsic(_BitScanForward) #pragma intrinsic(_BitScanReverse) -#if _TARGET_64BIT -#pragma intrinsic(_BitScanForward64) -#endif #endif #define POW2_ALIGN(v, a) (((v) + ((a)-1)) & ~((a)-1)) @@ -134,347 +120,3 @@ inline int __bit_scan_reverse(unsigned long &index, unsigned int val) { return _ #else #define HAS_BIT_SCAN_FORWARD 0 #endif - -inline unsigned __ctz_unsafe(unsigned long long value) -{ -#if defined(__clang__) || defined(__GNUC__) - return __builtin_ctzll(value); // tzcnt or rep bsf or rbit + clz -#elif _TARGET_64BIT && defined(_MSC_VER) - return _tzcnt_u64(value); // tzcnt -> rep bsf -#endif - - unsigned n = 1; - // clang-format off - if((value & 0xFFFFFFFF) == 0) { n += 32; value >>= 32; } - if((value & 0x0000FFFF) == 0) { n += 16; value >>= 16; } - if((value & 0x000000FF) == 0) { n += 8; value >>= 8; } - if((value & 0x0000000F) == 0) { n += 4; value >>= 4; } - if((value & 0x00000003) == 0) { n += 2; value >>= 2; } - // clang-format on - return (n - ((unsigned)value & 1)); -} - -inline unsigned __ctz_unsafe(long long value) { return 
__ctz_unsafe((unsigned long long)value); } - -inline unsigned __ctz_unsafe(unsigned int value) -{ -#if defined(__clang__) || defined(__GNUC__) - return __builtin_ctz(value); // tzcnt or rep bsf -#elif defined(_MSC_VER) - return _tzcnt_u32(value); // tzcnt -> rep bsf -#endif - - unsigned n = 1; - // clang-format off - if((value & 0x0000FFFF) == 0) { n += 16; value >>= 16; } - if((value & 0x000000FF) == 0) { n += 8; value >>= 8; } - if((value & 0x0000000F) == 0) { n += 4; value >>= 4; } - if((value & 0x00000003) == 0) { n += 2; value >>= 2; } - // clang-format on - return (n - ((unsigned)value & 1)); -} - -inline unsigned __ctz_unsafe(int value) { return __ctz_unsafe((unsigned int)value); } - -inline unsigned __ctz_unsafe(unsigned long value) -{ - if constexpr (sizeof(value) == 8) - return __ctz_unsafe((unsigned long long)value); - else - return __ctz_unsafe((unsigned int)value); -} - -inline unsigned __ctz_unsafe(long value) { return __ctz_unsafe((unsigned long)value); } - -inline unsigned __ctz(unsigned long long value) -{ -#if defined(__arm64__) || defined(__ARM_ARCH) // Apple silicon or ARMv7 and above - return __builtin_ctzll(value); // rbit + clz -#else -#if _TARGET_64BIT -#if defined(__clang__) - return __builtin_ia32_tzcnt_u64(value); // bsf or tzct if -mbmi -#elif defined(__GNUC__) -#ifdef __BMI__ - return __builtin_ctzll(value); // tzcnt -#else - return value ? __builtin_ctzll(value) : 64; // bsf -#endif -#elif defined(_MSC_VER) -#if defined(__AVX2__) - return _tzcnt_u64(value); // tzcnt -#else - unsigned long r; - _BitScanForward64(&r, value); // bsf - return value ? 
r : 64; -#endif -#endif -#endif -#endif - - if (value) - { - unsigned n = 1; - // clang-format off - if((value & 0xFFFFFFFF) == 0) { n += 32; value >>= 32; } - if((value & 0x0000FFFF) == 0) { n += 16; value >>= 16; } - if((value & 0x000000FF) == 0) { n += 8; value >>= 8; } - if((value & 0x0000000F) == 0) { n += 4; value >>= 4; } - if((value & 0x00000003) == 0) { n += 2; value >>= 2; } - // clang-format on - return (n - ((unsigned)value & 1)); - } - - return 64; -} - -inline unsigned __ctz(long long value) { return __ctz((unsigned long long)value); } - -inline unsigned __ctz(unsigned int value) -{ -#if defined(__arm64__) || defined(__ARM_ARCH) // Apple silicon or ARMv7 and above - return __builtin_ctz(value); // rbit + clz -#else -#if _TARGET_64BIT -#if defined(__clang__) - return __builtin_ia32_tzcnt_u32(value); // bsf or tzct if -mbmi -#elif defined(__GNUC__) -#ifdef __BMI__ - return __builtin_ctz(value); // tzcnt -#else - return value ? __builtin_ctz(value) : 32; // bsf -#endif -#elif defined(_MSC_VER) -#if defined(__AVX2__) - return _tzcnt_u32(value); // tzcnt -#else - unsigned long r; - _BitScanForward(&r, value); // bsf - return value ? 
r : 32; -#endif -#endif -#endif -#endif - - if (value) - { - unsigned n = 1; - // clang-format off - if((value & 0x0000FFFF) == 0) { n += 16; value >>= 16; } - if((value & 0x000000FF) == 0) { n += 8; value >>= 8; } - if((value & 0x0000000F) == 0) { n += 4; value >>= 4; } - if((value & 0x00000003) == 0) { n += 2; value >>= 2; } - // clang-format on - return (n - ((unsigned)value & 1)); - } - - return 32; -} - -inline unsigned __ctz(int value) { return __ctz((unsigned int)value); } - -inline unsigned __ctz(unsigned long value) -{ - if constexpr (sizeof(value) == 8) - return __ctz((unsigned long long)value); - else - return __ctz((unsigned int)value); -} - -inline unsigned __ctz(long value) { return __ctz((unsigned long)value); } - -inline unsigned __clz_unsafe(unsigned long long value) -{ -#if defined(__arm64__) || defined(__aarch64__) // Apple silicon or ARMv8 - return __builtin_clzll(value); // clz -#else -#if _TARGET_64BIT -#if defined(__clang__) -#ifdef __LZCNT__ - return __builtin_ia32_lzcnt_u64(value); // lzcnt -#else - return __builtin_clzll(value); // bsr -#endif -#elif defined(__GNUC__) - return __builtin_clzll(value); // bsr or lznct if -mabm -#elif defined(_MSC_VER) -#if defined(__AVX2__) - return _lzcnt_u64(value); // lzcnt -#else - unsigned long r; - _BitScanForward64(&r, value); // bsr - return r ^ 63; -#endif -#endif -#endif -#endif - - unsigned n = 0; - // clang-format off - if(value & (0xFFFFFFFF00000000ull)) { n += 32; value >>= 32; } - if(value & 0xFFFF0000) { n += 16; value >>= 16; } - if(value & 0xFFFFFF00) { n += 8; value >>= 8; } - if(value & 0xFFFFFFF0) { n += 4; value >>= 4; } - if(value & 0xFFFFFFFC) { n += 2; value >>= 2; } - if(value & 0xFFFFFFFE) { n += 1; } - // clang-format on - - return n; -} - -inline unsigned __clz_unsafe(long long value) { return __clz_unsafe((unsigned long long)value); } - -inline unsigned __clz_unsafe(unsigned int value) -{ -#if defined(__ARM_ARCH) - return __builtin_clz(value); // clz -#else -#if 
defined(__clang__) -#ifdef __LZCNT__ - return __builtin_ia32_lzcnt_u32(value); // lzcnt -#else - return __builtin_clz(value); // bsr -#endif -#elif defined(__GNUC__) - return __builtin_clz(value); // bsr or lznct if -mabm -#elif defined(_MSC_VER) -#if defined(__AVX2__) - return _lzcnt_u32(value); // lzcnt -#else - unsigned long r; - _BitScanForward(&r, value); // bsr - return r ^ 31; -#endif -#endif -#endif - - unsigned n = 0; - // clang-format off - if(value & 0xFFFF0000) { n += 16; value >>= 16; } - if(value & 0xFFFFFF00) { n += 8; value >>= 8; } - if(value & 0xFFFFFFF0) { n += 4; value >>= 4; } - if(value & 0xFFFFFFFC) { n += 2; value >>= 2; } - if(value & 0xFFFFFFFE) { n += 1; } - // clang-format on - - return n; -} - -inline unsigned __clz_unsafe(int value) { return __clz_unsafe((unsigned int)value); } - -inline unsigned __clz_unsafe(unsigned long value) -{ - if constexpr (sizeof(value) == 8) - return __clz_unsafe((unsigned long long)value); - else - return __clz_unsafe((unsigned int)value); -} - -inline unsigned __clz_unsafe(long value) { return __clz_unsafe((unsigned long)value); } - -inline unsigned __clz(unsigned long long value) -{ -#if defined(__arm64__) || defined(__aarch64__) // Apple silicon or ARMv8 - return __builtin_clzll(value); // clz -#else -#if _TARGET_64BIT -#if defined(__clang__) -#ifdef __LZCNT__ - return __builtin_ia32_lzcnt_u64(value); // lzcnt -#else - return value ? __builtin_clzll(value) : 64; // bsr -#endif -#elif defined(__GNUC__) -#ifdef __LZCNT__ - return __builtin_clzll(value); // lzcnt -#else - return value ? __builtin_clzll(value) : 64; // bsr -#endif -#elif defined(_MSC_VER) -#if defined(__AVX2__) - return _lzcnt_u64(value); // lzcnt -#else - unsigned long r; - _BitScanForward64(&r, value); // bsr - return value ? 
r ^ 63 : 64; -#endif -#endif -#endif -#endif - - if (value) - { - unsigned n = 0; - // clang-format off - if(value & (0xFFFFFFFF00000000ull)) { n += 32; value >>= 32; } - if(value & 0xFFFF0000) { n += 16; value >>= 16; } - if(value & 0xFFFFFF00) { n += 8; value >>= 8; } - if(value & 0xFFFFFFF0) { n += 4; value >>= 4; } - if(value & 0xFFFFFFFC) { n += 2; value >>= 2; } - if(value & 0xFFFFFFFE) { n += 1; } - // clang-format on - - return n; - } - - return 64; -} - -inline unsigned __clz(long long value) { return __clz((unsigned long long)value); } - -inline unsigned __clz(unsigned int value) -{ -#if defined(__ARM_ARCH) - return __builtin_clz(value); // clz -#else -#if defined(__clang__) -#ifdef __LZCNT__ - return __builtin_ia32_lzcnt_u32(value); // lzcnt -#else - return value ? __builtin_clz(value) : 32; // bsr -#endif -#elif defined(__GNUC__) -#ifdef __LZCNT__ - return __builtin_clz(value); // lzcnt -#else - return value ? __builtin_clz(value) : 32; // bsr -#endif -#elif defined(_MSC_VER) -#if defined(__AVX2__) - return _lzcnt_u32(value); // lzcnt -#else - unsigned long r; - _BitScanForward(&r, value); // bsr - return value ? 
r ^ 31 : 32; -#endif -#endif -#endif - - if (value) - { - unsigned n = 0; - // clang-format off - if(value & 0xFFFF0000) { n += 16; value >>= 16; } - if(value & 0xFFFFFF00) { n += 8; value >>= 8; } - if(value & 0xFFFFFFF0) { n += 4; value >>= 4; } - if(value & 0xFFFFFFFC) { n += 2; value >>= 2; } - if(value & 0xFFFFFFFE) { n += 1; } - // clang-format on - - return n; - } - - return 32; -} - -inline unsigned __clz(int value) { return __clz((unsigned int)value); } - -inline unsigned __clz(unsigned long value) -{ - if constexpr (sizeof(value) == 8) - return __clz((unsigned long long)value); - else - return __clz((unsigned int)value); -} - -inline unsigned __clz(long value) { return __clz((unsigned long)value); } diff --git a/prog/dagorInclude/math/dag_intrin.h b/prog/dagorInclude/math/dag_intrin.h new file mode 100644 index 000000000..990623fea --- /dev/null +++ b/prog/dagorInclude/math/dag_intrin.h @@ -0,0 +1,390 @@ +// +// Dagor Engine 6.5 +// Copyright (C) 2023 Gaijin Games KFT. All rights reserved +// (for conditions of use see prog/license.txt) +// +#pragma once + +#if defined(_MSC_VER) && !defined(__clang__) +#include +#endif + +#if defined(__BMI__) +#include +#endif + +inline unsigned __ctz_unsafe(unsigned long long value) +{ +#if defined(__clang__) || defined(__GNUC__) + return __builtin_ctzll(value); // tzcnt or rep bsf or rbit + clz +#elif _TARGET_64BIT && defined(_MSC_VER) + return _tzcnt_u64(value); // tzcnt -> rep bsf +#endif + + unsigned n = 1; //-V779 + // clang-format off + if((value & 0xFFFFFFFF) == 0) { n += 32; value >>= 32; } + if((value & 0x0000FFFF) == 0) { n += 16; value >>= 16; } + if((value & 0x000000FF) == 0) { n += 8; value >>= 8; } + if((value & 0x0000000F) == 0) { n += 4; value >>= 4; } + if((value & 0x00000003) == 0) { n += 2; value >>= 2; } + // clang-format on + return (n - ((unsigned)value & 1)); +} + +inline unsigned __ctz_unsafe(long long value) { return __ctz_unsafe((unsigned long long)value); } + +inline unsigned 
__ctz_unsafe(unsigned int value) +{ +#if defined(__clang__) || defined(__GNUC__) + return __builtin_ctz(value); // tzcnt or rep bsf +#elif defined(_MSC_VER) + return _tzcnt_u32(value); // tzcnt -> rep bsf +#endif + + unsigned n = 1; //-V779 + // clang-format off + if((value & 0x0000FFFF) == 0) { n += 16; value >>= 16; } + if((value & 0x000000FF) == 0) { n += 8; value >>= 8; } + if((value & 0x0000000F) == 0) { n += 4; value >>= 4; } + if((value & 0x00000003) == 0) { n += 2; value >>= 2; } + // clang-format on + return (n - ((unsigned)value & 1)); +} + +inline unsigned __ctz_unsafe(int value) { return __ctz_unsafe((unsigned int)value); } + +inline unsigned __ctz_unsafe(unsigned long value) +{ + if constexpr (sizeof(value) == 8) + return __ctz_unsafe((unsigned long long)value); + else + return __ctz_unsafe((unsigned int)value); +} + +inline unsigned __ctz_unsafe(long value) { return __ctz_unsafe((unsigned long)value); } + +inline unsigned __ctz(unsigned long long value) +{ +#if defined(__arm64__) || defined(__ARM_ARCH) // Apple silicon or ARMv7 and above + return __builtin_ctzll(value); // rbit + clz +#else +#if _TARGET_64BIT +#if defined(__clang__) + return __builtin_ia32_tzcnt_u64(value); // bsf or tzct if -mbmi +#elif defined(__GNUC__) +#ifdef __BMI__ + return __builtin_ctzll(value); // tzcnt +#else + return value ? __builtin_ctzll(value) : 64; // bsf +#endif +#elif defined(_MSC_VER) +#if defined(__AVX2__) + return _tzcnt_u64(value); // tzcnt +#else + unsigned long r; + _BitScanForward64(&r, value); // bsf + return value ? 
r : 64; +#endif +#endif +#endif +#endif + + if (value) //-V779 + { + unsigned n = 1; + // clang-format off + if((value & 0xFFFFFFFF) == 0) { n += 32; value >>= 32; } + if((value & 0x0000FFFF) == 0) { n += 16; value >>= 16; } + if((value & 0x000000FF) == 0) { n += 8; value >>= 8; } + if((value & 0x0000000F) == 0) { n += 4; value >>= 4; } + if((value & 0x00000003) == 0) { n += 2; value >>= 2; } + // clang-format on + return (n - ((unsigned)value & 1)); + } + + return 64; +} + +inline unsigned __ctz(long long value) { return __ctz((unsigned long long)value); } + +inline unsigned __ctz(unsigned int value) +{ +#if defined(__arm64__) || defined(__ARM_ARCH) // Apple silicon or ARMv7 and above + return __builtin_ctz(value); // rbit + clz +#else +#if _TARGET_64BIT +#if defined(__clang__) + return __builtin_ia32_tzcnt_u32(value); // bsf or tzct if -mbmi +#elif defined(__GNUC__) +#ifdef __BMI__ + return __builtin_ctz(value); // tzcnt +#else + return value ? __builtin_ctz(value) : 32; // bsf +#endif +#elif defined(_MSC_VER) +#if defined(__AVX2__) + return _tzcnt_u32(value); // tzcnt +#else + unsigned long r; + _BitScanForward(&r, value); // bsf + return value ? 
r : 32; +#endif +#endif +#endif +#endif + + if (value) //-V779 + { + unsigned n = 1; + // clang-format off + if((value & 0x0000FFFF) == 0) { n += 16; value >>= 16; } + if((value & 0x000000FF) == 0) { n += 8; value >>= 8; } + if((value & 0x0000000F) == 0) { n += 4; value >>= 4; } + if((value & 0x00000003) == 0) { n += 2; value >>= 2; } + // clang-format on + return (n - ((unsigned)value & 1)); + } + + return 32; +} + +inline unsigned __ctz(int value) { return __ctz((unsigned int)value); } + +inline unsigned __ctz(unsigned long value) +{ + if constexpr (sizeof(value) == 8) + return __ctz((unsigned long long)value); + else + return __ctz((unsigned int)value); +} + +inline unsigned __ctz(long value) { return __ctz((unsigned long)value); } + +inline unsigned __clz_unsafe(unsigned long long value) +{ +#if defined(__arm64__) || defined(__aarch64__) // Apple silicon or ARMv8 + return __builtin_clzll(value); // clz +#else +#if _TARGET_64BIT +#if defined(__clang__) +#ifdef __LZCNT__ + return __builtin_ia32_lzcnt_u64(value); // lzcnt +#else + return __builtin_clzll(value); // bsr +#endif +#elif defined(__GNUC__) + return __builtin_clzll(value); // bsr or lznct if -mabm +#elif defined(_MSC_VER) +#if defined(__AVX2__) + return _lzcnt_u64(value); // lzcnt +#else + unsigned long r; + _BitScanForward64(&r, value); // bsr + return r ^ 63; +#endif +#endif +#endif +#endif + + unsigned n = 0; //-V779 + // clang-format off + if(value & (0xFFFFFFFF00000000ull)) { n += 32; value >>= 32; } + if(value & 0xFFFF0000) { n += 16; value >>= 16; } + if(value & 0xFFFFFF00) { n += 8; value >>= 8; } + if(value & 0xFFFFFFF0) { n += 4; value >>= 4; } + if(value & 0xFFFFFFFC) { n += 2; value >>= 2; } + if(value & 0xFFFFFFFE) { n += 1; } + // clang-format on + + return n; +} + +inline unsigned __clz_unsafe(long long value) { return __clz_unsafe((unsigned long long)value); } + +inline unsigned __clz_unsafe(unsigned int value) +{ +#if defined(__ARM_ARCH) + return __builtin_clz(value); // clz +#else +#if 
defined(__clang__) +#ifdef __LZCNT__ + return __builtin_ia32_lzcnt_u32(value); // lzcnt +#else + return __builtin_clz(value); // bsr +#endif +#elif defined(__GNUC__) + return __builtin_clz(value); // bsr or lznct if -mabm +#elif defined(_MSC_VER) +#if defined(__AVX2__) + return _lzcnt_u32(value); // lzcnt +#else + unsigned long r; + _BitScanForward(&r, value); // bsr + return r ^ 31; +#endif +#endif +#endif + + unsigned n = 0; //-V779 + // clang-format off + if(value & 0xFFFF0000) { n += 16; value >>= 16; } + if(value & 0xFFFFFF00) { n += 8; value >>= 8; } + if(value & 0xFFFFFFF0) { n += 4; value >>= 4; } + if(value & 0xFFFFFFFC) { n += 2; value >>= 2; } + if(value & 0xFFFFFFFE) { n += 1; } + // clang-format on + + return n; +} + +inline unsigned __clz_unsafe(int value) { return __clz_unsafe((unsigned int)value); } + +inline unsigned __clz_unsafe(unsigned long value) +{ + if constexpr (sizeof(value) == 8) + return __clz_unsafe((unsigned long long)value); + else + return __clz_unsafe((unsigned int)value); +} + +inline unsigned __clz_unsafe(long value) { return __clz_unsafe((unsigned long)value); } + +inline unsigned __clz(unsigned long long value) +{ +#if defined(__arm64__) || defined(__aarch64__) // Apple silicon or ARMv8 + return __builtin_clzll(value); // clz +#else +#if _TARGET_64BIT +#if defined(__clang__) +#ifdef __LZCNT__ + return __builtin_ia32_lzcnt_u64(value); // lzcnt +#else + return value ? __builtin_clzll(value) : 64; // bsr +#endif +#elif defined(__GNUC__) +#ifdef __LZCNT__ + return __builtin_clzll(value); // lzcnt +#else + return value ? __builtin_clzll(value) : 64; // bsr +#endif +#elif defined(_MSC_VER) +#if defined(__AVX2__) + return _lzcnt_u64(value); // lzcnt +#else + unsigned long r; + _BitScanForward64(&r, value); // bsr + return value ? 
r ^ 63 : 64; +#endif +#endif +#endif +#endif + + if (value) //-V779 + { + unsigned n = 0; + // clang-format off + if(value & (0xFFFFFFFF00000000ull)) { n += 32; value >>= 32; } + if(value & 0xFFFF0000) { n += 16; value >>= 16; } + if(value & 0xFFFFFF00) { n += 8; value >>= 8; } + if(value & 0xFFFFFFF0) { n += 4; value >>= 4; } + if(value & 0xFFFFFFFC) { n += 2; value >>= 2; } + if(value & 0xFFFFFFFE) { n += 1; } + // clang-format on + + return n; + } + + return 64; +} + +inline unsigned __clz(long long value) { return __clz((unsigned long long)value); } + +inline unsigned __clz(unsigned int value) +{ +#if defined(__ARM_ARCH) + return __builtin_clz(value); // clz +#else +#if defined(__clang__) +#ifdef __LZCNT__ + return __builtin_ia32_lzcnt_u32(value); // lzcnt +#else + return value ? __builtin_clz(value) : 32; // bsr +#endif +#elif defined(__GNUC__) +#ifdef __LZCNT__ + return __builtin_clz(value); // lzcnt +#else + return value ? __builtin_clz(value) : 32; // bsr +#endif +#elif defined(_MSC_VER) +#if defined(__AVX2__) + return _lzcnt_u32(value); // lzcnt +#else + unsigned long r; + _BitScanForward(&r, value); // bsr + return value ? 
r ^ 31 : 32; +#endif +#endif +#endif + + if (value) //-V779 + { + unsigned n = 0; + // clang-format off + if(value & 0xFFFF0000) { n += 16; value >>= 16; } + if(value & 0xFFFFFF00) { n += 8; value >>= 8; } + if(value & 0xFFFFFFF0) { n += 4; value >>= 4; } + if(value & 0xFFFFFFFC) { n += 2; value >>= 2; } + if(value & 0xFFFFFFFE) { n += 1; } + // clang-format on + + return n; + } + + return 32; +} + +inline unsigned __clz(int value) { return __clz((unsigned int)value); } + +inline unsigned __clz(unsigned long value) +{ + if constexpr (sizeof(value) == 8) + return __clz((unsigned long long)value); + else + return __clz((unsigned int)value); +} + +inline unsigned __clz(long value) { return __clz((unsigned long)value); } + +inline unsigned __blsr(unsigned long long value) +{ +#if defined(__BMI__) && _TARGET_64BIT + return _blsr_u64(value); +#else + return value & (value - 1); +#endif +} + +inline unsigned __blsr(long long value) { return __blsr((unsigned long long)value); } + +inline unsigned __blsr(unsigned int value) +{ +#if defined(__BMI__) + return _blsr_u32(value); +#else + return value & (value - 1); +#endif +} + +inline unsigned __blsr(int value) { return __blsr((unsigned int)value); } + +inline unsigned __blsr(unsigned long value) +{ + if constexpr (sizeof(value) == 8) + return __blsr((unsigned long long)value); + else + return __blsr((unsigned int)value); +} + +inline unsigned __blsr(long value) { return __blsr((unsigned long)value); } diff --git a/prog/dagorInclude/memory/dag_genMemAlloc.h b/prog/dagorInclude/memory/dag_genMemAlloc.h index 37ff67704..60257cfe3 100644 --- a/prog/dagorInclude/memory/dag_genMemAlloc.h +++ b/prog/dagorInclude/memory/dag_genMemAlloc.h @@ -55,6 +55,8 @@ { \ return MEM->realloc(p, sz); \ } \ + static inline void set_name(const char *) \ + {} \ } DECLARE_MEMALLOC(MidmemAlloc, midmem); diff --git a/prog/dagorInclude/osApiWrappers/dag_threads.h b/prog/dagorInclude/osApiWrappers/dag_threads.h index c0a9990d7..1356e8849 100644 --- 
a/prog/dagorInclude/osApiWrappers/dag_threads.h +++ b/prog/dagorInclude/osApiWrappers/dag_threads.h @@ -100,8 +100,9 @@ class DaThread pthread_t id; KRNLIMP static void *threadEntry(void *arg); -#elif _TARGET_PC | _TARGET_XBOX +#elif _TARGET_PC_WIN | _TARGET_XBOX uintptr_t id; + bool minidumpSaveStack = true; KRNLIMP static unsigned __stdcall threadEntry(void *arg); #else @@ -121,6 +122,7 @@ class DaThread KRNLIMP const void *getCurrentThreadIdPtr() const { return &id; } KRNLIMP void setAffinity(unsigned int affinity); // TODO: make affinity ctor arg and remove this method KRNLIMP void setThreadIdealProcessor(int ideal_processor_no); + KRNLIMP void stripStackInMinidump(); KRNLIMP static void terminate_all(bool wait, int timeout_ms = 3000); @@ -132,12 +134,15 @@ class DaThread KRNLIMP static const char *getCurrentThreadName(); KRNLIMP static bool applyThisThreadPriority(int prio, const char *name = nullptr); +#if _TARGET_PC | _TARGET_XBOX + KRNLIMP static bool isDaThreadWinUnsafe(uintptr_t thread_id, bool &minidump_save_stack); // no lock +#endif + KRNLIMP static void setCurrentThreadName(const char *tname); protected: KRNLIMP virtual ~DaThread(); void applyThreadPriority(); - static void setCurrentThreadName(const char *tname); void afterThreadExecution(); void applyThreadAffinity(unsigned int affinity); void doThread(); diff --git a/prog/dagorInclude/phys/dag_fastPhys.h b/prog/dagorInclude/phys/dag_fastPhys.h index 5ba3ecca1..abf586381 100644 --- a/prog/dagorInclude/phys/dag_fastPhys.h +++ b/prog/dagorInclude/phys/dag_fastPhys.h @@ -55,9 +55,6 @@ struct ClippedLine ClippedLine(int i1, int i2, int n) : p1Index(i1), p2Index(i2), numSegs(n) {} }; -void toggleDebugAnimChar(eastl::string &str); -bool checkDebugAnimChar(eastl::string &str); -void resetDebugAnimChars(); }; // namespace FastPhys diff --git a/prog/dagorInclude/shaders/dag_linearMatrixBufferAllocator.h b/prog/dagorInclude/shaders/dag_linearMatrixBufferAllocator.h index a42302285..4d08fc9ec 100644 --- 
a/prog/dagorInclude/shaders/dag_linearMatrixBufferAllocator.h +++ b/prog/dagorInclude/shaders/dag_linearMatrixBufferAllocator.h @@ -25,9 +25,7 @@ struct MatrixBufferHeapManager Tab bindposeArr; }; - MatrixBufferHeapManager(const char *name) : - sbufferHeapManager(name, ELEM_SIZE, SBCF_BIND_SHADER_RES | SBCF_MISC_STRUCTURED | SBCF_MAYBELOST, 0) - {} + MatrixBufferHeapManager(const char *name) : sbufferHeapManager(name, ELEM_SIZE, SBCF_BIND_SHADER_RES | SBCF_MISC_STRUCTURED, 0) {} void copy(Heap &to, size_t to_offset, const Heap &from, size_t from_offset, size_t len) { diff --git a/prog/dagorInclude/shaders/dag_postFxRenderer.h b/prog/dagorInclude/shaders/dag_postFxRenderer.h index 8979ca047..69926dcb3 100644 --- a/prog/dagorInclude/shaders/dag_postFxRenderer.h +++ b/prog/dagorInclude/shaders/dag_postFxRenderer.h @@ -24,7 +24,7 @@ class PostFxRenderer explicit PostFxRenderer(const char *shader_name); void clear(); - void init(const char *shader_name); + void init(const char *shader_name, bool is_optional = false); // Use to set shader params. 
ShaderMaterial *getMat() { return shmat; } diff --git a/prog/dagorInclude/shaders/dag_rendInstRes.h b/prog/dagorInclude/shaders/dag_rendInstRes.h index 78006d1de..c435fb1c9 100644 --- a/prog/dagorInclude/shaders/dag_rendInstRes.h +++ b/prog/dagorInclude/shaders/dag_rendInstRes.h @@ -118,6 +118,10 @@ decl_dclass_and_id(RenderableInstanceLodsResource, DObject, 0x0F076634u) ShaderGlobal::reset_from_vars_and_release_managed_tex(ao_smoothness); ShaderGlobal::reset_from_vars_and_release_managed_tex_verified(shadowAtlas, shadowAtlasTex); } + bool isInitialized() + { + return albedo_alpha != BAD_TEXTUREID && normal_translucency != BAD_TEXTUREID && ao_smoothness != BAD_TEXTUREID; + } ImpostorTextures() = default; ~ImpostorTextures() { close(); } ImpostorTextures(const ImpostorTextures &) = delete; @@ -176,7 +180,6 @@ decl_dclass_and_id(RenderableInstanceLodsResource, DObject, 0x0F076634u) inline float getMaxDist() const { return lods.size() ? lods.back().range : 0; } bool isBakedImpostor() const; - BaseTexture *getPreshadowTexture() const; bool setImpostorVars(ShaderMaterial * mat, int buffer_offset) const; bool setImpostorTransitionRange(ShaderMaterial * mat, float transition_lod_start, float transition_range) const; void prepareTextures(const char *name, uint32_t shadow_atlas_size, int shadow_atlas_mip_offset, int texture_format_flags); diff --git a/prog/dagorInclude/shaders/dag_shaderVar.h b/prog/dagorInclude/shaders/dag_shaderVar.h index 7d914f8e4..e4ac1dc60 100644 --- a/prog/dagorInclude/shaders/dag_shaderVar.h +++ b/prog/dagorInclude/shaders/dag_shaderVar.h @@ -6,6 +6,7 @@ #pragma once #include <3d/dag_texMgr.h> +#include <3d/dag_sampler.h> #include #include #include @@ -131,6 +132,7 @@ bool set_int4(int variable_id, const IPoint4 &v); bool set_int4_array(int variable_id, const IPoint4 *data, int count); bool set_texture(int variable_id, TEXTUREID texture_id); bool set_texture(int variable_id, const ManagedTex &texture); +bool set_sampler(int variable_id, 
d3d::SamplerHandle handle); bool set_buffer(int variable_id, D3DRESID buffer_id); bool set_buffer(int variable_id, const ManagedBuf &buffer); bool set_texture(const ShaderVariableInfo &, const ManagedTex &texture); diff --git a/prog/dagorInclude/shaders/dag_shaderVarType.h b/prog/dagorInclude/shaders/dag_shaderVarType.h index 2d6582309..a6d20430f 100644 --- a/prog/dagorInclude/shaders/dag_shaderVarType.h +++ b/prog/dagorInclude/shaders/dag_shaderVarType.h @@ -18,6 +18,7 @@ enum ShaderVarType SHVT_BUFFER, SHVT_INT4, SHVT_FLOAT4X4, + SHVT_SAMPLER, }; enum ShaderVarTextureType : int diff --git a/prog/dagorInclude/supp/dag_comPtr.h b/prog/dagorInclude/supp/dag_comPtr.h index 6e19a2a5a..12ed8c5fd 100644 --- a/prog/dagorInclude/supp/dag_comPtr.h +++ b/prog/dagorInclude/supp/dag_comPtr.h @@ -10,6 +10,7 @@ #include #include #include +#include // copy paste (with some small changes) from wrl/client.h // don't blame me for the stupid shit of this... diff --git a/prog/engine/_docs/render/.readthedocs.yaml b/prog/engine/_docs/render/.readthedocs.yaml index a0069709d..92538acdf 100644 --- a/prog/engine/_docs/render/.readthedocs.yaml +++ b/prog/engine/_docs/render/.readthedocs.yaml @@ -7,9 +7,9 @@ build: python: install: - - requirements: prog/engine2/_docs/render/requirements.txt + - requirements: prog/engine/_docs/render/requirements.txt sphinx: - configuration: prog/engine2/_docs/render/docs/conf.py + configuration: prog/engine/_docs/render/docs/conf.py formats: all diff --git a/prog/engine/_docs/render/docs/source/index/daBFG/das.rst b/prog/engine/_docs/render/docs/source/index/daBFG/das.rst index 8a648abe0..d31cc23df 100644 --- a/prog/engine/_docs/render/docs/source/index/daBFG/das.rst +++ b/prog/engine/_docs/render/docs/source/index/daBFG/das.rst @@ -1,5 +1,5 @@ .. - This is auto generated file. See daBfg/api/dasModules/docs + This is auto generated file. See daBfg/api/das/docs .. 
_stdlib_daBfg: diff --git a/prog/engine/anim/animIKCtrl.cpp b/prog/engine/anim/animIKCtrl.cpp index 44cbc6c54..0bc786316 100644 --- a/prog/engine/anim/animIKCtrl.cpp +++ b/prog/engine/anim/animIKCtrl.cpp @@ -63,7 +63,7 @@ void AnimV20::LegsIKCtrl::process(IPureAnimStateHolder &st, real wt, GeomNodeTre mat44f &foot_tm = tree.getNodeTm(foot); mat44f &leg_wtm = tree.getNodeWtmRel(nodes[i].legId); vec3f foot_p1 = transform_vec_upto_root(tree, foot_parent, foot_tm.col3); - Point3_vec4 footNewPos(0.f, 0.f, 0.f); + vec3f footNewPos = v_zero(); bool move_foot = false; const mat44f &animcharTm = tree.getRootWtmRel(); vec4f vup = V_C_UNIT_0100; @@ -82,18 +82,16 @@ void AnimV20::LegsIKCtrl::process(IPureAnimStateHolder &st, real wt, GeomNodeTre mat44f animcharTmCorrected = animcharTm; animcharTmCorrected.col0 = animcharTm.col2; animcharTmCorrected.col2 = animcharTm.col0; - Point3_vec4 dir = as_point3(&animcharTm.col2); + vec3f dir = animcharTm.col2; Point3_vec4 kneeOffset = crawlKneeOffsetVec; kneeOffset[2] *= i == 0 ? 
-1 : 1; - v_st(&kneeOffset, v_mat44_mul_vec3p(animcharTmCorrected, v_ldu(&kneeOffset.x))); vec4f wofsVec4 = v_and(wofs, v_cast_vec4f(V_CI_MASK1110)); - Point3_vec4 pt; float maxDist = crawlMaxRay; - v_st(&pt, v_add(v_mat44_mul_vec3p(tree.getRootWtmRel(), v_and(knee_p, v_cast_vec4f(V_CI_MASK1110))), wofs)); - pt += kneeOffset; - if (ctx.irq(GIRQT_TraceFootStepDir, (intptr_t)(void *)&pt, (intptr_t)(void *)&dir, (intptr_t)(void *)&maxDist) == GIRQR_TraceOK) + vec3f pt = v_add(v_mat44_mul_vec3p(tree.getRootWtmRel(), v_and(knee_p, v_cast_vec4f(V_CI_MASK1110))), wofs); + pt = v_add(pt, v_mat44_mul_vec3p(animcharTmCorrected, v_ldu(&kneeOffset.x))); + if (ctx.irq(GIRQT_TraceFootStepDir, (intptr_t)(char *)&pt, (intptr_t)(char *)&dir, (intptr_t)(char *)&maxDist) == GIRQR_TraceOK) { - footNewPos = pt + dir * (maxDist - crawlFootOffset) - as_point3(&wofsVec4); + footNewPos = v_sub(v_madd(dir, v_splats(maxDist - crawlFootOffset), pt), wofsVec4); nodes[i].da = crawlFootAngle + acosf((crawlMaxRay - maxDist) / crawlMaxRay); move_foot = true; } @@ -185,10 +183,10 @@ void AnimV20::LegsIKCtrl::process(IPureAnimStateHolder &st, real wt, GeomNodeTre float len0 = v_extract_x(v_length3_x(v_sub(leg_wtm.col3, knee_wtm.col3))); float len1 = v_extract_x(v_length3_x(v_sub(foot_wtm.col3, knee_wtm.col3))); if (isCrawl) - as_point3(&foot_wtm.col3) = footNewPos; + foot_wtm.col3 = footNewPos; else foot_wtm.col3 = v_perm_xyzd(v_madd(vup, v_splats(nodes[i].dy), move_foot ? 
foot_p1 : foot_wtm.col3), V_C_UNIT_0001); - solve_2bones_ik(leg_wtm, knee_wtm, foot_wtm, foot_wtm, len0, len1, as_point3(&knee_wtm.col1)); + solve_2bones_ik(leg_wtm, knee_wtm, foot_wtm, foot_wtm, len0, len1, knee_wtm.col1); if (ctx.acScale) { vec3f s = *ctx.acScale; diff --git a/prog/engine/anim/animPostBlendCtrl.cpp b/prog/engine/anim/animPostBlendCtrl.cpp index 61539712a..bffb860b2 100644 --- a/prog/engine/anim/animPostBlendCtrl.cpp +++ b/prog/engine/anim/animPostBlendCtrl.cpp @@ -169,10 +169,14 @@ static bool load_math_op(const DataBlock &blk, int &op, float &p0, float &p1) op = lup(opStr, opNames, countof(opNames), -1); if (const char *v = blk.getStr("named_p0", NULL)) p0 = AnimV20::getEnumValueByName(v); + else if (const char *v = blk.getStr("slot_p0", NULL)) + p0 = AnimCharV20::addSlotId(v); else p0 = blk.getReal("p0", 0); if (const char *v = blk.getStr("named_p1", NULL)) p1 = AnimV20::getEnumValueByName(v); + else if (const char *v = blk.getStr("slot_p1", NULL)) + p1 = AnimCharV20::addSlotId(v); else p1 = blk.getReal("p1", 0); return op >= 0; @@ -2679,7 +2683,7 @@ void AnimPostBlendNodeLookatNodeCtrl::createNode(AnimationGraph &graph, const Da } AnimPostBlendEffFromAttachement::AnimPostBlendEffFromAttachement(AnimationGraph &g) : - AnimPostBlendCtrl(g), nodes(midmem), destVarId(midmem), slotId(-1), varId(-1) + AnimPostBlendCtrl(g), nodes(midmem), destVarId(midmem), namedSlotId(-1), varSlotId(-1), varId(-1) {} void AnimPostBlendEffFromAttachement::init(IPureAnimStateHolder &st, const GeomNodeTree &tree) { @@ -2697,6 +2701,7 @@ void AnimPostBlendEffFromAttachement::process(IPureAnimStateHolder &st, real wt, return; LocalData &ldata = *(LocalData *)st.getInlinePtr(varId); + const int slotId = varSlotId >= 0 ? (int)st.getParam(varSlotId) : namedSlotId; GeomNodeTree *att_tree = slotId < 0 ? 
&tree : ctx.ac->getAttachedSkeleton(slotId); if (!att_tree) @@ -2805,10 +2810,13 @@ void AnimPostBlendEffFromAttachement::createNode(AnimationGraph &graph, const Da AnimPostBlendEffFromAttachement *node = new AnimPostBlendEffFromAttachement(graph); - if (blk.getBool("localNode", false)) - node->slotId = -1; - else - node->slotId = AnimCharV20::addSlotId(blk.getStr("slot")); + if (!blk.getBool("localNode", false)) + { + if (const char *val = blk.getStr("slot", NULL)) + node->namedSlotId = AnimCharV20::addSlotId(val); + else if (const char *val = blk.getStr("varSlot", NULL)) + node->varSlotId = graph.addParamId(val, IPureAnimStateHolder::PT_ScalarParam); + } node->ignoreZeroWt = blk.getBool("ignoreZeroWt", false); for (int j = 0, nid = blk.getNameId("node"); j < blk.blockCount(); j++) if (blk.getBlock(j)->getBlockNameId() == nid) @@ -2840,7 +2848,7 @@ void AnimPostBlendEffFromAttachement::createNode(AnimationGraph &graph, const Da AnimPostBlendNodesFromAttachement::AnimPostBlendNodesFromAttachement(AnimationGraph &g) : - AnimPostBlendCtrl(g), nodes(midmem), varId(-1), slotId(-1), copyWtm(false) + AnimPostBlendCtrl(g), nodes(midmem), varId(-1), namedSlotId(-1), varSlotId(-1), copyWtm(false) {} void AnimPostBlendNodesFromAttachement::reset(IPureAnimStateHolder &st) { @@ -2868,6 +2876,7 @@ void AnimPostBlendNodesFromAttachement::process(IPureAnimStateHolder &st, real w return; LocalData &ldata = *(LocalData *)st.getInlinePtr(varId); + const int slotId = varSlotId >= 0 ? 
(int)st.getParam(varSlotId) : namedSlotId; GeomNodeTree *att_tree = ctx.ac->getAttachedSkeleton(slotId); unsigned att_uid = ctx.ac->getAttachmentUid(slotId); @@ -2922,7 +2931,10 @@ void AnimPostBlendNodesFromAttachement::createNode(AnimationGraph &graph, const AnimPostBlendNodesFromAttachement *node = new AnimPostBlendNodesFromAttachement(graph); - node->slotId = AnimCharV20::addSlotId(blk.getStr("slot")); + if (const char *val = blk.getStr("slot", NULL)) + node->namedSlotId = AnimCharV20::addSlotId(val); + else if (const char *val = blk.getStr("varSlot", NULL)) + node->varSlotId = graph.addParamId(val, IPureAnimStateHolder::PT_ScalarParam); node->copyWtm = blk.getBool("copyWtm", false); G_ASSERTF(!node->copyWtm, "copyWtm:b=yes not implemented, PBC %s", name); for (int j = 0, nid = blk.getNameId("node"); j < blk.blockCount(); j++) @@ -3362,6 +3374,8 @@ void AnimPostBlendSetParam::createNode(AnimationGraph &graph, const DataBlock &b } else if (const char *val = blk.getStr("namedValue", NULL)) node->val = AnimV20::getEnumValueByName(val); + else if (const char *val = blk.getStr("slotIdValue", NULL)) + node->val = AnimCharV20::addSlotId(val); else node->val = blk.getReal("value", 0); diff --git a/prog/engine/baseUtil/safeArg.cpp b/prog/engine/baseUtil/safeArg.cpp index 4c86ddaf1..cd43d9daf 100644 --- a/prog/engine/baseUtil/safeArg.cpp +++ b/prog/engine/baseUtil/safeArg.cpp @@ -445,7 +445,7 @@ int DagorSafeArg::count_len(const char *fmt, const DagorSafeArg *arg, int anum) int DagorSafeArg::print_fmt(char *buf, int len, const char *fmt, const DagorSafeArg *arg, int anum) { - G_ASSERT(buf && fmt && len > 0); + G_FAST_ASSERT(buf && fmt && len > 0); // Don't use regular `G_ASSERT` since it might create unwanted recursion if (!anum) { diff --git a/prog/engine/drv/drv3d_DX11/basetex.cpp b/prog/engine/drv/drv3d_DX11/basetex.cpp index cfb32afa1..cf7be5325 100644 --- a/prog/engine/drv/drv3d_DX11/basetex.cpp +++ b/prog/engine/drv/drv3d_DX11/basetex.cpp @@ -143,7 +143,7 @@ static 
bool createResView(TextureView &tv, BaseTex *tex, uint32_t face, uint32_t else desc.Format = dxgi_format_for_res(tex->format, tex->cflg); - if ((tex->cflg & TEXCF_MULTISAMPLED) == 0 || tex->tex.resolvedTex) + if ((tex->cflg & TEXCF_SAMPLECOUNT_MASK) == 0 || tex->tex.resolvedTex) { if (type == RES3D_TEX) { @@ -295,7 +295,7 @@ static bool createTexView(TextureView &tv, BaseTex *tex, uint32_t face, uint32_t desc.Format = (cflg & TEXCF_SRGBWRITE) ? DXGI_FORMAT_R8G8B8A8_UNORM_SRGB : DXGI_FORMAT_R8G8B8A8_UNORM; } - if ((cflg & TEXCF_MULTISAMPLED) == 0) + if ((cflg & TEXCF_SAMPLECOUNT_MASK) == 0) { if (type == RES3D_TEX) { @@ -400,7 +400,7 @@ static bool createTexView(TextureView &tv, BaseTex *tex, uint32_t face, uint32_t else { G_ASSERT(face == 0 && mip_level == 0 && mip_count == 1); - if ((tex->cflg & TEXCF_MULTISAMPLED) == 0) + if ((tex->cflg & TEXCF_SAMPLECOUNT_MASK) == 0) { desc.ViewDimension = D3D11_DSV_DIMENSION_TEXTURE2D; desc.Texture2D.MipSlice = 0; @@ -466,7 +466,7 @@ void BaseTex::release() { releaseTex(false); } void BaseTex::resolve(ID3D11Resource *dst, DXGI_FORMAT dst_format) { - G_ASSERT(cflg & (TEXCF_MSAATARGET | TEXCF_MULTISAMPLED)); + G_ASSERT(cflg & (TEXCF_SAMPLECOUNT_MASK)); switch (dst_format) { @@ -520,7 +520,7 @@ int BaseTex::update(BaseTexture *base_src) return 0; // arr rect - if (src->cflg & (TEXCF_MSAATARGET | TEXCF_MULTISAMPLED)) + if (src->cflg & TEXCF_SAMPLECOUNT_MASK) src->resolve(tex.texRes, format); else { @@ -584,7 +584,9 @@ int BaseTex::updateSubRegionImpl(BaseTexture *base_src, int src_subres_idx, int and pDstResource parameters, should have identical sample count values. 
*/ - auto isDepthOrMS = [](const TextureInfo &info) { return is_depth_format_flg(info.cflg) || !!(info.cflg & TEXCF_MULTISAMPLED); }; + auto isDepthOrMS = [](const TextureInfo &info) { + return is_depth_format_flg(info.cflg) || !!(info.cflg & TEXCF_SAMPLECOUNT_MASK); + }; TextureInfo srcInfo, dstInfo; base_src->getinfo(srcInfo, src_subres_idx); diff --git a/prog/engine/drv/drv3d_DX11/buffers.cpp b/prog/engine/drv/drv3d_DX11/buffers.cpp index 6e3ccc17f..a9b7f3e61 100644 --- a/prog/engine/drv/drv3d_DX11/buffers.cpp +++ b/prog/engine/drv/drv3d_DX11/buffers.cpp @@ -12,6 +12,8 @@ #include +#include + #define USE_NVAPI_MULTIDRAW 0 // GPU hangs on SLI and under the Nsight. #if HAS_NVAPI && USE_NVAPI_MULTIDRAW #include @@ -293,6 +295,7 @@ Vbuffer *d3d::create_vb(int size, int flg, const char *name) { GenericBuffer *buf = new GenericBuffer(); flg |= SBCF_BIND_VERTEX; + validate_sbuffer_flags(flg, name); G_ASSERT(((flg & SBCF_BIND_MASK) & ~(SBCF_BIND_VERTEX | SBCF_BIND_SHADER_RES)) == 0); bool res = g_buffers.createObj(buf, size, flg, (flg & SBCF_BIND_MASK) >> 16, name); if (!res) @@ -308,6 +311,7 @@ Ibuffer *d3d::create_ib(int size, int flg, const char *stat_name) { GenericBuffer *buf = new GenericBuffer(); flg |= SBCF_BIND_INDEX; + validate_sbuffer_flags(flg, stat_name); G_ASSERT(((flg & SBCF_BIND_MASK) & ~(SBCF_BIND_INDEX | SBCF_BIND_SHADER_RES)) == 0); bool res = g_buffers.createObj(buf, size, flg, (flg & SBCF_BIND_MASK) >> 16, stat_name); if (!res) @@ -321,6 +325,7 @@ Ibuffer *d3d::create_ib(int size, int flg, const char *stat_name) Vbuffer *d3d::create_sbuffer(int struct_size, int elements, unsigned flags, unsigned format, const char *name) { + validate_sbuffer_flags(flags, name); if (flags & SBCF_BIND_INDEX) { G_ASSERTF(format == 0, "Index buffer can't have a format"); @@ -334,7 +339,6 @@ Vbuffer *d3d::create_sbuffer(int struct_size, int elements, unsigned flags, unsi GenericBuffer *buf = new GenericBuffer(); if (flags & SBCF_BIND_CONSTANT) { - G_ASSERTF((flags & 
(SBCF_MAYBELOST | SBCF_DYNAMIC)), "constant buffers have to be either dynamic or maybelost only."); G_ASSERTF((flags & SBCF_BIND_SHADER_RES) == 0, "constant buffers shouldn't be bound as shader resource."); } bool res = g_buffers.createSObj(buf, struct_size, elements, flags, format, name); //== fixme: |SBCF_BIND_VERTEX diff --git a/prog/engine/drv/drv3d_DX11/driver.h b/prog/engine/drv/drv3d_DX11/driver.h index deda389dd..356fd1bdf 100644 --- a/prog/engine/drv/drv3d_DX11/driver.h +++ b/prog/engine/drv/drv3d_DX11/driver.h @@ -179,14 +179,6 @@ extern int gpuAcquireRefCount; extern bool mt_enabled; extern D3D_FEATURE_LEVEL featureLevelsSupported; extern __declspec(thread) HRESULT last_hres; -extern int max_aa_samples; -struct MsaaMaxSamplesDesc -{ - DXGI_FORMAT format; - int samples; -}; -extern MsaaMaxSamplesDesc max_samples_format; -extern bool disable_backbuffer_aa; extern bool is_backbuffer_samplable_depth; extern bool is_nvapi_initialized; extern int default_display_index; diff --git a/prog/engine/drv/drv3d_DX11/drvmain.cpp b/prog/engine/drv/drv3d_DX11/drvmain.cpp index 45962200d..08e5ffe3a 100644 --- a/prog/engine/drv/drv3d_DX11/drvmain.cpp +++ b/prog/engine/drv/drv3d_DX11/drvmain.cpp @@ -942,7 +942,7 @@ FSHADER d3d::create_pixel_shader_asm(const char * /*asm_text*/) { return BAD_FSH bool d3d::setscissor(int x, int y, int w, int h) { - G_ASSERT(w > 0 && h > 0); + G_ASSERTF(w > 0 && h > 0, "%s(%d, %d, %d, %d)", __FUNCTION__, x, y, w, h); g_render_state.nextRasterizerState.scissor_x = x; g_render_state.nextRasterizerState.scissor_y = y; g_render_state.nextRasterizerState.scissor_w = w; diff --git a/prog/engine/drv/drv3d_DX11/genericBuffer.cpp b/prog/engine/drv/drv3d_DX11/genericBuffer.cpp index 80a2fdccc..13243e0fe 100644 --- a/prog/engine/drv/drv3d_DX11/genericBuffer.cpp +++ b/prog/engine/drv/drv3d_DX11/genericBuffer.cpp @@ -254,18 +254,6 @@ bool GenericBuffer::create(uint32_t bufsize, int buf_flags, UINT bind_flags, con { return createBuf(); } - if ((buf_flags & 
SBCF_BIND_UNORDERED) && !(buf_flags & SBCF_MAYBELOST)) - { -#if DAGOR_DBGLEVEL > 0 - logerr("buffer <%s> with unordered access view is required to be created with system copy." - " Doesn't make any sense, add SBCF_MAYBELOST", - getResName()); -#endif - buf_flags |= SBCF_MAYBELOST; - } - - if (!(buf_flags & SBCF_MAYBELOST)) - systemCopy = new char[bufsize]; return true; // delayed create } @@ -299,8 +287,7 @@ bool GenericBuffer::recreateBuf(Sbuffer *sb) TEXQL_ON_BUF_ALLOC(this); if (bufFlags & (SBCF_BIND_INDEX | SBCF_BIND_VERTEX)) { - if (bufFlags & (SBCF_DYNAMIC | SBCF_MAYBELOST)) - result |= createBuf(); + result |= createBuf(); } else { @@ -383,9 +370,6 @@ bool GenericBuffer::copyTo(Sbuffer *dest, uint32_t dst_ofs_bytes, uint32_t src_o if (!dest) return false; GenericBuffer *destvb = (GenericBuffer *)dest; - G_ASSERTF(((dest->getFlags() & SBCF_BIND_MASK) != SBCF_BIND_VERTEX && (dest->getFlags() & SBCF_BIND_MASK) != SBCF_BIND_INDEX) || - (dest->getFlags() & SBCF_MAYBELOST), // it is valid to copy to non-immutable buffer - "destination index/vertex buffer is immutable"); if (!destvb->buffer) return false; D3D11_BOX box; @@ -428,21 +412,19 @@ bool GenericBuffer::updateData(uint32_t ofs_bytes, uint32_t size_bytes, const vo logerr("buffer lock in updateData during reset %s", getResName()); return false; } - if (!buffer && (bufFlags & SBCF_DYNAMIC) == 0 && (bufFlags & SBCF_MAYBELOST)) + if (!buffer && (bufFlags & SBCF_DYNAMIC) == 0) createBuf(); if (!buffer) { if ((bufFlags & SBCF_DYNAMIC) == 0) - return updateDataWithLock(ofs_bytes, size_bytes, src, lockFlags); // case of delayed create. todo: we can also optimize rare case - // of SBCF_MAYBELOST, and not create system copy + return updateDataWithLock(ofs_bytes, size_bytes, src, lockFlags); // case of delayed create. 
logerr("buffer not created %s", getResName()); return false; } if ((bufFlags & (SBCF_DYNAMIC | SBCF_CPU_ACCESS_WRITE)) || // dynamic and immutable are not updateable, // https://msdn.microsoft.com/en-us/library/windows/desktop/ff476486(v=vs.85).aspx - (lockFlags & (VBLOCK_NOOVERWRITE | VBLOCK_DISCARD)) || - !(bufFlags & SBCF_MAYBELOST)) // if it is not maybelost, we can't lost data on reset and have to update with sysmem copy + (lockFlags & (VBLOCK_NOOVERWRITE | VBLOCK_DISCARD))) { return updateDataWithLock(ofs_bytes, size_bytes, src, lockFlags); } diff --git a/prog/engine/drv/drv3d_DX11/init.cpp b/prog/engine/drv/drv3d_DX11/init.cpp index b472a6bfd..ecc45befe 100644 --- a/prog/engine/drv/drv3d_DX11/init.cpp +++ b/prog/engine/drv/drv3d_DX11/init.cpp @@ -234,9 +234,6 @@ bool immutable_textures = true; bool resetting_device_now = false; HRESULT device_is_lost = S_OK; unsigned texture_sysmemcopy_usage = 0, vbuffer_sysmemcopy_usage = 0, ibuffer_sysmemcopy_usage = 0; -int max_aa_samples = 1; -MsaaMaxSamplesDesc max_samples_format = {DXGI_FORMAT_UNKNOWN, 1}; -bool disable_backbuffer_aa = true; bool is_backbuffer_samplable_depth = false; static DXGI_ADAPTER_DESC adapterDesc; bool ignore_resource_leaks_on_exit = false; @@ -787,10 +784,10 @@ bool init_device(Driver3dInitCallback *cb, HWND window_hwnd, int screen_wdt, int bool inWin = dgs_get_window_mode() != WindowMode::FULLSCREEN_EXCLUSIVE; - DXGI_SWAP_EFFECT swapEffect = DXGI_SWAP_EFFECT_DISCARD; // The only flag with multisampling support. - if ((inWin && disable_backbuffer_aa && !_no_vsync && blk_dx.getBool("flipPresent", true)) // Use the best supported presentation - // model. - || hdr_enabled // HDR requires FLIP swap effect or fullscreen + DXGI_SWAP_EFFECT swapEffect = DXGI_SWAP_EFFECT_DISCARD; // The only flag with multisampling support. + if ((inWin && !_no_vsync && blk_dx.getBool("flipPresent", true)) // Use the best supported presentation + // model. 
+ || hdr_enabled // HDR requires FLIP swap effect or fullscreen || used_flip_model_before) // If the window was used with a flip model, it cannot be used with another model, or Present will // silently fail. { // https://devblogs.microsoft.com/directx/dxgi-flip-model/ and @@ -830,7 +827,7 @@ bool init_device(Driver3dInitCallback *cb, HWND window_hwnd, int screen_wdt, int scd.BufferDesc.RefreshRate.Denominator = 1; // While been correct according to MSDN, 0 crashes in some drivers. scd.BufferDesc.ScanlineOrdering = DXGI_MODE_SCANLINE_ORDER_UNSPECIFIED; // DXGI_MODE_SCANLINE_ORDER_PROGRESSIVE; scd.BufferDesc.Scaling = DXGI_MODE_SCALING_UNSPECIFIED; // DXGI_MODE_SCALING_STRETCHED;//DXGI_MODE_SCALING_CENTERED;// - scd.SampleDesc.Count = disable_backbuffer_aa ? 1 : max_aa_samples; + scd.SampleDesc.Count = 1; scd.SampleDesc.Quality = 0; scd.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT | DXGI_USAGE_SHADER_INPUT; // To stretch from. scd.BufferCount = is_flip_model(swapEffect) ? 2 : 1; @@ -1776,42 +1773,6 @@ bool init_device(Driver3dInitCallback *cb, HWND window_hwnd, int screen_wdt, int DXGI_FORMAT_R24G8_TYPELESS // MSAA support should be the same as for the DXGI_FORMAT_D24_UNORM_S8_UINT }; - for (int formatNo = 0; formatNo < sizeof(msaaFormats) / sizeof(msaaFormats[0]); formatNo++) - { - for (; max_aa_samples > 1; max_aa_samples--) - { - uint32_t numQualityLevels = 0; - dx_device->CheckMultisampleQualityLevels(msaaFormats[formatNo], max_aa_samples, &numQualityLevels); - if (numQualityLevels > 0) - break; - } - } - - DXGI_FORMAT maxMsaaFormats[] = { - DXGI_FORMAT_R8_UNORM, // sort by bit count - DXGI_FORMAT_R16_UNORM, - DXGI_FORMAT_R8G8_UNORM, - DXGI_FORMAT_B8G8R8A8_UNORM, - }; - int bestSamples = 1; - const int maxSamples = 8; - for (int formatNo = 0; formatNo < sizeof(maxMsaaFormats) / sizeof(maxMsaaFormats[0]); formatNo++) - { - for (int samples = maxSamples; samples > bestSamples; samples--) - { - uint32_t numQualityLevels = 0; - 
dx_device->CheckMultisampleQualityLevels(maxMsaaFormats[formatNo], samples, &numQualityLevels); - if (numQualityLevels > 0) - { - max_samples_format.format = maxMsaaFormats[formatNo]; - max_samples_format.samples = bestSamples = samples; - break; - } - } - } - debug("best possible aa samples count is %d, format %s", max_samples_format.samples, - dxgi_format_to_string(max_samples_format.format)); - nvlowlatency::init(dx_device, false); /* @@ -2379,13 +2340,7 @@ bool d3d::init_video(void *hinst, main_wnd_f *mwf, const char *wcname, int ncmds // debug("re sc %d %d",scr_wd,scr_ht); int screenBpp = blk_video.getBool("bits16", false) ? 16 : 32; - max_aa_samples = blk_dx.getInt("maxaa", 1); - if (cb && cb->desiredStereoRender()) - max_aa_samples = 1; - if (max_aa_samples < 1) - max_aa_samples = 1; - disable_backbuffer_aa = blk_video.getBool("disable_backbuffer_aa", true); is_backbuffer_samplable_depth = blk_video.getBool("backbuffer_sampleable_depth", false); // false by default, to be tested if it // slowing anything diff --git a/prog/engine/drv/drv3d_DX11/rtarget.cpp b/prog/engine/drv/drv3d_DX11/rtarget.cpp index f1f1ad09e..be5f20bf0 100644 --- a/prog/engine/drv/drv3d_DX11/rtarget.cpp +++ b/prog/engine/drv/drv3d_DX11/rtarget.cpp @@ -65,7 +65,7 @@ ID3D11DepthStencilView *getDepthStencilView(const Driver3dRenderTarget &rtState) { BaseTex *depthTex = (BaseTex *)g_driver_state.depthTextures[depthNo]; if (depthTex->width == colorTexWidth && depthTex->height == colorTexHeight && - (depthTex->cflg & TEXCF_MULTISAMPLED) == (colorTex->cflg & TEXCF_MULTISAMPLED) && + (depthTex->cflg & TEXCF_SAMPLECOUNT_MASK) == (colorTex->cflg & TEXCF_SAMPLECOUNT_MASK) && ((depthTex->restype() == RES3D_CUBETEX) == (colorTex->restype() == RES3D_CUBETEX) || featureLevelsSupported > D3D_FEATURE_LEVEL_10_0)) { @@ -78,13 +78,13 @@ ID3D11DepthStencilView *getDepthStencilView(const Driver3dRenderTarget &rtState) { if (colorTex->restype() == RES3D_CUBETEX && featureLevelsSupported <= 
D3D_FEATURE_LEVEL_10_0) { - g_driver_state.depthTextures.push_back( - d3d::create_cubetex(colorTexWidth, TEXFMT_DEPTH24 | TEXCF_RTARGET | (colorTex->cflg & TEXCF_MULTISAMPLED), 1, "cubedepth")); + g_driver_state.depthTextures.push_back(d3d::create_cubetex(colorTexWidth, + TEXFMT_DEPTH24 | TEXCF_RTARGET | (colorTex->cflg & TEXCF_SAMPLECOUNT_MASK), 1, "cubedepth")); } else { g_driver_state.depthTextures.push_back(d3d::create_tex(NULL, colorTexWidth, colorTexHeight, - TEXFMT_DEPTH24 | TEXCF_RTARGET | (colorTex->cflg & TEXCF_MULTISAMPLED), 1, "depth")); + TEXFMT_DEPTH24 | TEXCF_RTARGET | (colorTex->cflg & TEXCF_SAMPLECOUNT_MASK), 1, "depth")); } tex = (BaseTex *)(g_driver_state.depthTextures.back()); @@ -392,7 +392,7 @@ bool d3d::set_render_target() rs.modified = rs.rtModified = true; rs.viewModified = VIEWMOD_FULL; rs.nextRtState.setBackbufColor(); - rs.nextRtState.setBackbufDepth(); + rs.nextRtState.removeDepth(); return true; } @@ -727,7 +727,7 @@ static bool try_copy_tex(BaseTexture *from, BaseTexture *to, RectInt *from_rect, BaseTex *fromBase = (BaseTex *)from; BaseTex *toBase = (BaseTex *)to; - if (fromBase->format != toBase->format || (fromBase->cflg & TEXCF_MULTISAMPLED) != (toBase->cflg & TEXCF_MULTISAMPLED)) + if (fromBase->format != toBase->format || (fromBase->cflg & TEXCF_SAMPLECOUNT_MASK) != (toBase->cflg & TEXCF_SAMPLECOUNT_MASK)) return false; int srcw, srch; @@ -847,7 +847,6 @@ bool d3d::stretch_rect(BaseTexture *from, BaseTexture *to, RectInt *from_rect, R return false; } d3d::set_render_target(); - d3d::set_depth(NULL, DepthAccess::RW); stretch_prepare(from); d3d::draw_up(PRIM_TRILIST, 1, fullScrTri, sizeof(fullScrTri[0])); rs.restore(); @@ -867,7 +866,6 @@ bool d3d::stretch_rect(BaseTexture *from, BaseTexture *to, RectInt *from_rect, R d3d::set_render_target(); if (to) d3d::set_render_target((Texture *)to, 0); - d3d::set_depth(NULL, DepthAccess::RW); stretch_prepare(from); if (to_rect || from_rect) { diff --git 
a/prog/engine/drv/drv3d_DX11/shaders.cpp b/prog/engine/drv/drv3d_DX11/shaders.cpp index 980184a87..642f8ad18 100644 --- a/prog/engine/drv/drv3d_DX11/shaders.cpp +++ b/prog/engine/drv/drv3d_DX11/shaders.cpp @@ -1475,7 +1475,7 @@ PROGRAM d3d::create_program(const uint32_t *vpr_native, const uint32_t *fsh_nati return BAD_PROGRAM; } -PROGRAM d3d::create_program_cs(const uint32_t *native_code) +PROGRAM d3d::create_program_cs(const uint32_t *native_code, CSPreloaded) { SHADER_ID compute_shader = BAD_SHADER_ID; diff --git a/prog/engine/drv/drv3d_DX11/states.cpp b/prog/engine/drv/drv3d_DX11/states.cpp index 852162961..a6edf5cb4 100644 --- a/prog/engine/drv/drv3d_DX11/states.cpp +++ b/prog/engine/drv/drv3d_DX11/states.cpp @@ -59,7 +59,8 @@ shaders::DriverRenderStateId current_render_state; shaders::DriverRenderStateId stretch_prepare_render_state; eastl::vector_map clear_view_states; -KeyMapHashed, BlendState::Key, 128> blend_state_cache; // currently we have ~70 in game, so we use hash +KeyMapWideHashed, BlendState::Key, 128> blend_state_cache; // currently we have ~70 in game, so we use + // hash KeyMapWideHashed, RasterizerState::Key, 32> rasterizer_state_cache; static int rasterizer_state_cacheCount = 0; KeyMap, DepthStencilState::Key, 16> depth_stencil_state_cache; // currently we have ~10 depth @@ -353,51 +354,53 @@ ID3D11BlendState *BlendState::getStateObject() ZeroMemory(&desc, sizeof(desc)); desc.AlphaToCoverageEnable = toBOOL(alphaToCoverage); - - uint32_t ablendEnableMask = ablendEnable ? 0xFF : 0; // fixme: we should use enabling blend per target. //ablendEnableMrt & - // (ablendEnable ? 
0xFF : 0); - - bool useIndependentBlend = toBOOL(ablendEnableMask != 0 && ablendEnableMask != 0xFF); // -V560 always false + desc.IndependentBlendEnable = independentBlendEnabled; for (int i = 1; i < D3D11_SIMULTANEOUS_RENDER_TARGET_COUNT; i++) - useIndependentBlend |= (writeMask[0] != writeMask[i]); - desc.IndependentBlendEnable = useIndependentBlend; - - bool blendEnable = ablendEnable | sepAblendEnable; - - // available dagor blend modes directly corresponds to D3D11_BLEND - D3D11_BLEND ablendSrcASet, ablendDstASet; - if (!sepAblendEnable) { - static D3D11_BLEND blendRemap[20] = { - D3D11_BLEND_ZERO, D3D11_BLEND_ZERO, D3D11_BLEND_ONE, - D3D11_BLEND_SRC_ALPHA, // D3D11_BLEND_SRC_COLOR = 3, - D3D11_BLEND_INV_SRC_ALPHA, // D3D11_BLEND_INV_SRC_COLOR = 4, - D3D11_BLEND_SRC_ALPHA, D3D11_BLEND_INV_SRC_ALPHA, D3D11_BLEND_DEST_ALPHA, D3D11_BLEND_INV_DEST_ALPHA, - D3D11_BLEND_DEST_ALPHA, // D3D11_BLEND_DEST_COLOR = 9, - D3D11_BLEND_INV_DEST_ALPHA, // D3D11_BLEND_INV_DEST_COLOR = 10, - D3D11_BLEND_SRC_ALPHA_SAT, D3D11_BLEND_ZERO, D3D11_BLEND_ZERO, D3D11_BLEND_BLEND_FACTOR, D3D11_BLEND_INV_BLEND_FACTOR, - D3D11_BLEND_SRC1_ALPHA, // D3D11_BLEND_SRC1_COLOR, - D3D11_BLEND_INV_SRC1_ALPHA, // D3D11_BLEND_INV_SRC1_COLOR , - D3D11_BLEND_SRC1_ALPHA, - D3D11_BLEND_INV_SRC1_ALPHA // 19 - }; - - ablendSrcASet = (D3D11_BLEND)blendRemap[ablendSrc], ablendDstASet = (D3D11_BLEND)blendRemap[ablendDst]; + desc.IndependentBlendEnable |= (writeMask[0] != writeMask[i]); } - else - ablendSrcASet = (D3D11_BLEND)ablendSrcA, ablendDstASet = (D3D11_BLEND)ablendDstA; + for (int i = 0; i < D3D11_SIMULTANEOUS_RENDER_TARGET_COUNT; i++) { + uint32_t blendParamsId = independentBlendEnabled && (i < shaders::RenderState::NumIndependentBlendParameters) ? 
i : 0; + const auto &blendParams = params[blendParamsId]; + + // available dagor blend modes directly corresponds to D3D11_BLEND + D3D11_BLEND ablendSrcASet, ablendDstASet; + if (!blendParams.sepAblendEnable) + { + static D3D11_BLEND blendRemap[20] = { + D3D11_BLEND_ZERO, D3D11_BLEND_ZERO, D3D11_BLEND_ONE, + D3D11_BLEND_SRC_ALPHA, // D3D11_BLEND_SRC_COLOR = 3, + D3D11_BLEND_INV_SRC_ALPHA, // D3D11_BLEND_INV_SRC_COLOR = 4, + D3D11_BLEND_SRC_ALPHA, D3D11_BLEND_INV_SRC_ALPHA, D3D11_BLEND_DEST_ALPHA, D3D11_BLEND_INV_DEST_ALPHA, + D3D11_BLEND_DEST_ALPHA, // D3D11_BLEND_DEST_COLOR = 9, + D3D11_BLEND_INV_DEST_ALPHA, // D3D11_BLEND_INV_DEST_COLOR = 10, + D3D11_BLEND_SRC_ALPHA_SAT, D3D11_BLEND_ZERO, D3D11_BLEND_ZERO, D3D11_BLEND_BLEND_FACTOR, D3D11_BLEND_INV_BLEND_FACTOR, + D3D11_BLEND_SRC1_ALPHA, // D3D11_BLEND_SRC1_COLOR, + D3D11_BLEND_INV_SRC1_ALPHA, // D3D11_BLEND_INV_SRC1_COLOR , + D3D11_BLEND_SRC1_ALPHA, + D3D11_BLEND_INV_SRC1_ALPHA // 19 + }; + ablendSrcASet = blendRemap[blendParams.ablendSrc]; + ablendDstASet = blendRemap[blendParams.ablendDst]; + } + else + { + ablendSrcASet = static_cast(blendParams.ablendSrcA); + ablendDstASet = static_cast(blendParams.ablendDstA); + } + D3D11_RENDER_TARGET_BLEND_DESC &rtb = desc.RenderTarget[i]; - rtb.BlendEnable = toBOOL(ablendEnableMask & (1 << i)); + rtb.BlendEnable = toBOOL(blendParams.ablendEnable); if (rtb.BlendEnable) { - rtb.SrcBlend = (D3D11_BLEND)ablendSrc; // same as D3D11_BLEND - rtb.DestBlend = (D3D11_BLEND)ablendDst; - rtb.BlendOp = (D3D11_BLEND_OP)ablendOp; // same as D3D11_BLEND_OP + rtb.SrcBlend = static_cast(blendParams.ablendSrc); // same as D3D11_BLEND + rtb.DestBlend = static_cast(blendParams.ablendDst); + rtb.BlendOp = static_cast(blendParams.ablendOp); // same as D3D11_BLEND_OP rtb.SrcBlendAlpha = ablendSrcASet; rtb.DestBlendAlpha = ablendDstASet; - rtb.BlendOpAlpha = D3D11_BLEND_OP(sepAblendEnable ? ablendOpA : ablendOp); + rtb.BlendOpAlpha = static_cast(blendParams.sepAblendEnable ? 
blendParams.ablendOpA : blendParams.ablendOp); } else { @@ -411,7 +414,6 @@ ID3D11BlendState *BlendState::getStateObject() rtb.RenderTargetWriteMask = writeMask[i]; // D3D11_COLOR_WRITE_ENABLE }; - { HRESULT hr = dx_device->CreateBlendState(&desc, &p.obj); if (hr == S_OK) @@ -912,14 +914,18 @@ bool d3d::set_blend_factor(E3DCOLOR c) static DriverRenderState shader_render_state_to_driver_render_state(const shaders::RenderState &state) { BlendState blendState; - blendState.ablendEnable = state.blendParams[0].ablend; - blendState.sepAblendEnable = state.blendParams[0].sepablend; - blendState.ablendOp = state.blendParams[0].blendOp; - blendState.ablendOpA = state.blendParams[0].sepablendOp; - blendState.ablendSrc = state.blendParams[0].ablendFactors.src; - blendState.ablendDst = state.blendParams[0].ablendFactors.dst; - blendState.ablendSrcA = state.blendParams[0].sepablendFactors.src; - blendState.ablendDstA = state.blendParams[0].sepablendFactors.dst; + blendState.independentBlendEnabled = state.independentBlendEnabled; + for (uint32_t i = 0; i < shaders::RenderState::NumIndependentBlendParameters; i++) + { + blendState.params[i].ablendEnable = state.blendParams[i].ablend; + blendState.params[i].sepAblendEnable = state.blendParams[i].sepablend; + blendState.params[i].ablendOp = state.blendParams[i].blendOp; + blendState.params[i].ablendOpA = state.blendParams[i].sepablendOp; + blendState.params[i].ablendSrc = state.blendParams[i].ablendFactors.src; + blendState.params[i].ablendDst = state.blendParams[i].ablendFactors.dst; + blendState.params[i].ablendSrcA = state.blendParams[i].sepablendFactors.src; + blendState.params[i].ablendDstA = state.blendParams[i].sepablendFactors.dst; + } blendState.alphaToCoverage = state.alphaToCoverage; uint32_t writeMask = state.colorWr; for (size_t i = 0; i < Driver3dRenderTarget::MAX_SIMRT; i++, writeMask >>= 4) diff --git a/prog/engine/drv/drv3d_DX11/states.h b/prog/engine/drv/drv3d_DX11/states.h index 6d3d4a24f..df9d993e3 100644 --- 
a/prog/engine/drv/drv3d_DX11/states.h +++ b/prog/engine/drv/drv3d_DX11/states.h @@ -132,98 +132,127 @@ struct RasterizerState return k; }; }; -/* - enum - { - ABLEND_MRT_MASK0 = (1<<0), - ABLEND_MRT_MASK1 = (1<<1), - ABLEND_MRT_MASK2 = (1<<2), - ABLEND_MRT_MASK3 = (1<<3), - ABLEND_MRT_ALL = (ABLEND_MRT_MASK0 | ABLEND_MRT_MASK1 | ABLEND_MRT_MASK2 | ABLEND_MRT_MASK3), - ABLEND_MRT_ILLEGAL = (1<<7) - }; -*/ + struct BlendState { - bool ablendEnable; // target 0 - bool sepAblendEnable; - uint8_t ablendEnableMrt; // 1bit for every RT + struct BlendParams + { + bool ablendEnable; + bool sepAblendEnable; + uint8_t ablendOp; // D3D11_BLEND_OP 3 bits + uint8_t ablendOpA; + uint8_t ablendSrc; // D3D11_BLEND 5 bits + uint8_t ablendDst; + uint8_t ablendSrcA; + uint8_t ablendDstA; + }; + uint8_t alphaToCoverage; + bool independentBlendEnabled; + BlendParams params[shaders::RenderState::NumIndependentBlendParameters]; //-V730_NOINIT + uint8_t writeMask[Driver3dRenderTarget::MAX_SIMRT]; // D3D11_COLOR_WRITE_ENABLE 4 bits - uint8_t ablendOp; // D3D11_BLEND_OP 3 bits - uint8_t ablendOpA; - uint8_t ablendSrc; // D3D11_BLEND_ 5 bits - uint8_t ablendDst; - uint8_t ablendSrcA; - uint8_t ablendDstA; + struct Key + { + private: + friend struct BlendState; + + union BlendKey + { + struct + { + uint32_t ablendEnable : 1; + uint32_t sepAblendEnable : 1; + uint32_t ablendOp : 3; // D3D11_BLEND_OP 3 bits + uint32_t ablendOpA : 3; + uint32_t ablendSrc : 5; // D3D11_BLEND 5 bits + uint32_t ablendDst : 5; + uint32_t ablendSrcA : 5; + uint32_t ablendDstA : 5; + }; + uint32_t bits; + }; - uint8_t writeMask[Driver3dRenderTarget::MAX_SIMRT]; // D3D11_COLOR_WRITE_ENABLE 4 bits + union + { + struct + { + uint32_t writeMask0 : 4; + uint32_t writeMask1 : 4; + uint32_t writeMask2 : 4; + uint32_t writeMask3 : 4; + uint32_t writeMask4 : 4; + uint32_t writeMask5 : 4; + uint32_t writeMask6 : 4; + uint32_t writeMask7 : 4; + }; + uint32_t writeMaskBits; + }; - typedef uint64_t Key; + BlendKey 
blendKeys[shaders::RenderState::NumIndependentBlendParameters]; - ID3D11BlendState *getStateObject(); + uint32_t alphaToCoverage : 1; - BlendState() : - ablendEnable(false), - sepAblendEnable(false), - ablendEnableMrt(0x0), - alphaToCoverage(0), // alphaToOne(0), - ablendOp(BLENDOP_ADD), - ablendSrc(BLEND_ONE), - ablendDst(BLEND_ZERO), - ablendOpA(BLENDOP_ADD), - ablendSrcA(BLEND_ONE), - ablendDstA(BLEND_ZERO) + public: + Key() + { + // makes sure all the bits of the key set to zero + memset(this, 0, sizeof(Key)); + } + inline uint32_t getHash() const { return hash32shiftmult(writeMaskBits + blendKeys[0].bits); } + }; + + static_assert(sizeof(Key) == 4 + shaders::RenderState::NumIndependentBlendParameters * sizeof(Key::BlendKey) + 4); + + ID3D11BlendState *getStateObject(); + BlendState() : alphaToCoverage(0), independentBlendEnabled(false) { + for (auto &blendParams : params) + { + blendParams.ablendEnable = false; + blendParams.sepAblendEnable = false; + blendParams.ablendOp = BLENDOP_ADD; + blendParams.ablendSrc = BLEND_ONE; + blendParams.ablendDst = BLEND_ZERO; + blendParams.ablendOpA = BLENDOP_ADD; + blendParams.ablendSrcA = BLEND_ONE; + blendParams.ablendDstA = BLEND_ZERO; + } + for (size_t i = 0; i < countof(writeMask); ++i) writeMask[i] = D3D11_COLOR_WRITE_ENABLE_ALL; } Key makeKey() { - union + Key key; + key.alphaToCoverage = alphaToCoverage; + + for (uint32_t i = 0; i < shaders::RenderState::NumIndependentBlendParameters; i++) { - Key k; - struct - { - uint64_t ablendEnableMrt : 7; // should be 8, for 8 mrt - uint64_t ablendEnable : 1; - uint64_t sepAblendEnable : 1; - uint64_t alphaToCoverage : 1; - uint64_t ablendOp_ablendOpA : 5; // D3D11_BLEND_OP 3 bits per parameter, but each param has only 5 variants (5 bits for all - // combinations) - uint64_t ablendSrc_Dst_SrcA_DstA : 17; // D3D11_BLEND_ 5 bits per parameter, but each param has only 17 variants (17 bits for - // all combinations) - uint64_t writeMask0 : 4; - uint64_t writeMask1 : 4; - uint64_t 
writeMask2 : 4; - uint64_t writeMask3 : 4; - uint64_t writeMask4 : 4; - uint64_t writeMask5 : 4; - uint64_t writeMask6 : 4; - uint64_t writeMask7 : 4; - } s; - } u; - G_STATIC_ASSERT(sizeof(u) == sizeof(Key)); - u.k = 0; - COPY_KEY(ablendEnable); - COPY_KEY(sepAblendEnable); - COPY_KEY(alphaToCoverage); - u.s.ablendEnableMrt = uint32_t(ablendEnableMrt & (uint32_t(1 << 8) - 1)); - const uint32_t BLEND_OP_COUNT = 5; // https://docs.microsoft.com/en-us/windows/win32/api/d3d11/ne-d3d11-d3d11_blend_op - u.s.ablendOp_ablendOpA = ablendOp * BLEND_OP_COUNT + ablendOpA; - const uint32_t BLEND_COUNT = 17; // https://docs.microsoft.com/en-us/windows/win32/api/d3d11/ne-d3d11-d3d11_blend - u.s.ablendSrc_Dst_SrcA_DstA = ((ablendSrc * BLEND_COUNT + ablendDst) * BLEND_COUNT + ablendSrcA) * BLEND_COUNT + ablendDstA; - u.s.writeMask0 = writeMask[0]; // MAX_SIMRT=4 - u.s.writeMask1 = writeMask[1]; - u.s.writeMask2 = writeMask[2]; - u.s.writeMask3 = writeMask[3]; - u.s.writeMask4 = writeMask[4]; - u.s.writeMask5 = writeMask[5]; - u.s.writeMask6 = writeMask[6]; - u.s.writeMask7 = writeMask[7]; - return u.k; + auto &blendKey = key.blendKeys[i]; + const auto &blendParams = params[i]; + + blendKey.ablendEnable = blendParams.ablendEnable; + blendKey.sepAblendEnable = blendParams.sepAblendEnable; + blendKey.ablendOp = blendParams.ablendOp; + blendKey.ablendOpA = blendParams.ablendOpA; + blendKey.ablendSrc = blendParams.ablendSrc; + blendKey.ablendDst = blendParams.ablendDst; + blendKey.ablendSrcA = blendParams.ablendSrcA; + blendKey.ablendDstA = blendParams.ablendDstA; + } + + key.writeMask0 = writeMask[0]; + key.writeMask1 = writeMask[1]; + key.writeMask2 = writeMask[2]; + key.writeMask3 = writeMask[3]; + key.writeMask4 = writeMask[4]; + key.writeMask5 = writeMask[5]; + key.writeMask6 = writeMask[6]; + key.writeMask7 = writeMask[7]; + return key; }; }; diff --git a/prog/engine/drv/drv3d_DX11/texture.cpp b/prog/engine/drv/drv3d_DX11/texture.cpp index 0098cf162..28ba78542 100644 --- 
a/prog/engine/drv/drv3d_DX11/texture.cpp +++ b/prog/engine/drv/drv3d_DX11/texture.cpp @@ -469,7 +469,6 @@ DXGI_FORMAT dxgi_format_from_flags(uint32_t cflg) case TEXFMT_DEPTH16: return DXGI_FORMAT_D16_UNORM; case TEXFMT_DEPTH32_S8: return DXGI_FORMAT_D32_FLOAT_S8X24_UINT; case TEXFMT_DEPTH32: return DXGI_FORMAT_D32_FLOAT; - case TEXFMT_MSAA_MAX_SAMPLES: return max_samples_format.format; } G_ASSERTF(0, "unknown texfmt %08x", cflg & TEXFMT_MASK); @@ -877,14 +876,6 @@ static void fixup_tex_params(int w, int h, int32_t &flg, int &levels) flg |= TEXCF_SRGBREAD; // only supports srgbwrite from srgb surfaces! } } - /* - if (adjust_tex2d_size(w, h)) - logerr("texture size changed, as it was bigger, than maximum tex size"); - if (((flg & (TEXCF_ZBUF | TEXCF_RTARGET)) == (TEXCF_ZBUF | TEXCF_RTARGET)) && - !is_depth_format(d3d_format)) - if (align_tex_zbuf_size(w, h, flg & TEXCF_MULTISAMPLED)) - logerr("render target size changed, due to requirement of zbuf"); - */ if (rt && (flg & TEXFMT_MASK) == TEXFMT_R5G6B5) flg = (flg & ~TEXFMT_MASK) | TEXFMT_A8R8G8B8; @@ -920,36 +911,12 @@ void set_tex_params(BaseTex *tex, int w, int h, int d, uint32_t flg, int levels, tex->setTexName(stat_name); } -static bool fix_format_msaa(uint32_t &flg, DXGI_SAMPLE_DESC &sampleDesc) -{ - if ((flg & TEXFMT_MASK) == TEXFMT_MSAA_MAX_SAMPLES) - { - if (max_samples_format.samples <= 1) - return false; - flg |= (TEXCF_MULTISAMPLED | TEXCF_MSAATARGET | TEXCF_RTARGET); - sampleDesc.Quality = 0; - sampleDesc.Count = max_samples_format.samples; - } - else if (flg & (TEXCF_MULTISAMPLED | TEXCF_MSAATARGET)) - { - sampleDesc.Quality = 0; - sampleDesc.Count = max_aa_samples ? 
max_aa_samples : 1; - if (max_aa_samples < 1) // no multisampling - flg &= ~(TEXCF_MULTISAMPLED | TEXCF_MSAATARGET); - } - else - { - sampleDesc.Quality = 0; - sampleDesc.Count = 1; - } - return true; -} bool create_tex2d(BaseTex::D3DTextures &tex, BaseTex *bt_in, uint32_t w, uint32_t h, uint32_t levels, bool cube, D3D11_SUBRESOURCE_DATA *initial_data, int array_size = 1, bool tmp_tex = false) { uint32_t &flg = bt_in->cflg; - G_ASSERT(!((flg & TEXCF_MULTISAMPLED) && initial_data != NULL)); - G_ASSERT(!((flg & TEXCF_MULTISAMPLED) && (flg & TEXCF_CLEAR_ON_CREATE))); + G_ASSERT(!((flg & TEXCF_SAMPLECOUNT_MASK) && initial_data != NULL)); + G_ASSERT(!((flg & TEXCF_SAMPLECOUNT_MASK) && (flg & TEXCF_CLEAR_ON_CREATE))); G_ASSERT(!((flg & TEXCF_LOADONCE) && (flg & (TEXCF_DYNAMIC | TEXCF_RTARGET)))); D3D11_TEXTURE2D_DESC desc = {0}; @@ -959,8 +926,8 @@ bool create_tex2d(BaseTex::D3DTextures &tex, BaseTex *bt_in, uint32_t w, uint32_ tex.realMipLevels = levels; desc.ArraySize = (cube ? 6 : 1) * array_size; desc.Format = dxgi_format_for_create(dxgi_format_from_flags(flg)); - if (!fix_format_msaa(flg, desc.SampleDesc)) - return false; + desc.SampleDesc.Quality = 0; + desc.SampleDesc.Count = get_sample_count(flg); G_ASSERT(desc.Format != DXGI_FORMAT_UNKNOWN); D3D11_USAGE usage = D3D11_USAGE_DEFAULT; // GPU R/W @@ -980,7 +947,7 @@ bool create_tex2d(BaseTex::D3DTextures &tex, BaseTex *bt_in, uint32_t w, uint32_ // The depth texture is intended to be multisampled only as a pair to a multisampled color texture. // There is no need to sample from depth in that case. DX10.0 HW does not support it anyway. desc.BindFlags = - (isDepth && (flg & TEXCF_MULTISAMPLED) && !g_device_desc.caps.hasReadMultisampledDepth) ? 0 : D3D11_BIND_SHADER_RESOURCE; + (isDepth && (flg & TEXCF_SAMPLECOUNT_MASK) && !g_device_desc.caps.hasReadMultisampledDepth) ? 0 : D3D11_BIND_SHADER_RESOURCE; desc.BindFlags |= (isRT ? (isDepth ? 
D3D11_BIND_DEPTH_STENCIL : D3D11_BIND_RENDER_TARGET) : 0); desc.BindFlags |= (flg & TEXCF_UNORDERED) ? D3D11_BIND_UNORDERED_ACCESS : 0; @@ -1011,37 +978,11 @@ bool create_tex2d(BaseTex::D3DTextures &tex, BaseTex *bt_in, uint32_t w, uint32_ if (!tmp_tex) TEXQL_PRE_CLEAN(bt_in->ressize()); - if (flg & TEXCF_MULTISAMPLED) + if ((flg & TEXCF_SAMPLECOUNT_MASK)) { uint32_t numQualities; - - { - HRESULT hr = dx_device->CheckMultisampleQualityLevels(desc.Format, desc.SampleDesc.Count, &numQualities); - if (SUCCEEDED(hr) && numQualities > 0) - { - if (!isDepth && !(flg & TEXCF_MSAATARGET)) - { - HRESULT hr = dx_device->CreateTexture2D(&desc, NULL, &tex.resolvedTex); - if (!device_should_reset(hr, "CreateTexture2D MSAA")) - { - DXFATAL(hr, "CreateTexture2D(resolvedTex)"); - } - } - - // 0 and numQualities - 1 and D3D11_STANDARD_MULTISAMPLE_PATTERN give the same result on ATI. - // 0 and D3D11_STANDARD_MULTISAMPLE_PATTERN give the same result on NV. - // D3D11_STANDARD_MULTISAMPLE_PATTERN is not supported on GF200. - // desc.SampleDesc.Quality = 0; - } - else - { - // Multisampling of this particular format is not supported. Pair with non-multisampled depth. 
- debug("<%s>texture format do not support multisampling", bt_in->getResName()); - flg &= ~(TEXCF_MULTISAMPLED | TEXCF_MSAATARGET); - desc.SampleDesc.Quality = 0; - desc.SampleDesc.Count = 1; - } - } + if (FAILED(dx_device->CheckMultisampleQualityLevels(desc.Format, desc.SampleDesc.Count, &numQualities)) || numQualities == 0) + return false; } { @@ -1118,7 +1059,7 @@ bool create_tex2d(BaseTex::D3DTextures &tex, BaseTex *bt_in, uint32_t w, uint32_ bool create_tex3d(BaseTex::D3DTextures &tex, BaseTex *bt_in, uint32_t w, uint32_t h, uint32_t d, uint32_t flg, uint32_t levels, D3D11_SUBRESOURCE_DATA *initial_data) { - G_ASSERT((flg & TEXCF_MULTISAMPLED) == 0); + G_ASSERT((flg & TEXCF_SAMPLECOUNT_MASK) == 0); G_ASSERT(!((flg & TEXCF_LOADONCE) && (flg & TEXCF_DYNAMIC))); D3D11_TEXTURE3D_DESC desc = {0}; @@ -1434,6 +1375,19 @@ static bool check_texformat(int cflg, int resType) bool d3d::check_texformat(int cflg) { return check_texformat(cflg, RES3D_TEX); } +int d3d::get_max_sample_count(int cflg) +{ + DXGI_FORMAT dxgiFormat = dxgi_format_from_flags(cflg); + for (int samples = get_sample_count(TEXCF_SAMPLECOUNT_MAX); samples; samples >>= 1) + { + uint32_t numQualityLevels = 0; + if (SUCCEEDED(dx_device->CheckMultisampleQualityLevels(dxgiFormat, samples, &numQualityLevels)) && numQualityLevels > 0) + return samples; + } + + return 1; +} + bool d3d::issame_texformat(int f1, int f2) { return dxgi_format_from_flags(f1) == dxgi_format_from_flags(f2); } bool d3d::check_cubetexformat(int f) { return check_texformat(f, RES3D_CUBETEX); } @@ -1477,14 +1431,12 @@ Texture *drv3d_dx11::create_backbuffer_tex(int id, IDXGI_SWAP_CHAIN *swap_chain) ID3D11Texture2D *texRes; DXFATAL(swap_chain->GetBuffer(id, __uuidof(ID3D11Texture2D), (void **)&texRes), "SCC"); - uint32_t msaaFlags = (disable_backbuffer_aa || max_aa_samples <= 1) ? 
0 : TEXCF_MULTISAMPLED | TEXCF_MSAATARGET; - return create_d3d_tex(texRes, "backbuffer", TEXFMT_DEFAULT | TEXCF_RTARGET | msaaFlags); + return create_d3d_tex(texRes, "backbuffer", TEXFMT_DEFAULT | TEXCF_RTARGET); } Texture *drv3d_dx11::create_backbuffer_depth_tex(uint32_t w, uint32_t h) { - uint32_t msaaFlags = (disable_backbuffer_aa || max_aa_samples <= 1) ? 0 : TEXCF_MULTISAMPLED | TEXCF_MSAATARGET; - BaseTex *tex = BaseTex::create_tex(TEXFMT_DEPTH24 | TEXCF_RTARGET | msaaFlags, RES3D_TEX); + BaseTex *tex = BaseTex::create_tex(TEXFMT_DEPTH24 | TEXCF_RTARGET, RES3D_TEX); D3D11_TEXTURE2D_DESC desc; ZeroMemory(&desc, sizeof(desc)); @@ -1496,7 +1448,7 @@ Texture *drv3d_dx11::create_backbuffer_depth_tex(uint32_t w, uint32_t h) desc.Format = is_backbuffer_samplable_depth ? dxgi_format_for_create(DXGI_FORMAT_D24_UNORM_S8_UINT) : DXGI_FORMAT_D24_UNORM_S8_UINT; // DXGI_FORMAT_D32_FLOAT;// d32_float is enough for // everything, but no stencil - desc.SampleDesc.Count = disable_backbuffer_aa ? 1 : max_aa_samples; + desc.SampleDesc.Count = 1; desc.SampleDesc.Quality = 0; desc.Usage = D3D11_USAGE_DEFAULT; desc.BindFlags = D3D11_BIND_DEPTH_STENCIL; diff --git a/prog/engine/drv/drv3d_DX12/bindless.h b/prog/engine/drv/drv3d_DX12/bindless.h index e344d81b7..ade913726 100644 --- a/prog/engine/drv/drv3d_DX12/bindless.h +++ b/prog/engine/drv/drv3d_DX12/bindless.h @@ -1,10 +1,30 @@ #pragma once +#include <3d/dag_drv3d.h> +#include +#include +#include +#include +#include + +#include "image_view_state.h" +#include "sampler_state.h" +#include "container_mutex_wrapper.h" + + namespace drv3d_dx12 { + +class Device; +class DeviceContext; + struct BaseTex; +class Image; class ShaderResourceViewDescriptorHeapManager; class SamplerDescriptorHeapManager; + +struct NullResourceTable; + namespace frontend { class BindlessManager diff --git a/prog/engine/drv/drv3d_DX12/bitfield.h b/prog/engine/drv/drv3d_DX12/bitfield.h index ee9b7894c..6575275d9 100644 --- a/prog/engine/drv/drv3d_DX12/bitfield.h 
+++ b/prog/engine/drv/drv3d_DX12/bitfield.h @@ -1,5 +1,8 @@ #pragma once +#include + + // duplicated in vulkan backend... // taken from https://github.com/preshing/cpp11-on-multicore/blob/master/common/ @@ -171,4 +174,21 @@ struct BitFieldArray #define END_BITFIELD_TYPE() \ } \ - ; \ No newline at end of file + ; + +// TODO: this should be replaced with a constexpr function. It will compile a lot faster. +template +struct BitsNeeded +{ + static constexpr int VALUE = BitsNeeded::VALUE + 1; +}; +template <> +struct BitsNeeded<0> +{ + static constexpr int VALUE = 1; +}; +template <> +struct BitsNeeded<1> +{ + static constexpr int VALUE = 1; +}; diff --git a/prog/engine/drv/drv3d_DX12/buffer.h b/prog/engine/drv/drv3d_DX12/buffer.h index 0e98a0bfd..3a4e9d3bc 100644 --- a/prog/engine/drv/drv3d_DX12/buffer.h +++ b/prog/engine/drv/drv3d_DX12/buffer.h @@ -2,6 +2,12 @@ #include +#include "device_memory_class.h" +#include "format_store.h" +#include "constants.h" +#include "pipeline.h" + + namespace drv3d_dx12 { struct PlatformBufferInterfaceConfig; diff --git a/prog/engine/drv/drv3d_DX12/byte_units.h b/prog/engine/drv/drv3d_DX12/byte_units.h new file mode 100644 index 000000000..5102eef41 --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/byte_units.h @@ -0,0 +1,58 @@ +#pragma once + +#include + + +inline uint32_t size_to_unit_table(uint64_t sz) +{ + uint32_t unitIndex = 0; + unitIndex += sz >= (1024 * 1024 * 1024); + unitIndex += sz >= (1024 * 1024); + unitIndex += sz >= (1024); + return unitIndex; +} + +inline const char *get_unit_name(uint32_t index) +{ + static const char *unitTable[] = {"Bytes", "KiBytes", "MiBytes", "GiBytes"}; + return unitTable[index]; +} + +inline float compute_unit_type_size(uint64_t sz, uint32_t unit_index) { return static_cast(sz) / (powf(1024, unit_index)); } + +class ByteUnits +{ + uint64_t size = 0; + +public: + ByteUnits() = default; + + ByteUnits(const ByteUnits &) = default; + ByteUnits &operator=(const ByteUnits &) = default; + + + 
ByteUnits(uint64_t v) : size{v} {} + ByteUnits &operator=(uint64_t v) + { + size = v; + return *this; + } + + ByteUnits &operator+=(ByteUnits o) + { + size += o.size; + return *this; + } + ByteUnits &operator-=(ByteUnits o) + { + size -= o.size; + return *this; + } + + friend ByteUnits operator+(ByteUnits l, ByteUnits r) { return {l.size + r.size}; } + friend ByteUnits operator-(ByteUnits l, ByteUnits r) { return {l.size - r.size}; } + + uint64_t value() const { return size; } + float units() const { return compute_unit_type_size(size, size_to_unit_table(size)); } + const char *name() const { return get_unit_name(size_to_unit_table(size)); } +}; diff --git a/prog/engine/drv/drv3d_DX12/command_list.h b/prog/engine/drv/drv3d_DX12/command_list.h index 01cf5678e..a5a0fd3d0 100644 --- a/prog/engine/drv/drv3d_DX12/command_list.h +++ b/prog/engine/drv/drv3d_DX12/command_list.h @@ -1,5 +1,12 @@ #pragma once +#include +#include + +#include "driver.h" +#include "extents.h" + + namespace drv3d_dx12 { #define DX12_BEGIN_VALIATION() bool hadError = false @@ -1735,6 +1742,12 @@ class BasicGraphicsCommandListImplementation : public RaytraceCommandList this->list->ClearRenderTargetView(render_target_view, color_rgba, num_rects, rects); } + void resolveSubresource(ID3D12Resource *dst_resource, UINT dst_subresource, ID3D12Resource *src_resource, UINT src_subresource, + DXGI_FORMAT format) + { + this->list->ResolveSubresource(dst_resource, dst_subresource, src_resource, src_subresource, format); + } + void clearDepthStencilView(D3D12_CPU_DESCRIPTOR_HANDLE depth_stencil_view, D3D12_CLEAR_FLAGS clear_flags, FLOAT depth, UINT8 stencil, UINT num_rects, const D3D12_RECT *rects) { @@ -1851,6 +1864,20 @@ class BasicGraphicsCommandListParameterValidataion : public BasicGraphicsCommand BaseType::clearRenderTargetView(render_target_view, color_rgba, num_rects, rects); } + // It validates the following properties of the inputs: + // - If resources is not 0 + void 
resolveSubresource(ID3D12Resource *dst_resource, UINT dst_subresource, ID3D12Resource *src_resource, UINT src_subresource, + DXGI_FORMAT format) + { +#define DX12_VALIDATAION_CONTEXT "resolveSubresource" + DX12_BEGIN_VALIATION(); + DX12_VALIDATE_CONDITION(dst_resource != nullptr, "dst resource can not be 0"); + DX12_VALIDATE_CONDITION(src_resource != nullptr, "src resource can not be 0"); + DX12_FINALIZE_VALIDATAION(); +#undef DX12_VALIDATAION_CONTEXT + this->list->ResolveSubresource(dst_resource, dst_subresource, src_resource, src_subresource, format); + } + // It validates the following properties of the inputs: // - If render_target_view is not 0 // - When num_rects is not 0 then rects is not null diff --git a/prog/engine/drv/drv3d_DX12/const_register_type.h b/prog/engine/drv/drv3d_DX12/const_register_type.h new file mode 100644 index 000000000..b97833d72 --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/const_register_type.h @@ -0,0 +1,21 @@ +#pragma once + +#include +#include + +#include "constants.h" + + +namespace drv3d_dx12 +{ + +struct ConstRegisterType +{ + uint32_t components[SHADER_REGISTER_ELEMENTS]; +}; +inline bool operator==(const ConstRegisterType &l, const ConstRegisterType &r) +{ + return eastl::equal(eastl::begin(l.components), eastl::end(l.components), eastl::begin(r.components)); +} + +} // namespace drv3d_dx12 diff --git a/prog/engine/drv/drv3d_DX12/constants.h b/prog/engine/drv/drv3d_DX12/constants.h index c7124e003..421a36b97 100644 --- a/prog/engine/drv/drv3d_DX12/constants.h +++ b/prog/engine/drv/drv3d_DX12/constants.h @@ -1,46 +1,49 @@ #pragma once +#include "drvCommonConsts.h" + + namespace drv3d_dx12 { #if _TARGET_PC_WIN -constexpr uint32_t FRAME_FRAME_BACKLOG_LENGTH = 4; +inline constexpr uint32_t FRAME_FRAME_BACKLOG_LENGTH = 4; #else -constexpr uint32_t FRAME_FRAME_BACKLOG_LENGTH = 2; +inline constexpr uint32_t FRAME_FRAME_BACKLOG_LENGTH = 2; #endif // has to be two to avoid constantly triggering waits on gpu and cpu -constexpr uint32_t 
FRAME_LATENCY = 2; -constexpr uint32_t DYNAMIC_BUFFER_DISCARD_BASE_COUNT = FRAME_FRAME_BACKLOG_LENGTH; -constexpr uint32_t DYNAMIC_CONST_BUFFER_DISCARD_BASE_COUNT = DYNAMIC_BUFFER_DISCARD_BASE_COUNT * 3; -constexpr uint32_t DEFAULT_BUFFER_DISCARD_BASE_COUNT = 1; +inline constexpr uint32_t FRAME_LATENCY = 2; +inline constexpr uint32_t DYNAMIC_BUFFER_DISCARD_BASE_COUNT = FRAME_FRAME_BACKLOG_LENGTH; +inline constexpr uint32_t DYNAMIC_CONST_BUFFER_DISCARD_BASE_COUNT = DYNAMIC_BUFFER_DISCARD_BASE_COUNT * 3; +inline constexpr uint32_t DEFAULT_BUFFER_DISCARD_BASE_COUNT = 1; -constexpr uint32_t MAX_VERTEX_ATTRIBUTES = 16; -constexpr uint32_t MAX_VERTEX_INPUT_STREAMS = 4; +inline constexpr uint32_t MAX_VERTEX_ATTRIBUTES = 16; +inline constexpr uint32_t MAX_VERTEX_INPUT_STREAMS = 4; -constexpr uint32_t MAX_VIEW_INSTANCES = 4; +inline constexpr uint32_t MAX_VIEW_INSTANCES = 4; -constexpr uint32_t SHADER_REGISTER_ELEMENTS = 4; +inline constexpr uint32_t SHADER_REGISTER_ELEMENTS = 4; typedef float ShaderRegisterElementType; -constexpr uint32_t SHADER_REGISTER_ELEMENT_SIZE = sizeof(ShaderRegisterElementType); -constexpr uint32_t SHADER_REGISTER_SIZE = SHADER_REGISTER_ELEMENTS * SHADER_REGISTER_ELEMENT_SIZE; +inline constexpr uint32_t SHADER_REGISTER_ELEMENT_SIZE = sizeof(ShaderRegisterElementType); +inline constexpr uint32_t SHADER_REGISTER_SIZE = SHADER_REGISTER_ELEMENTS * SHADER_REGISTER_ELEMENT_SIZE; -constexpr uint32_t DEFAULT_WAIT_SPINS = 4000; +inline constexpr uint32_t DEFAULT_WAIT_SPINS = 4000; -constexpr uint32_t UNIFORM_BUFFER_BLOCK_SIZE = 1024 * 1024 * 12; -constexpr uint32_t USER_POINTER_VERTEX_BLOCK_SIZE = 1024 * 1024 * 8; -constexpr uint32_t USER_POINTER_INDEX_BLOCK_SIZE = 1024 * 1024 * 2; -constexpr uint32_t INITIAL_UPDATE_BUFFER_BLOCK_SIZE = 1024 * 1024 * 2; +inline constexpr uint32_t UNIFORM_BUFFER_BLOCK_SIZE = 1024 * 1024 * 12; +inline constexpr uint32_t USER_POINTER_VERTEX_BLOCK_SIZE = 1024 * 1024 * 8; +inline constexpr uint32_t USER_POINTER_INDEX_BLOCK_SIZE = 
1024 * 1024 * 2; +inline constexpr uint32_t INITIAL_UPDATE_BUFFER_BLOCK_SIZE = 1024 * 1024 * 2; // can be adjusted as needed, but be careful, too many may degrade performance because of spilling -constexpr uint32_t MAX_ROOT_CONSTANTS = 4; -constexpr uint32_t ROOT_CONSTANT_BUFFER_INDEX = 8; +inline constexpr uint32_t MAX_ROOT_CONSTANTS = 4; +inline constexpr uint32_t ROOT_CONSTANT_BUFFER_INDEX = 8; -constexpr uint32_t MAX_OBJECT_NAME_LENGTH = 512; +inline constexpr uint32_t MAX_OBJECT_NAME_LENGTH = 512; // After half a second we give up and assume lockup because of an error -constexpr uint32_t MAX_WAIT_OBJECT_TIMEOUT_MS = 500; +inline constexpr uint32_t MAX_WAIT_OBJECT_TIMEOUT_MS = 500; -constexpr uint32_t MAX_COMPUTE_CONST_REGISTERS = 4096; -constexpr uint32_t MIN_COMPUTE_CONST_REGISTERS = DEF_CS_CONSTS; -constexpr uint32_t VERTEX_SHADER_MAX_REGISTERS = 4096; -constexpr uint32_t VERTEX_SHADER_MIN_REGISTERS = MAX_VS_CONSTS_BONES; -constexpr uint32_t PIXEL_SHADER_REGISTERS = MAX_PS_CONSTS; +inline constexpr uint32_t MAX_COMPUTE_CONST_REGISTERS = 4096; +inline constexpr uint32_t MIN_COMPUTE_CONST_REGISTERS = DEF_CS_CONSTS; +inline constexpr uint32_t VERTEX_SHADER_MAX_REGISTERS = 4096; +inline constexpr uint32_t VERTEX_SHADER_MIN_REGISTERS = MAX_VS_CONSTS_BONES; +inline constexpr uint32_t PIXEL_SHADER_REGISTERS = MAX_PS_CONSTS; } // namespace drv3d_dx12 \ No newline at end of file diff --git a/prog/engine/drv/drv3d_DX12/container_mutex_wrapper.h b/prog/engine/drv/drv3d_DX12/container_mutex_wrapper.h new file mode 100644 index 000000000..5927cae10 --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/container_mutex_wrapper.h @@ -0,0 +1,61 @@ +#pragma once + +#include + + +// Very simple wrapper to make a non thread safe container thread safe with the help of a paired mutex. +// Access is grated with a AccessToken type, which grants access to the containers interface. 
+template +class ContainerMutexWrapper +{ + MTX mtx; + T container; + + void lock() { mtx.lock(); } + + void unlock() { mtx.unlock(); } + + T &data() { return container; } + +public: + ContainerMutexWrapper() = default; + ~ContainerMutexWrapper() = default; + + ContainerMutexWrapper(const ContainerMutexWrapper &) = delete; + ContainerMutexWrapper &operator=(const ContainerMutexWrapper &) = delete; + + ContainerMutexWrapper(ContainerMutexWrapper &&) = delete; + ContainerMutexWrapper &operator=(ContainerMutexWrapper &&) = delete; + + class AccessToken + { + ContainerMutexWrapper *parent = nullptr; + + public: + AccessToken() = default; + ~AccessToken() + { + if (parent) + { + parent->unlock(); + } + } + + AccessToken(ContainerMutexWrapper &p) : parent{&p} { parent->lock(); } + + AccessToken(const AccessToken &) = delete; + AccessToken &operator=(const AccessToken &) = delete; + + AccessToken(AccessToken &&other) : parent{other.parent} { other.parent = nullptr; } + AccessToken &operator=(AccessToken &&other) + { + eastl::swap(parent, other.parent); + return *this; + } + + T &operator*() { return parent->data(); } + T *operator->() { return &parent->data(); } + }; + + AccessToken access() { return {*this}; } +}; diff --git a/prog/engine/drv/drv3d_DX12/d3d12_d3d_translation.h b/prog/engine/drv/drv3d_DX12/d3d12_d3d_translation.h new file mode 100644 index 000000000..9ff7ee939 --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/d3d12_d3d_translation.h @@ -0,0 +1,72 @@ +#pragma once + +#include <3d/dag_drv3dConsts.h> +#include <3d/dag_sampler.h> + +#include "driver.h" + + +namespace drv3d_dx12 +{ + +// Stuff in this file translates various flags between DX12 and dagor d3d formats + +inline D3D12_COMPARISON_FUNC translate_compare_func_to_dx12(int cmp) { return static_cast(cmp); } +inline D3D12_STENCIL_OP translate_stencil_op_to_dx12(int so) { return static_cast(so); } +inline D3D12_BLEND translate_alpha_blend_mode_to_dx12(int b) { return static_cast(b); } +inline D3D12_BLEND 
translate_rgb_blend_mode_to_dx12(int b) { return static_cast(b); } +inline D3D12_BLEND_OP translate_blend_op_to_dx12(int bo) { return static_cast(bo); } + +inline D3D12_TEXTURE_ADDRESS_MODE translate_texture_address_mode_to_dx12(int mode) +{ + return static_cast(mode); +} + +inline int translate_texture_address_mode_to_engine(D3D12_TEXTURE_ADDRESS_MODE mode) { return static_cast(mode); } + +inline D3D12_FILTER_TYPE translate_filter_type_to_dx12(int ft) +{ + return (ft == TEXFILTER_POINT || ft == TEXFILTER_NONE) ? D3D12_FILTER_TYPE_POINT : D3D12_FILTER_TYPE_LINEAR; +} + +inline D3D12_FILTER_TYPE translate_mip_filter_type_to_dx12(int ft) +{ + return (ft == TEXMIPMAP_POINT || ft == TEXMIPMAP_NONE) ? D3D12_FILTER_TYPE_POINT : D3D12_FILTER_TYPE_LINEAR; +} + +inline D3D12_PRIMITIVE_TOPOLOGY translate_primitive_topology_to_dx12(int value) +{ + // G_ASSERTF(value < PRIM_TRIFAN, "primitive topology was %u", value); +#if _TARGET_XBOX + if (value == PRIM_QUADLIST) + return D3D_PRIMITIVE_TOPOLOGY_QUADLIST; +#endif + return static_cast(value); +} + +#if !_TARGET_XBOXONE +inline D3D12_SHADING_RATE_COMBINER map_shading_rate_combiner_to_dx12(VariableRateShadingCombiner combiner) +{ + G_STATIC_ASSERT(D3D12_SHADING_RATE_COMBINER_PASSTHROUGH == static_cast(VariableRateShadingCombiner::VRS_PASSTHROUGH)); + G_STATIC_ASSERT(D3D12_SHADING_RATE_COMBINER_OVERRIDE == static_cast(VariableRateShadingCombiner::VRS_OVERRIDE)); + G_STATIC_ASSERT(D3D12_SHADING_RATE_COMBINER_MIN == static_cast(VariableRateShadingCombiner::VRS_MIN)); + G_STATIC_ASSERT(D3D12_SHADING_RATE_COMBINER_MAX == static_cast(VariableRateShadingCombiner::VRS_MAX)); + G_STATIC_ASSERT(D3D12_SHADING_RATE_COMBINER_SUM == static_cast(VariableRateShadingCombiner::VRS_SUM)); + return static_cast(combiner); +} + +inline D3D12_SHADING_RATE make_shading_rate_from_int_values(unsigned x, unsigned y) +{ + G_ASSERTF_RETURN(x <= 4 && y <= 4, D3D12_SHADING_RATE_1X1, "Variable Shading Rate can not exceed 4"); + G_ASSERTF_RETURN(x != 3 && y != 
3, D3D12_SHADING_RATE_1X1, "Variable Shading Rate can not be 3"); + G_ASSERTF_RETURN(abs(int(x / 2) - int(y / 2)) < 2, D3D12_SHADING_RATE_1X1, + "Variable Shading Rate invalid combination of x=%u and y=%u shading rates", x, y); + G_STATIC_ASSERT(D3D12_SHADING_RATE_X_AXIS_SHIFT == 2); + G_STATIC_ASSERT(D3D12_SHADING_RATE_VALID_MASK == 3); + // simple formula (x-rate / 2) << 2 | (y-rage / 2) + // valid range for x and y are 1, 2 and 4 + return static_cast(((x >> 1) << D3D12_SHADING_RATE_X_AXIS_SHIFT) | (y >> 1)); +} +#endif + +} // namespace drv3d_dx12 diff --git a/prog/engine/drv/drv3d_DX12/d3d12_debug_names.h b/prog/engine/drv/drv3d_DX12/d3d12_debug_names.h new file mode 100644 index 000000000..68b0aa43d --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/d3d12_debug_names.h @@ -0,0 +1,45 @@ +#pragma once + +#include +#include + +#include "driver.h" + + +namespace drv3d_dx12 +{ + +inline wchar_t *lazyToWchar(const char *str, wchar_t *buf, size_t max_len) +{ + auto ed = str + max_len - 1; + auto at = buf; + for (; *str && str != ed; ++str, ++at) + *at = *str; + *at = L'\0'; + return buf; +} + +// NOTE: This is intended for debug only, this is possibly slow, so use with care! 
+template +inline char *get_resource_name(ID3D12Resource *res, char (&cbuf)[N]) +{ +#if !_TARGET_XBOXONE + wchar_t wcbuf[N]; + UINT cnt = sizeof(wcbuf); + res->GetPrivateData(WKPDID_D3DDebugObjectNameW, &cnt, wcbuf); + eastl::copy(wcbuf, wcbuf + cnt / sizeof(wchar_t), cbuf); + cbuf[min(cnt, N - 1)] = '\0'; +#else + G_UNUSED(res); + cbuf[0] = 0; +#endif + return cbuf; +} + +#if DX12_DOES_SET_DEBUG_NAMES +#define DX12_SET_DEBUG_OBJ_NAME(obj, name) obj->SetName(name) +#else +#define DX12_SET_DEBUG_OBJ_NAME(obj, name) +#endif + +} // namespace drv3d_dx12 diff --git a/prog/engine/drv/drv3d_DX12/d3d12_error_handling.h b/prog/engine/drv/drv3d_DX12/d3d12_error_handling.h new file mode 100644 index 000000000..d724ab59e --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/d3d12_error_handling.h @@ -0,0 +1,167 @@ +#pragma once + +#include + +#include "driver.h" +#include "d3d12_utils.h" + + +namespace drv3d_dx12 +{ + +#define D3D12_ERROR_INVALID_HOST_EXE_SDK_VERSION _HRESULT_TYPEDEF_(0x887E0003L) + +void report_oom_info(); +void set_last_error(HRESULT error); +HRESULT get_last_error_code(); +inline const char *dxgi_error_code_to_string(HRESULT ec) +{ +#define ENUM_CASE(Name) \ + case Name: return #Name + switch (ec) + { + ENUM_CASE(E_FAIL); // returned by init code if a step fails in a fatal way + ENUM_CASE(DXGI_ERROR_INVALID_CALL); + ENUM_CASE(DXGI_ERROR_NOT_FOUND); + ENUM_CASE(DXGI_ERROR_MORE_DATA); + ENUM_CASE(DXGI_ERROR_UNSUPPORTED); + ENUM_CASE(DXGI_ERROR_DEVICE_REMOVED); + ENUM_CASE(DXGI_ERROR_DEVICE_HUNG); + ENUM_CASE(DXGI_ERROR_DEVICE_RESET); + ENUM_CASE(DXGI_ERROR_WAS_STILL_DRAWING); + ENUM_CASE(DXGI_ERROR_FRAME_STATISTICS_DISJOINT); + ENUM_CASE(DXGI_ERROR_GRAPHICS_VIDPN_SOURCE_IN_USE); + ENUM_CASE(DXGI_ERROR_DRIVER_INTERNAL_ERROR); + ENUM_CASE(DXGI_ERROR_NONEXCLUSIVE); + ENUM_CASE(DXGI_ERROR_NOT_CURRENTLY_AVAILABLE); + ENUM_CASE(DXGI_ERROR_REMOTE_CLIENT_DISCONNECTED); + ENUM_CASE(DXGI_ERROR_REMOTE_OUTOFMEMORY); + ENUM_CASE(DXGI_ERROR_ACCESS_LOST); + 
ENUM_CASE(DXGI_ERROR_WAIT_TIMEOUT); + ENUM_CASE(DXGI_ERROR_SESSION_DISCONNECTED); + ENUM_CASE(DXGI_ERROR_RESTRICT_TO_OUTPUT_STALE); + ENUM_CASE(DXGI_ERROR_CANNOT_PROTECT_CONTENT); + ENUM_CASE(DXGI_ERROR_ACCESS_DENIED); + ENUM_CASE(DXGI_ERROR_NAME_ALREADY_EXISTS); + ENUM_CASE(DXGI_STATUS_UNOCCLUDED); + ENUM_CASE(DXGI_STATUS_DDA_WAS_STILL_DRAWING); + ENUM_CASE(DXGI_ERROR_MODE_CHANGE_IN_PROGRESS); + ENUM_CASE(E_INVALIDARG); + ENUM_CASE(E_OUTOFMEMORY); +#if _TARGET_PC_WIN + ENUM_CASE(D3D12_ERROR_ADAPTER_NOT_FOUND); + ENUM_CASE(D3D12_ERROR_DRIVER_VERSION_MISMATCH); + ENUM_CASE(D3D12_ERROR_INVALID_HOST_EXE_SDK_VERSION); +#endif + } +#undef ENUM_CASE + + return ""; +} + +inline HRESULT dx12_check_result_no_oom_report(HRESULT result, const char *DAGOR_HAS_LOGS(expr), const char *DAGOR_HAS_LOGS(file), + int DAGOR_HAS_LOGS(line)) +{ + if (SUCCEEDED(result)) + return result; + + set_last_error(result); + + auto resultStr = dxgi_error_code_to_string(result); + if ('\0' == resultStr[0]) + { + logerr("%s returned unknown return code %u, %s %u", expr, result, file, line); + } + else + { + logerr("%s returned %s, %s %u", expr, resultStr, file, line); + } + + return result; +} + +inline bool is_oom_error_code(HRESULT result) { return E_OUTOFMEMORY == result; } + +inline HRESULT dx12_check_result(HRESULT result, const char *DAGOR_HAS_LOGS(expr), const char *DAGOR_HAS_LOGS(file), + int DAGOR_HAS_LOGS(line)) +{ + if (SUCCEEDED(result)) + return result; + + if (is_oom_error_code(result)) + { + report_oom_info(); + } + + set_last_error(result); + + auto resultStr = dxgi_error_code_to_string(result); + if ('\0' == resultStr[0]) + { + logerr("%s returned unknown return code %u, %s %u", expr, result, file, line); + } + else + { + logerr("%s returned %s, %s %u", expr, resultStr, file, line); + } + + return result; +} + +inline bool is_recoverable_error(HRESULT error) +{ + switch (error) + { + default: return true; + // any device error is not recoverable + case DXGI_ERROR_DEVICE_REMOVED: + 
case DXGI_ERROR_DEVICE_HUNG: + case DXGI_ERROR_DEVICE_RESET: return false; + } +} + +inline HRESULT dx12_debug_result(HRESULT result, const char *DAGOR_HAS_LOGS(expr), const char *DAGOR_HAS_LOGS(file), + int DAGOR_HAS_LOGS(line)) +{ + if (SUCCEEDED(result)) + return result; + + set_last_error(result); + + auto resultStr = dxgi_error_code_to_string(result); + if ('\0' == resultStr[0]) + { + debug("%s returned unknown return code %u, %s %u", expr, result, file, line); + } + else + { + debug("%s returned %s, %s %u", expr, resultStr, file, line); + } + + return result; +} + +#define DX12_DEBUG_RESULT(r) drv3d_dx12::dx12_debug_result(r, #r, __FILE__, __LINE__) +#define DX12_DEBUG_OK(r) SUCCEEDED(DX12_DEBUG_RESULT(r)) +#define DX12_DEBUG_FAIL(r) FAILED(DX12_DEBUG_RESULT(r)) + +#define DX12_CHECK_RESULT(r) drv3d_dx12::dx12_check_result(r, #r, __FILE__, __LINE__) +#define DX12_CHECK_OK(r) SUCCEEDED(DX12_CHECK_RESULT(r)) +#define DX12_CHECK_FAIL(r) FAILED(DX12_CHECK_RESULT(r)) +#define DX12_EXIT_ON_FAIL(r) \ + if (DX12_CHECK_FAIL(r)) \ + { \ + /* no-op */ \ + } + +#define DX12_CHECK_RESULT_NO_OOM_CHECK(r) drv3d_dx12::dx12_check_result_no_oom_report(r, #r, __FILE__, __LINE__) + +inline void report_resource_alloc_info_error(const D3D12_RESOURCE_DESC &desc) +{ + logerr("DX12: Error while querying resource allocation info, resource desc: %s, %u, %u x %u x " + "%u, %u, %s, %u by %u, %u, %08X", + to_string(desc.Dimension), desc.Alignment, desc.Width, desc.Height, desc.DepthOrArraySize, desc.MipLevels, + dxgi_format_name(desc.Format), desc.SampleDesc.Count, desc.SampleDesc.Quality, desc.Layout, desc.Flags); +} + +} // namespace drv3d_dx12 diff --git a/prog/engine/drv/drv3d_DX12/d3d12_utils.h b/prog/engine/drv/drv3d_DX12/d3d12_utils.h new file mode 100644 index 000000000..e409aa8aa --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/d3d12_utils.h @@ -0,0 +1,208 @@ +#pragma once + +#include + +#include "driver.h" +#include "util.h" + + +inline bool operator==(D3D12_CPU_DESCRIPTOR_HANDLE 
l, D3D12_CPU_DESCRIPTOR_HANDLE r) { return l.ptr == r.ptr; } +inline bool operator!=(D3D12_CPU_DESCRIPTOR_HANDLE l, D3D12_CPU_DESCRIPTOR_HANDLE r) { return !(l == r); } + +inline bool operator==(D3D12_GPU_DESCRIPTOR_HANDLE l, D3D12_GPU_DESCRIPTOR_HANDLE r) { return l.ptr == r.ptr; } +inline bool operator!=(D3D12_GPU_DESCRIPTOR_HANDLE l, D3D12_GPU_DESCRIPTOR_HANDLE r) { return !(l == r); } + +inline D3D12_RECT asRect(const D3D12_VIEWPORT &vp) +{ + D3D12_RECT rect; + rect.left = vp.TopLeftX; + rect.top = vp.TopLeftY; + rect.right = vp.TopLeftX + vp.Width; + rect.bottom = vp.TopLeftY + vp.Height; + return rect; +} +#if !_TARGET_XBOXONE +inline bool operator==(const D3D12_RECT &l, const D3D12_RECT &r) +{ + return l.left == r.left && l.top == r.top && l.right == r.right && l.bottom == r.bottom; +} +inline bool operator!=(const D3D12_RECT &l, const D3D12_RECT &r) { return !(l == r); } +#endif + +inline const char *to_string(D3D12_RESOURCE_DIMENSION dim) +{ + switch (dim) + { + default: return ""; + case D3D12_RESOURCE_DIMENSION_UNKNOWN: return "unknown"; + case D3D12_RESOURCE_DIMENSION_BUFFER: return "buffer"; + case D3D12_RESOURCE_DIMENSION_TEXTURE1D: return "texture 1D"; + case D3D12_RESOURCE_DIMENSION_TEXTURE2D: return "texture 2D"; + case D3D12_RESOURCE_DIMENSION_TEXTURE3D: return "texture 3D"; + } +} + +inline const char *to_string(D3D12_HEAP_TYPE type) +{ + switch (type) + { + case D3D12_HEAP_TYPE_DEFAULT: return "default"; + case D3D12_HEAP_TYPE_UPLOAD: return "upload"; + case D3D12_HEAP_TYPE_READBACK: return "read back"; + case D3D12_HEAP_TYPE_CUSTOM: return "custom"; + } + return "??"; +} + +inline const char *to_string(D3D12_CPU_PAGE_PROPERTY property) +{ + switch (property) + { + case D3D12_CPU_PAGE_PROPERTY_UNKNOWN: return "unknown"; + case D3D12_CPU_PAGE_PROPERTY_NOT_AVAILABLE: return "not available"; + case D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE: return "write combine"; + case D3D12_CPU_PAGE_PROPERTY_WRITE_BACK: return "write back"; + } + return "??"; +} + 
+inline const char *to_string(D3D12_MEMORY_POOL pool) +{ + switch (pool) + { + case D3D12_MEMORY_POOL_UNKNOWN: return "unknown"; + case D3D12_MEMORY_POOL_L0: return "L0"; + case D3D12_MEMORY_POOL_L1: return "L1"; + } + return "??"; +} + +inline D3D12_GPU_DESCRIPTOR_HANDLE operator+(D3D12_GPU_DESCRIPTOR_HANDLE l, uint64_t r) { return {l.ptr + r}; } +inline D3D12_CPU_DESCRIPTOR_HANDLE operator+(D3D12_CPU_DESCRIPTOR_HANDLE l, size_t r) { return {l.ptr + r}; } + +template +inline char *resource_state_mask_as_string(D3D12_RESOURCE_STATES mask, char (&cbuf)[N]) +{ + auto at = cbuf; + auto ed = cbuf + N - 1; + if (mask == D3D12_RESOURCE_STATE_COMMON) + { + at = append_literal(at, ed, "COMMON"); + } + else + { +#define CHECK_MASK(name) \ + if (D3D12_RESOURCE_STATE_##name == (mask & D3D12_RESOURCE_STATE_##name)) \ + { \ + at = append_or_mask_value_name(cbuf, at, ed, #name); \ + mask ^= D3D12_RESOURCE_STATE_##name; \ + } + // combined state, has to be first + CHECK_MASK(GENERIC_READ) + // single state + CHECK_MASK(VERTEX_AND_CONSTANT_BUFFER) + CHECK_MASK(INDEX_BUFFER) + CHECK_MASK(RENDER_TARGET) + CHECK_MASK(UNORDERED_ACCESS) + CHECK_MASK(DEPTH_WRITE) + CHECK_MASK(DEPTH_READ) + CHECK_MASK(NON_PIXEL_SHADER_RESOURCE) + CHECK_MASK(PIXEL_SHADER_RESOURCE) + CHECK_MASK(STREAM_OUT) + CHECK_MASK(INDIRECT_ARGUMENT) + CHECK_MASK(COPY_DEST) + CHECK_MASK(COPY_SOURCE) + CHECK_MASK(RESOLVE_DEST) + CHECK_MASK(RESOLVE_SOURCE) +#if !_TARGET_XBOXONE + CHECK_MASK(RAYTRACING_ACCELERATION_STRUCTURE) + CHECK_MASK(SHADING_RATE_SOURCE) +#endif + CHECK_MASK(PREDICATION) + CHECK_MASK(VIDEO_DECODE_READ) + CHECK_MASK(VIDEO_DECODE_WRITE) + CHECK_MASK(VIDEO_PROCESS_READ) + CHECK_MASK(VIDEO_PROCESS_WRITE) + CHECK_MASK(VIDEO_ENCODE_READ) + CHECK_MASK(VIDEO_ENCODE_WRITE) +#undef CHECK_MASK + } + *at = '\0'; + return cbuf; +} + +inline D3D12_PRIMITIVE_TOPOLOGY pimitive_type_to_primtive_topology(D3D_PRIMITIVE pt, D3D12_PRIMITIVE_TOPOLOGY initial) +{ + if (pt >= D3D_PRIMITIVE_1_CONTROL_POINT_PATCH && pt <= 
D3D_PRIMITIVE_32_CONTROL_POINT_PATCH) + return static_cast( + D3D_PRIMITIVE_TOPOLOGY_1_CONTROL_POINT_PATCHLIST + pt - D3D_PRIMITIVE_1_CONTROL_POINT_PATCH); + return initial; +} + +inline D3D12_PRIMITIVE_TOPOLOGY_TYPE topology_to_topology_type(D3D12_PRIMITIVE_TOPOLOGY top) +{ + if (D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST <= top && D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP_ADJ >= top) + return D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; + if (D3D_PRIMITIVE_TOPOLOGY_POINTLIST == top) + return D3D12_PRIMITIVE_TOPOLOGY_TYPE_POINT; + if (D3D_PRIMITIVE_TOPOLOGY_LINELIST == top || D3D_PRIMITIVE_TOPOLOGY_LINESTRIP == top) + return D3D12_PRIMITIVE_TOPOLOGY_TYPE_LINE; + if (D3D_PRIMITIVE_TOPOLOGY_UNDEFINED == top) + return D3D12_PRIMITIVE_TOPOLOGY_TYPE_UNDEFINED; +#if _TARGET_XBOX + if (D3D_PRIMITIVE_TOPOLOGY_QUADLIST == top) + return PRIMITIVE_TOPOLOGY_TYPE_QUAD; +#endif + return D3D12_PRIMITIVE_TOPOLOGY_TYPE_PATCH; +} + +#if _TARGET_PC_WIN +inline DXGI_QUERY_VIDEO_MEMORY_INFO max(const DXGI_QUERY_VIDEO_MEMORY_INFO &l, const DXGI_QUERY_VIDEO_MEMORY_INFO &r) +{ + DXGI_QUERY_VIDEO_MEMORY_INFO result; + result.Budget = max(l.Budget, r.Budget); + result.CurrentUsage = max(l.CurrentUsage, r.CurrentUsage); + result.AvailableForReservation = max(l.AvailableForReservation, r.AvailableForReservation); + result.CurrentReservation = max(l.CurrentReservation, r.CurrentReservation); + return result; +} +#endif + +inline bool is_valid_allocation_info(const D3D12_RESOURCE_ALLOCATION_INFO &info) +{ + // On error DX12 returns ~0 in the SizeInBytes member. 
+ return 0 != ~info.SizeInBytes; +} + +inline uint64_t get_next_resource_alignment(uint64_t alignment, uint32_t samples) +{ + if (D3D12_SMALL_RESOURCE_PLACEMENT_ALIGNMENT == alignment) + { + return D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT; + } + if (samples > 1 && (D3D12_SMALL_MSAA_RESOURCE_PLACEMENT_ALIGNMENT == alignment)) + { + return D3D12_DEFAULT_MSAA_RESOURCE_PLACEMENT_ALIGNMENT; + } + + return alignment; +} + +// NOTE: may adjust desc.Alignment if the requested alignment could not be used +inline D3D12_RESOURCE_ALLOCATION_INFO get_resource_allocation_info(ID3D12Device *device, D3D12_RESOURCE_DESC &desc) +{ + G_ASSERTF(desc.Alignment != 0, "DX12: desc.Alignment should not be 0!"); + auto result = device->GetResourceAllocationInfo(0, 1, &desc); + if (!is_valid_allocation_info(result)) + { + auto nextAlignment = get_next_resource_alignment(desc.Alignment, desc.SampleDesc.Count); + if (nextAlignment != desc.Alignment) + { + desc.Alignment = nextAlignment; + result = device->GetResourceAllocationInfo(0, 1, &desc); + } + } + return result; +} + +const char *dxgi_format_name(DXGI_FORMAT fmt); diff --git a/prog/engine/drv/drv3d_DX12/d3d_cap_set_xmacro.h b/prog/engine/drv/drv3d_DX12/d3d_cap_set_xmacro.h new file mode 100644 index 000000000..6516e11ea --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/d3d_cap_set_xmacro.h @@ -0,0 +1,70 @@ +#pragma once + +// used for de and encoding into blk's +#define DX12_D3D_CAP_SET \ + DX12_D3D_CAP(hasAnisotropicFilter); \ + DX12_D3D_CAP(hasDepthReadOnly); \ + DX12_D3D_CAP(hasStructuredBuffers); \ + DX12_D3D_CAP(hasNoOverwriteOnShaderResourceBuffers); \ + DX12_D3D_CAP(hasForcedSamplerCount); \ + DX12_D3D_CAP(hasVolMipMap); \ + DX12_D3D_CAP(hasAsyncCompute); \ + DX12_D3D_CAP(hasOcclusionQuery); \ + DX12_D3D_CAP(hasConstBufferOffset); \ + DX12_D3D_CAP(hasDepthBoundsTest); \ + DX12_D3D_CAP(hasConditionalRender); \ + DX12_D3D_CAP(hasResourceCopyConversion); \ + DX12_D3D_CAP(hasAsyncCopy); \ + DX12_D3D_CAP(hasReadMultisampledDepth); \ + 
DX12_D3D_CAP(hasInstanceID); \ + DX12_D3D_CAP(hasConservativeRassterization); \ + DX12_D3D_CAP(hasQuadTessellation); \ + DX12_D3D_CAP(hasGather4); \ + DX12_D3D_CAP(hasAlphaCoverage); \ + DX12_D3D_CAP(hasWellSupportedIndirect); \ + DX12_D3D_CAP(hasRaytracing); \ + DX12_D3D_CAP(hasRaytracingT11); \ + DX12_D3D_CAP(hasBindless); \ + DX12_D3D_CAP(hasNVApi); \ + DX12_D3D_CAP(hasATIApi); \ + DX12_D3D_CAP(hasVariableRateShading); \ + DX12_D3D_CAP(hasVariableRateShadingTexture); \ + DX12_D3D_CAP(hasVariableRateShadingShaderOutput); \ + DX12_D3D_CAP(hasVariableRateShadingCombiners); \ + DX12_D3D_CAP(hasVariableRateShadingBy4); \ + DX12_D3D_CAP(hasAliasedTextures); \ + DX12_D3D_CAP(hasResourceHeaps); \ + DX12_D3D_CAP(hasBufferOverlapCopy); \ + DX12_D3D_CAP(hasBufferOverlapRegionsCopy); \ + DX12_D3D_CAP(hasUAVOnlyForcedSampleCount); \ + DX12_D3D_CAP(hasShader64BitIntegerResources); \ + DX12_D3D_CAP(hasNativeRenderPassSubPasses); \ + DX12_D3D_CAP(hasTiled2DResources); \ + DX12_D3D_CAP(hasTiled3DResources); \ + DX12_D3D_CAP(hasTiledSafeResourcesAccess); \ + DX12_D3D_CAP(hasTiledMemoryAliasing); \ + DX12_D3D_CAP(hasDLSS); \ + DX12_D3D_CAP(hasXESS); \ + DX12_D3D_CAP(hasDrawID); \ + DX12_D3D_CAP(hasMeshShader); \ + DX12_D3D_CAP(hasBasicViewInstancing); \ + DX12_D3D_CAP(hasOptimizedViewInstancing); \ + DX12_D3D_CAP(hasAcceleratedViewInstancing); \ + DX12_D3D_CAP(hasRenderPassDepthResolve); \ + DX12_D3D_CAP(hasStereoExpansion); \ + DX12_D3D_CAP(hasTileBasedArchitecture); \ + DX12_D3D_CAP(hasLazyMemory); \ + DX12_D3D_CAP(hasIndirectSupport); \ + DX12_D3D_CAP(hasCompareSampler); + +#define DX12_D3D_CAP_SET_RELEVANT_FOR_PIPELINES \ + DX12_D3D_CAP(hasRaytracing); \ + DX12_D3D_CAP(hasRaytracingT11); \ + DX12_D3D_CAP(hasVariableRateShadingShaderOutput); \ + DX12_D3D_CAP(hasVariableRateShadingBy4); \ + DX12_D3D_CAP(hasShader64BitIntegerResources); \ + DX12_D3D_CAP(hasTiled2DResources); \ + DX12_D3D_CAP(hasTiled3DResources); \ + DX12_D3D_CAP(hasMeshShader); \ + 
DX12_D3D_CAP(hasBasicViewInstancing); \ + DX12_D3D_CAP(hasShaderFloat16Support); diff --git a/prog/engine/drv/drv3d_DX12/d3dformat.h b/prog/engine/drv/drv3d_DX12/d3dformat.h index c16607940..53bfc9fd5 100644 --- a/prog/engine/drv/drv3d_DX12/d3dformat.h +++ b/prog/engine/drv/drv3d_DX12/d3dformat.h @@ -1,5 +1,7 @@ #pragma once +#include "driver.h" // MAKEFOURCC is defined somewhere there + /* Formats * Most of these names have the following convention: * A = Alpha diff --git a/prog/engine/drv/drv3d_DX12/debug/break_point.h b/prog/engine/drv/drv3d_DX12/debug/break_point.h index 01f440b1a..a78f87de6 100644 --- a/prog/engine/drv/drv3d_DX12/debug/break_point.h +++ b/prog/engine/drv/drv3d_DX12/debug/break_point.h @@ -1,7 +1,9 @@ #pragma once +#include #include "call_stack.h" + namespace drv3d_dx12::debug::break_point { namespace core diff --git a/prog/engine/drv/drv3d_DX12/debug/call_stack.h b/prog/engine/drv/drv3d_DX12/debug/call_stack.h index 5cfc8788a..0215c9ab1 100644 --- a/prog/engine/drv/drv3d_DX12/debug/call_stack.h +++ b/prog/engine/drv/drv3d_DX12/debug/call_stack.h @@ -1,10 +1,12 @@ #pragma once +#include "driver.h" #include "call_stack_null.h" #include "call_stack_return_address.h" #include "call_stack_full_stack.h" #include "call_stack_selectable.h" + namespace drv3d_dx12 { namespace debug diff --git a/prog/engine/drv/drv3d_DX12/debug/call_stack_full_stack.h b/prog/engine/drv/drv3d_DX12/debug/call_stack_full_stack.h index b172344fd..6a1aa9db7 100644 --- a/prog/engine/drv/drv3d_DX12/debug/call_stack_full_stack.h +++ b/prog/engine/drv/drv3d_DX12/debug/call_stack_full_stack.h @@ -1,9 +1,15 @@ #pragma once #include -#include -#include -#include +#include +#include +#include +#include +#include +#include + + +class DataBlock; namespace drv3d_dx12 { @@ -13,7 +19,7 @@ namespace call_stack { namespace full_stack { -static constexpr uint32_t max_call_stack_depth = 32; +inline constexpr uint32_t max_call_stack_depth = 32; using CallStack = eastl::array; struct 
CallStackHasher diff --git a/prog/engine/drv/drv3d_DX12/debug/call_stack_null.h b/prog/engine/drv/drv3d_DX12/debug/call_stack_null.h index 42d494a9a..ccf81e753 100644 --- a/prog/engine/drv/drv3d_DX12/debug/call_stack_null.h +++ b/prog/engine/drv/drv3d_DX12/debug/call_stack_null.h @@ -1,5 +1,10 @@ #pragma once +#include +#include +#include + + namespace drv3d_dx12 { namespace debug diff --git a/prog/engine/drv/drv3d_DX12/debug/call_stack_return_address.h b/prog/engine/drv/drv3d_DX12/debug/call_stack_return_address.h index 808802d85..8551483d8 100644 --- a/prog/engine/drv/drv3d_DX12/debug/call_stack_return_address.h +++ b/prog/engine/drv/drv3d_DX12/debug/call_stack_return_address.h @@ -1,5 +1,10 @@ #pragma once +#include +#include +#include + + #if COMMANDS_STORE_RETURN_ADDRESS #include diff --git a/prog/engine/drv/drv3d_DX12/debug/command_list_storage.h b/prog/engine/drv/drv3d_DX12/debug/command_list_storage.h index d28f9822c..be5602afb 100644 --- a/prog/engine/drv/drv3d_DX12/debug/command_list_storage.h +++ b/prog/engine/drv/drv3d_DX12/debug/command_list_storage.h @@ -1,5 +1,10 @@ #pragma once +#include + +#include "driver.h" + + namespace drv3d_dx12::debug { // Possible improvement: diff --git a/prog/engine/drv/drv3d_DX12/debug/command_list_trace.h b/prog/engine/drv/drv3d_DX12/debug/command_list_trace.h index 6d280525d..d4a03bbf4 100644 --- a/prog/engine/drv/drv3d_DX12/debug/command_list_trace.h +++ b/prog/engine/drv/drv3d_DX12/debug/command_list_trace.h @@ -1,6 +1,13 @@ #pragma once +#include +#include + +#include "driver.h" #include "pipeline_resource_reporter.h" +#include "call_stack.h" +#include "pipeline.h" + inline const char *to_string(D3D12_AUTO_BREADCRUMB_OP op) { diff --git a/prog/engine/drv/drv3d_DX12/debug/command_list_trace_recorder.h b/prog/engine/drv/drv3d_DX12/debug/command_list_trace_recorder.h index 50f5cf822..181af18af 100644 --- a/prog/engine/drv/drv3d_DX12/debug/command_list_trace_recorder.h +++ 
b/prog/engine/drv/drv3d_DX12/debug/command_list_trace_recorder.h @@ -1,5 +1,11 @@ #pragma once +#include + +#include "driver.h" +#include "winapi_helpers.h" + + namespace drv3d_dx12::debug { class CommandListTraceRecorder diff --git a/prog/engine/drv/drv3d_DX12/debug/configuration.h b/prog/engine/drv/drv3d_DX12/debug/configuration.h index 88d7f7efc..4745311d7 100644 --- a/prog/engine/drv/drv3d_DX12/debug/configuration.h +++ b/prog/engine/drv/drv3d_DX12/debug/configuration.h @@ -1,5 +1,9 @@ #pragma once +#include +#include + + namespace drv3d_dx12::debug { union Configuration diff --git a/prog/engine/drv/drv3d_DX12/debug/device_state_pc.cpp b/prog/engine/drv/drv3d_DX12/debug/device_state_pc.cpp index 7d88f1819..9ab956e93 100644 --- a/prog/engine/drv/drv3d_DX12/debug/device_state_pc.cpp +++ b/prog/engine/drv/drv3d_DX12/debug/device_state_pc.cpp @@ -89,7 +89,7 @@ bool debug::pc::DeviceState::setup(debug::GlobalState &global, ID3D12Device *dev auto denyId = get_ignored_validation_messages(*::dgs_get_settings()->getBlockByNameEx("dx12")->getBlockByNameEx("debug")); defaultFilter.DenyList.pSeverityList = denySeverity; - defaultFilter.DenyList.NumSeverities = static_cast(array_size(denySeverity)); + defaultFilter.DenyList.NumSeverities = static_cast(countof(denySeverity)); defaultFilter.DenyList.pIDList = denyId.data(); defaultFilter.DenyList.NumIDs = static_cast(denyId.size()); debugQueue->AddRetrievalFilterEntries(&defaultFilter); diff --git a/prog/engine/drv/drv3d_DX12/debug/event_marker_tracker.h b/prog/engine/drv/drv3d_DX12/debug/event_marker_tracker.h index 89422d0c4..e915db5e4 100644 --- a/prog/engine/drv/drv3d_DX12/debug/event_marker_tracker.h +++ b/prog/engine/drv/drv3d_DX12/debug/event_marker_tracker.h @@ -1,6 +1,8 @@ #pragma once #include +#include + namespace drv3d_dx12::debug::event_marker { diff --git a/prog/engine/drv/drv3d_DX12/debug/gpu_capture.h b/prog/engine/drv/drv3d_DX12/debug/gpu_capture.h index 76ac63103..0eb7e6ebe 100644 --- 
a/prog/engine/drv/drv3d_DX12/debug/gpu_capture.h +++ b/prog/engine/drv/drv3d_DX12/debug/gpu_capture.h @@ -1,8 +1,13 @@ #pragma once #include +#include +#include +#include "driver.h" #include "configuration.h" +#include "winapi_helpers.h" + struct RENDERDOC_API_1_5_0; @@ -12,6 +17,11 @@ interface DECLSPEC_UUID("9f251514-9d4d-4902-9d60-18988ab7d4b5") DECLSPEC_NOVTABL STDMETHOD_(void, EndCapture)() PURE; }; +namespace drv3d_dx12 +{ +struct Direct3D12Enviroment; +} + namespace drv3d_dx12 { namespace debug diff --git a/prog/engine/drv/drv3d_DX12/debug/gpu_postmortem_dagor_trace.h b/prog/engine/drv/drv3d_DX12/debug/gpu_postmortem_dagor_trace.h index dfc821a51..ffba8dd93 100644 --- a/prog/engine/drv/drv3d_DX12/debug/gpu_postmortem_dagor_trace.h +++ b/prog/engine/drv/drv3d_DX12/debug/gpu_postmortem_dagor_trace.h @@ -1,9 +1,21 @@ #pragma once +#include + #include "command_list_storage.h" #include "command_list_trace.h" #include "command_list_trace_recorder.h" + +namespace drv3d_dx12 +{ +struct Direct3D12Enviroment; +namespace debug +{ +union Configuration; +} +} // namespace drv3d_dx12 + namespace drv3d_dx12::debug::gpu_postmortem::dagor { class Trace diff --git a/prog/engine/drv/drv3d_DX12/debug/gpu_postmortem_microsoft_dred.h b/prog/engine/drv/drv3d_DX12/debug/gpu_postmortem_microsoft_dred.h index 812cbe4b1..08bad478d 100644 --- a/prog/engine/drv/drv3d_DX12/debug/gpu_postmortem_microsoft_dred.h +++ b/prog/engine/drv/drv3d_DX12/debug/gpu_postmortem_microsoft_dred.h @@ -4,6 +4,16 @@ #include "command_list_trace.h" #include "command_list_trace_recorder.h" + +namespace drv3d_dx12 +{ +struct Direct3D12Enviroment; +namespace debug +{ +union Configuration; +} +} // namespace drv3d_dx12 + inline const char *to_string(D3D12_DRED_ALLOCATION_TYPE type) { switch (type) diff --git a/prog/engine/drv/drv3d_DX12/debug/gpu_postmortem_null_trace.h b/prog/engine/drv/drv3d_DX12/debug/gpu_postmortem_null_trace.h index 277ea4fdd..ac541ad22 100644 --- 
a/prog/engine/drv/drv3d_DX12/debug/gpu_postmortem_null_trace.h +++ b/prog/engine/drv/drv3d_DX12/debug/gpu_postmortem_null_trace.h @@ -3,11 +3,20 @@ #include "call_stack.h" #include +#include #include -#include "command_list_storage.h" -#include "command_list_trace_recorder.h" -#include "gpu_postmortem_null_trace.h" +#include "pipeline.h" + + +namespace drv3d_dx12 +{ +struct Direct3D12Enviroment; +namespace debug +{ +union Configuration; +} +} // namespace drv3d_dx12 namespace drv3d_dx12::debug::gpu_postmortem { diff --git a/prog/engine/drv/drv3d_DX12/debug/gpu_postmortem_nvidia_aftermath.h b/prog/engine/drv/drv3d_DX12/debug/gpu_postmortem_nvidia_aftermath.h index 3a2aab11c..b4080d6c2 100644 --- a/prog/engine/drv/drv3d_DX12/debug/gpu_postmortem_nvidia_aftermath.h +++ b/prog/engine/drv/drv3d_DX12/debug/gpu_postmortem_nvidia_aftermath.h @@ -1,9 +1,26 @@ #pragma once +#include +#include + +#include "driver.h" +#include "pipeline.h" +#include "winapi_helpers.h" +#include "command_list_storage.h" +#include "call_stack.h" +#include "configuration.h" + +// These headers are not self-contained and need to be included after driver.h #include #include #include + +namespace drv3d_dx12 +{ +struct Direct3D12Enviroment; +} + namespace drv3d_dx12::debug::gpu_postmortem::nvidia { class Aftermath diff --git a/prog/engine/drv/drv3d_DX12/debug/pipeline_resource_reporter.h b/prog/engine/drv/drv3d_DX12/debug/pipeline_resource_reporter.h index 88239c72f..70d042481 100644 --- a/prog/engine/drv/drv3d_DX12/debug/pipeline_resource_reporter.h +++ b/prog/engine/drv/drv3d_DX12/debug/pipeline_resource_reporter.h @@ -1,5 +1,12 @@ #pragma once +namespace drv3d_dx12 +{ +struct PipelineStageStateBase; +class BasePipeline; +class ComputePipeline; +} // namespace drv3d_dx12 + namespace drv3d_dx12::debug { void report_resources(const PipelineStageStateBase &state, ComputePipeline *pipe); diff --git a/prog/engine/drv/drv3d_DX12/derived_span.h b/prog/engine/drv/drv3d_DX12/derived_span.h new file mode 
100644 index 000000000..cc24509bf --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/derived_span.h @@ -0,0 +1,72 @@ +#pragma once + +#include +#include + + +template +class DerivedSpan +{ + using BytePointerType = typename eastl::conditional::value, const uint8_t *, uint8_t *>::type; + BytePointerType uBase = nullptr; + size_t uSize = 0; + size_t uCount = 0; + BytePointerType atIndex(size_t i) const { return &uBase[i * uSize]; } + +public: + DerivedSpan() = default; + DerivedSpan(const DerivedSpan &) = default; + template + DerivedSpan(U *u_base, size_t u_count) : uBase{reinterpret_cast(u_base)}, uSize{sizeof(U)}, uCount{u_count} + { + static_assert(eastl::is_base_of::value, "U is invalid type"); + } + template + DerivedSpan(const eastl::vector &u_base) : DerivedSpan{u_base.data(), u_base.size()} + {} + class Iterator + { + BytePointerType uBase = nullptr; + size_t uSize = 0; + + public: + Iterator() = default; + Iterator(const Iterator &) = default; + Iterator(BytePointerType u_base, size_t u_size) : uBase{u_base}, uSize{u_size} {} + + friend bool operator==(const Iterator &l, const Iterator &r) { return l.uBase == r.uBase; } + friend bool operator!=(const Iterator &l, const Iterator &r) { return !(l == r); } + + Iterator &operator++() + { + uBase += uSize; + return *this; + } + Iterator operator++(int) const + { + auto other = *this; + ++other; + return other; + } + + Iterator &operator--() + { + uBase -= uSize; + return *this; + } + Iterator operator--(int) const + { + auto other = *this; + --other; + return other; + } + T &operator*() const { return *reinterpret_cast(uBase); } + }; + + Iterator begin() const { return {uBase, uSize}; } + Iterator cbegin() const { return begin(); } + Iterator end() const { return {atIndex(uCount), uSize}; } + Iterator cend() const { return end(); } + size_t size() const { return uCount; } + T *data() const { return reinterpret_cast(uBase); } +}; diff --git a/prog/engine/drv/drv3d_DX12/descriptor_heap.h 
b/prog/engine/drv/drv3d_DX12/descriptor_heap.h index 0d20448dd..f23597c74 100644 --- a/prog/engine/drv/drv3d_DX12/descriptor_heap.h +++ b/prog/engine/drv/drv3d_DX12/descriptor_heap.h @@ -1,5 +1,15 @@ #pragma once +#include +#include +#include + +#include "driver.h" +#include "d3d12_error_handling.h" +#include "d3d12_utils.h" +#include "value_range.h" + + namespace drv3d_dx12 { template diff --git a/prog/engine/drv/drv3d_DX12/device.cpp b/prog/engine/drv/drv3d_DX12/device.cpp index afaa68796..13aba1c5a 100644 --- a/prog/engine/drv/drv3d_DX12/device.cpp +++ b/prog/engine/drv/drv3d_DX12/device.cpp @@ -238,6 +238,21 @@ uint64_t drv3d_dx12::calculate_texture_staging_buffer_size(Extent3D size, MipMap return totalSize; } +bool Device::isSamplesCountSupported(DXGI_FORMAT format, int32_t samples_count) +{ + if (samples_count == 1) + return true; + + D3D12_FEATURE_DATA_MULTISAMPLE_QUALITY_LEVELS multisampleQualityLevels{}; + multisampleQualityLevels.Format = format; + multisampleQualityLevels.SampleCount = samples_count; + + auto result = device->CheckFeatureSupport(D3D12_FEATURE_MULTISAMPLE_QUALITY_LEVELS, // + &multisampleQualityLevels, sizeof(multisampleQualityLevels)); + + return SUCCEEDED(result) && multisampleQualityLevels.NumQualityLevels > 0; +} + D3D12_CPU_DESCRIPTOR_HANDLE Device::getNonRecentImageViews(Image *img, ImageViewState state) { auto iter = eastl::find_if(begin(img->getOldViews()), end(img->getOldViews()), @@ -259,7 +274,7 @@ D3D12_CPU_DESCRIPTOR_HANDLE Device::getNonRecentImageViews(Image *img, ImageView viewInfo.state = state; if (state.isSRV()) { - D3D12_SHADER_RESOURCE_VIEW_DESC desc = state.asSRVDesc(img->getType()); + D3D12_SHADER_RESOURCE_VIEW_DESC desc = state.asSRVDesc(img->getType(), img->isMultisampled()); viewInfo.handle = resources.allocateTextureSRVDescriptor(device.get()); device->CreateShaderResourceView(img->getHandle(), &desc, viewInfo.handle); } @@ -271,13 +286,13 @@ D3D12_CPU_DESCRIPTOR_HANDLE Device::getNonRecentImageViews(Image *img, 
ImageView } else if (state.isRTV()) { - D3D12_RENDER_TARGET_VIEW_DESC desc = state.asRTVDesc(img->getType()); + D3D12_RENDER_TARGET_VIEW_DESC desc = state.asRTVDesc(img->getType(), img->isMultisampled()); viewInfo.handle = resources.allocateTextureRTVDescriptor(device.get()); device->CreateRenderTargetView(img->getHandle(), &desc, viewInfo.handle); } else if (state.isDSV()) { - D3D12_DEPTH_STENCIL_VIEW_DESC desc = state.asDSVDesc(img->getType()); + D3D12_DEPTH_STENCIL_VIEW_DESC desc = state.asDSVDesc(img->getType(), img->isMultisampled()); viewInfo.handle = resources.allocateTextureDSVDescriptor(device.get()); device->CreateDepthStencilView(img->getHandle(), &desc, viewInfo.handle); } @@ -695,8 +710,8 @@ void Device::shutdown(const DeviceCapsAndShaderModel &features) // smNone is a indicator for default constructed, eg on error case if (features.shaderModel != d3d::smNone) { - pipelineCacheSetup.generateBlks = dxBlock->getBool("generateCacheBlks", false); - pipelineCacheSetup.alwaysGenerateBlks = dxBlock->getBool("alwaysGenerateCacheBlks", false); + pipelineCacheSetup.generateBlks = dxBlock->getBool("generateCacheBlks", pipeMan.needToUpdateCache); + pipelineCacheSetup.alwaysGenerateBlks = dxBlock->getBool("alwaysGenerateCacheBlks", pipeMan.needToUpdateCache); } else { @@ -735,7 +750,6 @@ void Device::adjustCaps(Driver3dDesc &capabilities) #if _TARGET_PC_WIN - capabilities.shaderModel = 6.0_sm; capabilities.caps.hasDepthReadOnly = true; capabilities.caps.hasStructuredBuffers = true; capabilities.caps.hasNoOverwriteOnShaderResourceBuffers = true; @@ -761,8 +775,7 @@ void Device::adjustCaps(Driver3dDesc &capabilities) capabilities.caps.hasUAVOnlyForcedSampleCount = true; capabilities.caps.hasNativeRenderPassSubPasses = false; capabilities.caps.hasDrawID = true; - capabilities.caps.hasRenderPassDepthResolve = false; - capabilities.caps.hasShaderFloat16Support = false; + capabilities.caps.hasRenderPassDepthResolve = true; #if HAS_NVAPI // This is a bloody workaround for 
broken terrain tessellation. @@ -775,19 +788,20 @@ void Device::adjustCaps(Driver3dDesc &capabilities) } #endif - static constexpr D3D_SHADER_MODEL latestShaderModelWeSupport = D3D_SHADER_MODEL_6_6; - auto op0 = checkFeatureSupport(); auto op1 = checkFeatureSupport(); // auto op2 = checkFeatureSupport(); auto op3 = checkFeatureSupport(); - // auto op4 = checkFeatureSupport(); + auto op4 = checkFeatureSupport(); auto op5 = checkFeatureSupport(); auto op6 = checkFeatureSupport(); auto op7 = checkFeatureSupport(); // auto op8 = checkFeatureSupport(); auto op9 = checkFeatureSupport(); - auto sm = checkFeatureSupport(latestShaderModelWeSupport); + auto sm = checkFeatureSupport(shader_model_to_dx(d3d::smMax)); + + capabilities.shaderModel = shader_model_from_dx(sm.HighestShaderModel); + debug("DX12: GPU has support for Shader Model %u.%u", capabilities.shaderModel.major, capabilities.shaderModel.minor); capabilities.caps.hasConservativeRassterization = D3D12_CONSERVATIVE_RASTERIZATION_TIER_NOT_SUPPORTED != op0.ConservativeRasterizationTier; @@ -799,6 +813,7 @@ void Device::adjustCaps(Driver3dDesc &capabilities) // We don't support the limited max 128 SRVs per stage tier 1 supports. 
// Xbox always supports full heap of SRVs capabilities.caps.hasBindless = D3D12_RESOURCE_BINDING_TIER_2 <= op0.ResourceBindingTier; + if (config.features.test(DeviceFeaturesConfig::DISABLE_BINDLESS)) { capabilities.caps.hasBindless = false; @@ -811,12 +826,13 @@ void Device::adjustCaps(Driver3dDesc &capabilities) capabilities.caps.hasOptimizedViewInstancing = D3D12_VIEW_INSTANCING_TIER_2 <= op3.ViewInstancingTier; capabilities.caps.hasAcceleratedViewInstancing = D3D12_VIEW_INSTANCING_TIER_3 <= op3.ViewInstancingTier; + capabilities.caps.hasShaderFloat16Support = op4.Native16BitShaderOpsSupported; + caps.set(Caps::RAY_TRACING, D3D12_RAYTRACING_TIER_1_0 <= op5.RaytracingTier); capabilities.caps.hasRaytracing = D3D12_RAYTRACING_TIER_1_0 <= op5.RaytracingTier; caps.set(Caps::RAY_TRACING_T1_1, D3D12_RAYTRACING_TIER_1_1 <= op5.RaytracingTier); - capabilities.caps.hasRaytracingT11 = - D3D12_RAYTRACING_TIER_1_1 <= op5.RaytracingTier && D3D_SHADER_MODEL_6_5 <= sm.HighestShaderModel; + capabilities.caps.hasRaytracingT11 = D3D12_RAYTRACING_TIER_1_1 <= op5.RaytracingTier && (capabilities.shaderModel >= 6.5_sm); capabilities.caps.hasVariableRateShading = D3D12_VARIABLE_SHADING_RATE_TIER_1 <= op6.VariableShadingRateTier; caps.set(Caps::SHADING_RATE_T1, D3D12_VARIABLE_SHADING_RATE_TIER_1 <= op6.VariableShadingRateTier); @@ -832,12 +848,8 @@ void Device::adjustCaps(Driver3dDesc &capabilities) capabilities.caps.hasMeshShader = D3D12_MESH_SHADER_TIER_1 <= op7.MeshShaderTier; - if (D3D_SHADER_MODEL_6_6 <= sm.HighestShaderModel) - { - debug("GPU has support for Shader Model 6.6"); - capabilities.caps.hasShader64BitIntegerResources = FALSE != op9.AtomicInt64OnTypedResourceSupported; - capabilities.shaderModel = 6.6_sm; - } + capabilities.caps.hasShader64BitIntegerResources = + (FALSE != op9.AtomicInt64OnTypedResourceSupported) && (capabilities.shaderModel >= 6.6_sm); capabilities.caps.hasDLSS = getContext().getDlssState() >= DlssState::SUPPORTED; capabilities.caps.hasXESS = 
getContext().getXessState() >= XessState::SUPPORTED; @@ -942,6 +954,7 @@ Texture *Device::wrapD3DTex(ID3D12Resource *tex_res, ResourceBarrier current_sta BaseTex *tex = newTextureObject(layers.count() > 1 ? RES3D_ARRTEX : RES3D_TEX, flg); tex->tex.image = image; + tex->tex.image->setMultisampled(tex->isMultisampled()); auto &ext = image->getBaseExtent(); tex->setParams(ext.width, ext.height, layers.count() > 1 ? layers.count() : ext.depth, image->getMipLevelRange().count(), name); diff --git a/prog/engine/drv/drv3d_DX12/device.h b/prog/engine/drv/drv3d_DX12/device.h index 9c1b1997f..244e9345c 100644 --- a/prog/engine/drv/drv3d_DX12/device.h +++ b/prog/engine/drv/drv3d_DX12/device.h @@ -15,6 +15,7 @@ #include "bindless.h" #include "device_context.h" #include "query_manager.h" +#include "tagged_handles.h" #include "pipeline/blk_cache.h" #include @@ -633,6 +634,7 @@ class Device : public DeviceErrroState, public DeviceErrorObserver, prot HRESULT findClosestMatchingMode(DXGI_MODE_DESC *out_desc); #endif + bool isSamplesCountSupported(DXGI_FORMAT format, int32_t samples_count); D3D12_FEATURE_DATA_FORMAT_SUPPORT getFormatFeatures(FormatStore fmt); ImageGlobalSubresouceId getSwapchainColorGlobalId() const { return resources.getSwapchainColorGlobalId(); } @@ -1159,32 +1161,6 @@ inline void PipelineStageStateBase::migrateAllSamplers(ID3D12Device *device, Sam inline uint64_t DeviceContext::getCompletedFenceProgress() { return front.completedFrameProgress; } -inline FormatStore BaseTex::getFormat() const { return tex.image ? 
tex.image->getFormat() : fmt; } -inline void BaseTex::updateDeviceSampler() -{ - sampler = get_device().getSampler(samplerState); - lastSamplerState = samplerState; -} -inline D3D12_CPU_DESCRIPTOR_HANDLE BaseTex::getDeviceSampler() -{ - if (!sampler.ptr || samplerState != lastSamplerState) - { - updateDeviceSampler(); - } - - return sampler; -} - -inline void BaseTex::updateTexName() -{ - // don't propagate down to stub images - if (isStub()) - return; - if (tex.image) - { - get_device().setTexName(tex.image, getResName()); - } -} inline Extent2D FramebufferInfo::makeDrawArea(Extent2D def /*= {}*/) const { // if swapchain for 0 is used we need to check depth stencil use, diff --git a/prog/engine/drv/drv3d_DX12/device_caps_and_shader_model.h b/prog/engine/drv/drv3d_DX12/device_caps_and_shader_model.h new file mode 100644 index 000000000..7d85daa8f --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/device_caps_and_shader_model.h @@ -0,0 +1,61 @@ +#pragma once + +#include <3d/dag_drv3dConsts.h> +#include "d3d_cap_set_xmacro.h" + + +struct DeviceCapsAndShaderModel +{ + // deliberately using base to allow to load all cap values + DeviceDriverCapabilitiesBase caps; + d3d::shadermodel::Version shaderModel; + + bool isCompatibleTo(d3d::shadermodel::Version shader_model) const { return shader_model >= shaderModel; } + bool isCompatibleTo(const DeviceDriverCapabilities &other) const + { + // This is a very simple approach, when a feature of other is requested but not indicated by caps, we are not compatible. +#define DX12_D3D_CAP(name) \ + if (caps.name && !other.name) \ + return false; + DX12_D3D_CAP_SET +#undef DX12_D3D_CAP + return true; + } + bool isPipelineCompatibleTo(const DeviceDriverCapabilities &other) const + { + // This is a very simple approach, when a feature of other is requested but not indicated by caps, we are not compatible. 
+#define DX12_D3D_CAP(name) \ + if (caps.name && !other.name) \ + return false; + DX12_D3D_CAP_SET_RELEVANT_FOR_PIPELINES +#undef DX12_D3D_CAP + return true; + } + bool isCompatibleTo(const Driver3dDesc &desc) const { return isCompatibleTo(desc.shaderModel) && isCompatibleTo(desc.caps); } + bool isPipelineCompatibleTo(const Driver3dDesc &desc) const + { + return isCompatibleTo(desc.shaderModel) && isPipelineCompatibleTo(desc.caps); + } + static DeviceCapsAndShaderModel fromDriverDesc(const Driver3dDesc &desc) + { + DeviceCapsAndShaderModel result; + result.shaderModel = desc.shaderModel; + // need to do a copy this way to properly copy constants into variables. +#define DX12_D3D_CAP(name) result.caps.name = desc.caps.name; + DX12_D3D_CAP_SET +#undef DX12_D3D_CAP + return result; + } +}; + +inline d3d::shadermodel::Version shader_model_from_dx(D3D_SHADER_MODEL model) +{ + unsigned int ma = (model >> 4) & 0xF; + unsigned int mi = (model >> 0) & 0xF; + return {ma, mi}; +} + +inline D3D_SHADER_MODEL shader_model_to_dx(d3d::shadermodel::Version model) +{ + return static_cast(model.major << 4 | model.minor); +} diff --git a/prog/engine/drv/drv3d_DX12/device_context.cpp b/prog/engine/drv/drv3d_DX12/device_context.cpp index 7c23879b7..973635d82 100644 --- a/prog/engine/drv/drv3d_DX12/device_context.cpp +++ b/prog/engine/drv/drv3d_DX12/device_context.cpp @@ -6,6 +6,7 @@ #include #include #include <3d/dag_lowLatency.h> + #include #if _TARGET_XBOX @@ -14,6 +15,8 @@ #include #include +#include "render_target_mask_util.h" + #define DX12_LOCK_FRONT() WinAutoLock lock(getFrontGuard()) @@ -432,7 +435,8 @@ void DeviceContext::compilePipelineSet(const DataBlock *feature_sets, DynamicArr pipeline::DataBlockDecodeEnumarator resolver{*feature_sets, 0, d3d::get_driver_desc()}; for (; !resolver.completed(); resolver.next()) { - resolver.decode(featureSetSupported[resolver.index()]); + auto mode = pipeline::FeatureSupportResolver::CompatibilityMode::pipelines; + resolver.decode(mode, 
featureSetSupported[resolver.index()]); } } @@ -1216,6 +1220,14 @@ void DeviceContext::copyImage(Image *src, Image *dst, const ImageCopy ©) immediateModeExecute(); } +void DeviceContext::resolveMultiSampleImage(Image *src, Image *dst) +{ + auto cmd = make_command(src, dst); + + commandStream.pushBack(cmd); + immediateModeExecute(); +} + void DeviceContext::flushDraws() { DX12_LOCK_FRONT(); @@ -1751,9 +1763,9 @@ void DeviceContext::addGraphicsProgram(GraphicsProgramID program, ShaderID vs, S immediateModeExecute(); } -void DeviceContext::addComputeProgram(ProgramID id, eastl::unique_ptr csm) +void DeviceContext::addComputeProgram(ProgramID id, eastl::unique_ptr csm, CSPreloaded preloaded) { - auto cmd = make_command(id, csm.release()); + auto cmd = make_command(id, csm.release(), preloaded); DX12_LOCK_FRONT(); commandStream.pushBack(cmd); immediateModeExecute(); @@ -3483,7 +3495,7 @@ void DeviceContext::ExecutionContext::addVertexShader(ShaderID id, VertexShaderM void DeviceContext::ExecutionContext::addPixelShader(ShaderID id, PixelShaderModule *sci) { device.pipeMan.addPixelShader(id, sci); } -void DeviceContext::ExecutionContext::addComputePipeline(ProgramID id, ComputeShaderModule *csm) +void DeviceContext::ExecutionContext::addComputePipeline(ProgramID id, ComputeShaderModule *csm, CSPreloaded preloaded) { // if optimization pass is used, then it has to handle this, as it might needs the data for later // commands @@ -3492,7 +3504,7 @@ void DeviceContext::ExecutionContext::addComputePipeline(ProgramID id, ComputeSh device.pipeMan.addCompute(device.device.get(), device.pipelineCache, id, eastl::move(*csm), get_recover_behvior_from_cfg(device.config.features.test(DeviceFeaturesConfig::PIPELINE_COMPILATION_ERROR_IS_FATAL), device.config.features.test(DeviceFeaturesConfig::ASSERT_ON_PIPELINE_COMPILATION_ERROR)), - device.shouldNameObjects()); + device.shouldNameObjects(), preloaded); } void DeviceContext::ExecutionContext::addGraphicsPipeline(GraphicsProgramID 
program, ShaderID vs, ShaderID ps) @@ -4060,6 +4072,21 @@ void DeviceContext::ExecutionContext::copyImage(Image *src, Image *dst, const Im dirtyTextureState(dst); } +void DeviceContext::ExecutionContext::resolveMultiSampleImage(Image *src, Image *dst) +{ + contextState.resourceStates.useTextureAsResolveSource(contextState.graphicsCommandListBarrierBatch, + contextState.graphicsCommandListSplitBarrierTracker, src); + + contextState.resourceStates.useTextureAsResolveDestination(contextState.graphicsCommandListBarrierBatch, + contextState.graphicsCommandListSplitBarrierTracker, dst); + + contextState.graphicsCommandListBarrierBatch.execute(contextState.cmdBuffer); + + G_ASSERT(src->getFormat().asDxGiFormat() == dst->getFormat().asDxGiFormat()); + + contextState.cmdBuffer.resolveSubresource(dst->getHandle(), 0, src->getHandle(), 0, src->getFormat().asDxGiFormat()); +} + void DeviceContext::ExecutionContext::blitImage(Image *src, Image *dst, ImageViewState src_view, ImageViewState dst_view, D3D12_CPU_DESCRIPTOR_HANDLE src_view_descroptor, D3D12_CPU_DESCRIPTOR_HANDLE dst_view_descriptor, D3D12_RECT src_rect, D3D12_RECT dst_rect, bool disable_predication) @@ -4669,6 +4696,18 @@ void DeviceContext::ExecutionContext::flushGraphicsMeshState() { auto &staticRenderState = device.pipeMan.getStaticRenderState(contextState.graphicsState.staticRenderStateIdent); + if (dgs_get_settings()->getBlockByNameEx("dx12")->getBool("validateInGameSpikes", false)) + { + logerr("Pipeline creation during game! Patch for cache will be updated. 
Share game/cache/dx12_cache.blk file with graphics " + "programmers."); + + device.pipeMan.needToUpdateCache = true; + + contextState.graphicsState.pipeline->errorPrintMeshBlkString(*contextState.graphicsState.basePipeline, + contextState.graphicsState.statusBits.test(GraphicsState::USE_WIREFRAME), staticRenderState, + contextState.graphicsState.framebufferState.framebufferLayout); + } + contextState.graphicsState.pipeline->loadMesh(device.device.get(), device.pipeMan, *contextState.graphicsState.basePipeline, device.pipelineCache, contextState.graphicsState.statusBits.test(GraphicsState::USE_WIREFRAME), staticRenderState, contextState.graphicsState.framebufferState.framebufferLayout, @@ -4719,9 +4758,20 @@ void DeviceContext::ExecutionContext::flushGraphicsState(D3D12_PRIMITIVE_TOPOLOG if (!contextState.graphicsState.pipeline->isReady()) { auto &inputLayout = device.pipeMan.getInputLayout(internlInputLayout); - auto &staticRenderState = device.pipeMan.getStaticRenderState(contextState.graphicsState.staticRenderStateIdent); + if (dgs_get_settings()->getBlockByNameEx("dx12")->getBool("validateInGameSpikes", false)) + { + logerr("Pipeline creation during game! Patch for cache will be updated. 
Share game/cache/dx12_cache.blk file with graphics " + "programmers."); + + device.pipeMan.needToUpdateCache = true; + + contextState.graphicsState.pipeline->errorPrintBlkString(*contextState.graphicsState.basePipeline, inputLayout, + contextState.graphicsState.statusBits.test(GraphicsState::USE_WIREFRAME), staticRenderState, + contextState.graphicsState.framebufferState.framebufferLayout, topType); + } + contextState.graphicsState.pipeline->load(device.device.get(), device.pipeMan, *contextState.graphicsState.basePipeline, device.pipelineCache, inputLayout, contextState.graphicsState.statusBits.test(GraphicsState::USE_WIREFRAME), staticRenderState, contextState.graphicsState.framebufferState.framebufferLayout, topType, @@ -4753,7 +4803,8 @@ void DeviceContext::ExecutionContext::flushIndexBuffer() if (buffer && readyCommandList()) { - D3D12_INDEX_BUFFER_VIEW view{buffer.gpuPointer, buffer.size, type}; + G_ASSERT(buffer.size <= eastl::numeric_limits::max()); + D3D12_INDEX_BUFFER_VIEW view{buffer.gpuPointer, static_cast(buffer.size), type}; contextState.cmdBuffer.iaSetIndexBuffer(&view); } } @@ -5523,7 +5574,7 @@ void DeviceContext::ExecutionContext::bufferBarrier(BufferResourceReference buff char cbuf[MAX_OBJECT_NAME_LENGTH]; char maskNameBuffer[256]; auto state = translate_buffer_barrier_to_state(barrier); - make_resource_barrier_string_from_state(maskNameBuffer, array_size(maskNameBuffer), state, barrier); + make_resource_barrier_string_from_state(maskNameBuffer, countof(maskNameBuffer), state, barrier); debug("DX12: Resource barrier for buffer %s - %p, with %s, during %s", get_resource_name(buffer.buffer, cbuf), buffer.buffer, maskNameBuffer, getEventPath()); } @@ -5638,7 +5689,7 @@ void DeviceContext::ExecutionContext::textureBarrier(Image *tex, SubresourceRang char cbuf[MAX_OBJECT_NAME_LENGTH]; char maskNameBuffer[256]; auto state = translate_texture_barrier_to_state(barrier, !tex->getFormat().isColor()); - 
make_resource_barrier_string_from_state(maskNameBuffer, array_size(maskNameBuffer), state, barrier); + make_resource_barrier_string_from_state(maskNameBuffer, countof(maskNameBuffer), state, barrier); debug("DX12: Resource barrier for texture %s - %p [%u - %u], with %s, during %s", static_cast(barrier), get_resource_name(tex->getHandle(), cbuf), tex->getHandle(), sub_res_range.start, sub_res_range.stop, maskNameBuffer, this->getEventPath()); @@ -5893,8 +5944,8 @@ void DeviceContext::ExecutionContext::aliasFlush(GpuPipeline gpu_pipeline) G_UNUSED(gpu_pipeline); } -void DeviceContext::ExecutionContext::twoPhaseCopyBuffer(BufferResourceReferenceAndOffset source, uint32_t destination_offset, - ScratchBuffer scratch_memory, uint32_t data_size) +void DeviceContext::ExecutionContext::twoPhaseCopyBuffer(BufferResourceReferenceAndOffset source, uint64_t destination_offset, + ScratchBuffer scratch_memory, uint64_t data_size) { if (!readyCommandList()) { @@ -6155,7 +6206,7 @@ void DeviceContext::ExecutionContext::loadComputeShaderFromDump(ProgramID progra device.pipeMan.loadComputeShaderFromDump(device.device.get(), device.pipelineCache, program, get_recover_behvior_from_cfg(device.config.features.test(DeviceFeaturesConfig::PIPELINE_COMPILATION_ERROR_IS_FATAL), device.config.features.test(DeviceFeaturesConfig::ASSERT_ON_PIPELINE_COMPILATION_ERROR)), - device.shouldNameObjects()); + device.shouldNameObjects(), CSPreloaded::Yes); } void DeviceContext::ExecutionContext::compilePipelineSet(DynamicArray &&input_layouts, diff --git a/prog/engine/drv/drv3d_DX12/device_context.h b/prog/engine/drv/drv3d_DX12/device_context.h index 498549b69..8d36b51f2 100644 --- a/prog/engine/drv/drv3d_DX12/device_context.h +++ b/prog/engine/drv/drv3d_DX12/device_context.h @@ -12,7 +12,10 @@ #include "texture.h" #include "buffer.h" #include "resource_memory_heap.h" +#include "tagged_handles.h" #include "bindless.h" +#include "device_queue.h" +#include "swapchain.h" #include "ngx_wrapper.h" #include 
"xess_wrapper.h" #include "fsr2_wrapper.h" @@ -23,6 +26,9 @@ #include "command_list.h" #include "stateful_command_buffer.h" #include "resource_state_tracker.h" +#include "viewport_state.h" +#include "const_register_type.h" + namespace drv3d_dx12 { @@ -399,7 +405,7 @@ struct FramebufferInfo FramebufferMask getMatchingAttachmentMask(Image *texture) { FramebufferMask result; - for (uint32_t i = 0; i < array_size(colorAttachments); ++i) + for (uint32_t i = 0; i < countof(colorAttachments); ++i) { result.colorAttachmentMask |= ((colorAttachments[i].image == texture) ? 1u : 0u) << i; } @@ -573,7 +579,7 @@ struct GraphicsState { statusBits.set(INDEX_BUFFER_STATE_DIRTY); } - for (uint32_t i = 0; i < array_size(vertexBuffers); ++i) + for (uint32_t i = 0; i < countof(vertexBuffers); ++i) { if (vertexBuffers[i].resourceId == ident) { @@ -1267,7 +1273,7 @@ class DeviceContext : protected ResourceUsageHistoryDataSetDebugger, public debu void endConditionalRender(); void addVertexShader(ShaderID id, VertexShaderModule *sci); void addPixelShader(ShaderID id, PixelShaderModule *sci); - void addComputePipeline(ProgramID id, ComputeShaderModule *csm); + void addComputePipeline(ProgramID id, ComputeShaderModule *csm, CSPreloaded preloaded); void addGraphicsPipeline(GraphicsProgramID program, ShaderID vs, ShaderID ps); #if D3D_HAS_RAY_TRACING void addRaytracePipeline(ProgramID program, uint32_t max_recursion, uint32_t shader_count, const ShaderID *shaders, @@ -1296,6 +1302,7 @@ class DeviceContext : protected ResourceUsageHistoryDataSetDebugger, public debu const ClearDepthStencilValue &value); void clearColorImage(Image *image, ImageViewState view, D3D12_CPU_DESCRIPTOR_HANDLE view_descriptor, const ClearColorValue &value); void copyImage(Image *src, Image *dst, const ImageCopy ©); + void resolveMultiSampleImage(Image *src, Image *dst); void blitImage(Image *src, Image *dst, ImageViewState src_view, ImageViewState dst_view, D3D12_CPU_DESCRIPTOR_HANDLE src_view_descroptor, 
D3D12_CPU_DESCRIPTOR_HANDLE dst_view_descriptor, D3D12_RECT src_rect, D3D12_RECT dst_rect, bool disable_predication); @@ -1423,8 +1430,8 @@ class DeviceContext : protected ResourceUsageHistoryDataSetDebugger, public debu void deactivateBuffer(BufferResourceReferenceAndAddressRange buffer, const ResourceMemory &memory, GpuPipeline gpu_pipeline); void deactivateTexture(Image *tex, GpuPipeline gpu_pipeline); void aliasFlush(GpuPipeline gpu_pipeline); - void twoPhaseCopyBuffer(BufferResourceReferenceAndOffset source, uint32_t destination_offset, ScratchBuffer scratch_memory, - uint32_t data_size); + void twoPhaseCopyBuffer(BufferResourceReferenceAndOffset source, uint64_t destination_offset, ScratchBuffer scratch_memory, + uint64_t data_size); void transitionBuffer(BufferResourceReference buffer, D3D12_RESOURCE_STATES state); @@ -1723,6 +1730,7 @@ class DeviceContext : protected ResourceUsageHistoryDataSetDebugger, public debu void clearDepthStencilImage(Image *image, const ImageSubresourceRange &area, const ClearDepthStencilValue &value); void clearColorImage(Image *image, const ImageSubresourceRange &area, const ClearColorValue &value); void copyImage(Image *src, Image *dst, const ImageCopy ©); + void resolveMultiSampleImage(Image *src, Image *dst); void flushDraws(); // Similar to flushDraws, with the exception that it will only execute a flush when no queries are active. // Returns true if it executed a flush, otherwise it returns false. 
@@ -1771,7 +1779,7 @@ class DeviceContext : protected ResourceUsageHistoryDataSetDebugger, public debu void removePixelShader(ShaderID id); void addGraphicsProgram(GraphicsProgramID program, ShaderID vs, ShaderID ps); - void addComputeProgram(ProgramID id, eastl::unique_ptr csm); + void addComputeProgram(ProgramID id, eastl::unique_ptr csm, CSPreloaded preloaded); void removeProgram(ProgramID program); #if D3D_HAS_RAY_TRACING diff --git a/prog/engine/drv/drv3d_DX12/device_context_cmd.h b/prog/engine/drv/drv3d_DX12/device_context_cmd.h index db07c3b11..64d616f19 100644 --- a/prog/engine/drv/drv3d_DX12/device_context_cmd.h +++ b/prog/engine/drv/drv3d_DX12/device_context_cmd.h @@ -396,6 +396,14 @@ DX12_BEGIN_CONTEXT_COMMAND(CopyImage) #endif DX12_END_CONTEXT_COMMAND +DX12_BEGIN_CONTEXT_COMMAND(ResolveMultiSampleImage) + DX12_CONTEXT_COMMAND_PARAM(Image *, src) + DX12_CONTEXT_COMMAND_PARAM(Image *, dst) +#if DX12_CONTEXT_COMMAND_IMPLEMENTATION + ctx.resolveMultiSampleImage(src, dst); +#endif +DX12_END_CONTEXT_COMMAND + DX12_BEGIN_CONTEXT_COMMAND(EndCPUTextureAccess) DX12_CONTEXT_COMMAND_PARAM(Image *, texture) @@ -719,9 +727,10 @@ DX12_END_CONTEXT_COMMAND DX12_BEGIN_CONTEXT_COMMAND(AddComputeProgram) DX12_CONTEXT_COMMAND_PARAM(ProgramID, id) DX12_CONTEXT_COMMAND_PARAM(ComputeShaderModule *, csm) + DX12_CONTEXT_COMMAND_PARAM(CSPreloaded, preloaded) #if DX12_CONTEXT_COMMAND_IMPLEMENTATION - ctx.addComputePipeline(id, csm); + ctx.addComputePipeline(id, csm, preloaded); #endif DX12_END_CONTEXT_COMMAND @@ -1271,9 +1280,9 @@ DX12_END_CONTEXT_COMMAND DX12_BEGIN_CONTEXT_COMMAND(TwoPhaseCopyBuffer) DX12_CONTEXT_COMMAND_PARAM(BufferResourceReferenceAndOffset, source) - DX12_CONTEXT_COMMAND_PARAM(uint32_t, destinationOffset) + DX12_CONTEXT_COMMAND_PARAM(uint64_t, destinationOffset) DX12_CONTEXT_COMMAND_PARAM(ScratchBuffer, scratchMemory) - DX12_CONTEXT_COMMAND_PARAM(uint32_t, size) + DX12_CONTEXT_COMMAND_PARAM(uint64_t, size) #if DX12_CONTEXT_COMMAND_IMPLEMENTATION 
ctx.twoPhaseCopyBuffer(source, destinationOffset, scratchMemory, size); diff --git a/prog/engine/drv/drv3d_DX12/device_memory_class.h b/prog/engine/drv/drv3d_DX12/device_memory_class.h new file mode 100644 index 000000000..e75af905f --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/device_memory_class.h @@ -0,0 +1,49 @@ +#pragma once + +#include <3d/rayTrace/dag_drvRayTrace.h> + +#include "driver.h" + + +namespace drv3d_dx12 +{ + +// TODO rename some of the names to better resemble what they are intended for +enum class DeviceMemoryClass +{ + DEVICE_RESIDENT_IMAGE, + DEVICE_RESIDENT_BUFFER, + // linear cpu cached textures + HOST_RESIDENT_HOST_READ_WRITE_IMAGE, + // linear cpu non-cached textures + HOST_RESIDENT_HOST_WRITE_ONLY_IMAGE, + HOST_RESIDENT_HOST_READ_WRITE_BUFFER, + + HOST_RESIDENT_HOST_READ_ONLY_BUFFER, + HOST_RESIDENT_HOST_WRITE_ONLY_BUFFER, + // special AMD memory type, + // a portion of gpu mem is host + // visible (256mb). + DEVICE_RESIDENT_HOST_WRITE_ONLY_BUFFER, + // we handle memory for push ring buffer differently than any other + PUSH_RING_BUFFER, + TEMPORARY_UPLOAD_BUFFER, + + READ_BACK_BUFFER, + BIDIRECTIONAL_BUFFER, + + RESERVED_RESOURCE, + +#if DX12_USE_ESRAM + ESRAM_RESOURCE, +#endif + +#if D3D_HAS_RAY_TRACING + DEVICE_RESIDENT_ACCELERATION_STRUCTURE, +#endif + + COUNT, + INVALID = COUNT +}; + +} // namespace drv3d_dx12 diff --git a/prog/engine/drv/drv3d_DX12/device_queue.h b/prog/engine/drv/drv3d_DX12/device_queue.h index 93741572f..12da4552a 100644 --- a/prog/engine/drv/drv3d_DX12/device_queue.h +++ b/prog/engine/drv/drv3d_DX12/device_queue.h @@ -1,7 +1,13 @@ #pragma once #include +#include #include +#include <3d/dag_drv3d.h> + +#include "d3d12_error_handling.h" +#include "d3d12_debug_names.h" + #if _TARGET_XBOXONE #include diff --git a/prog/engine/drv/drv3d_DX12/driver.h b/prog/engine/drv/drv3d_DX12/driver.h index d558ec68a..ac7370946 100644 --- a/prog/engine/drv/drv3d_DX12/driver.h +++ b/prog/engine/drv/drv3d_DX12/driver.h @@ -1,99 +1,25 @@ 
#pragma once -#if _TARGET_PC_WIN -#include -#include -#include -#include -#include -#include -#endif -#include <3d/dag_drv3d.h> - -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -#include "drvCommonConsts.h" - -#include - -#include "constants.h" - -#include "bitfield.h" -#include "value_range.h" -#include "versioned_com_ptr.h" -#include "drv_log_defs.h" - - #if _TARGET_XBOX #include "driver_xbox.h" +#elif _TARGET_PC_WIN +#include "driver_win.h" +#else +#error "Dx12 driver is not supported on targets other than PC and Xbox" #endif -#if _TARGET_PC_WIN -typedef IDXGISwapChain3 DXGISwapChain; -typedef IDXGIFactory4 DXGIFactory; -typedef IDXGIAdapter4 DXGIAdapter; -typedef ID3D12Device3 D3DDevice; -typedef ID3D12GraphicsCommandList2 D3DGraphicsCommandList; -using D3DCopyCommandList = ID3D12GraphicsCommandList; -// on PC we only lock down the execution mode on release builds -#define FIXED_EXECUTION_MODE DAGOR_DBGLEVEL == 0 -#define DX12_ALLOW_SPLIT_BARRIERS 1 -#define DX12_WHATCH_IN_FLIGHT_BARRIERS DAGOR_DBGLEVEL > 0 -#define DX12_VALIDATE_INPUT_LAYOUT_USES DAGOR_DBGLEVEL > 0 -#define DX12_INDIVIDUAL_BARRIER_CHECK 0 -#define DX12_REPORT_TRANSITION_INFO 0 -#define DX12_TRACK_ACTIVE_DRAW_EVENTS DAGOR_DBGLEVEL > 0 -#define DX12_VALIDATE_USER_BARRIERS DAGOR_DBGLEVEL > 0 -#define DX12_AUTOMATIC_BARRIERS 1 -#define DX12_PROCESS_USER_BARRIERS 1 -#define DX12_RECORD_TIMING_DATA 1 -#define DX12_CAPTURE_AFTER_LONG_FRAMES (DX12_RECORD_TIMING_DATA && (DAGOR_DBGLEVEL > 0)) -#define DX12_REPORT_PIPELINE_CREATE_TIMING 1 -// TODO no real gamma control on dx12... 
-#define DX12_HAS_GAMMA_CONTROL 1 - -// Possible to run with set to 0, but there is no benefit -#define DX12_USE_AUTO_PROMOTE_AND_DECAY 1 - -#define DX12_ENABLE_CONST_BUFFER_DESCRIPTORS 1 - -#define DX12_SELECTABLE_CALL_STACK_CAPTURE 1 - -#define DX12_VALIDATA_COPY_COMMAND_LIST 1 -#define DX12_VALIDATE_COMPUTE_COMMAND_LIST 1 -#define DX12_VALIDATE_RAYTRACE_COMMAND_LIST 1 -#define DX12_VALIDATE_GRAPHICS_COMMAND_LIST 1 - -#define DX12_PROCESS_USER_BARRIERS_DEFAULT 0 -#endif - #if DX12_USE_AUTO_PROMOTE_AND_DECAY -static constexpr D3D12_RESOURCE_STATES D3D12_RESOURCE_STATE_COPY_QUEUE_TARGET = D3D12_RESOURCE_STATE_COMMON; -static constexpr D3D12_RESOURCE_STATES D3D12_RESOURCE_STATE_COPY_QUEUE_SOURCE = D3D12_RESOURCE_STATE_COMMON; -static constexpr D3D12_RESOURCE_STATES D3D12_RESOURCE_STATE_INITIAL_BUFFER_STATE = D3D12_RESOURCE_STATE_COMMON; +inline constexpr D3D12_RESOURCE_STATES D3D12_RESOURCE_STATE_COPY_QUEUE_TARGET = D3D12_RESOURCE_STATE_COMMON; +inline constexpr D3D12_RESOURCE_STATES D3D12_RESOURCE_STATE_COPY_QUEUE_SOURCE = D3D12_RESOURCE_STATE_COMMON; +inline constexpr D3D12_RESOURCE_STATES D3D12_RESOURCE_STATE_INITIAL_BUFFER_STATE = D3D12_RESOURCE_STATE_COMMON; // Can not be detected with auto promote and decay as it needs // D3D12_RESOURCE_STATE_COPY_QUEUE_TARGET to be different than D3D12_RESOURCE_STATE_COMMON. 
#define DX12_FIX_UNITITALIZED_STATIC_TEXTURE_STATE 0 #else -static constexpr D3D12_RESOURCE_STATES D3D12_RESOURCE_STATE_COPY_QUEUE_TARGET = D3D12_RESOURCE_STATE_COPY_DEST; -static constexpr D3D12_RESOURCE_STATES D3D12_RESOURCE_STATE_COPY_QUEUE_SOURCE = D3D12_RESOURCE_STATE_COPY_SOURCE; -static constexpr D3D12_RESOURCE_STATES D3D12_RESOURCE_STATE_INITIAL_BUFFER_STATE = +inline constexpr D3D12_RESOURCE_STATES D3D12_RESOURCE_STATE_COPY_QUEUE_TARGET = D3D12_RESOURCE_STATE_COPY_DEST; +inline constexpr D3D12_RESOURCE_STATES D3D12_RESOURCE_STATE_COPY_QUEUE_SOURCE = D3D12_RESOURCE_STATE_COPY_SOURCE; +inline constexpr D3D12_RESOURCE_STATES D3D12_RESOURCE_STATE_INITIAL_BUFFER_STATE = D3D12_RESOURCE_STATE_COPY_DEST | D3D12_RESOURCE_STATE_COPY_SOURCE; // Fixes an engine error, where static textures with uninitialized content are used as source for // draws, dispatches or copies. This bug makes only problems on consoles as there auto promotion and @@ -129,3074 +55,25 @@ static constexpr D3D12_RESOURCE_STATES D3D12_RESOURCE_STATE_INITIAL_BUFFER_STATE #define DX12_PRINT_USER_TEXTURE_BARRIERS 0 #if DAGOR_DBGLEVEL > 0 || _TARGET_PC_WIN -#define DX12_DOES_SET_DEBUG_NAMES 1 -#define DX12_SET_DEBUG_OBJ_NAME(obj, name) obj->SetName(name) +#define DX12_DOES_SET_DEBUG_NAMES 1 #else #define DX12_DOES_SET_DEBUG_NAMES 0 -#define DX12_SET_DEBUG_OBJ_NAME(obj, name) #endif #if !DX12_AUTOMATIC_BARRIERS && !DX12_PROCESS_USER_BARRIERS #error "DX12 Driver configured to _not_ generate required barriers and to _ignore_ user barriers, this will crash on execution" #endif -#if _TARGET_XBOX && !_TARGET_SCARLETT -#define DX12_USE_ESRAM 1 -#else -#define DX12_USE_ESRAM 0 -#endif - #define DX12_ENABLE_MT_VALIDATION DAGOR_DBGLEVEL > 0 #define DX12_ENABLE_PEDANTIC_MT_VALIDATION DAGOR_DBGLEVEL > 1 #define DX12_SUPPORT_RESOURCE_MEMORY_METRICS DAGOR_DBGLEVEL > 0 #define DX12_RESOURCE_USAGE_TRACKER DAGOR_DBGLEVEL > 0 -template -struct BitsNeeded -{ - static constexpr int VALUE = BitsNeeded::VALUE + 1; -}; 
-template <> -struct BitsNeeded<0> -{ - static constexpr int VALUE = 1; -}; -template <> -struct BitsNeeded<1> -{ - static constexpr int VALUE = 1; -}; - -struct Extent2D -{ - uint32_t width; - uint32_t height; -}; - -inline bool operator==(Extent2D l, Extent2D r) { return l.width == r.width && l.height == r.height; } -inline bool operator!=(Extent2D l, Extent2D r) { return !(l == r); } - -struct Extent3D -{ - uint32_t width; - uint32_t height; - uint32_t depth; - - explicit operator Extent2D() const { return {width, height}; } -}; - -inline Extent3D operator*(Extent3D l, Extent3D r) { return {l.width * r.width, l.height * r.height, l.depth * r.depth}; } - -inline Extent3D operator/(Extent3D l, Extent3D r) { return {l.width / r.width, l.height / r.height, l.depth / r.depth}; } - -inline bool operator==(Extent3D l, Extent3D r) { return l.depth == r.depth && static_cast(l) == static_cast(r); } - -inline bool operator!=(Extent3D l, Extent3D r) { return !(l == r); } - -inline Extent3D operator>>(Extent3D value, uint32_t shift) -{ - return {value.width >> shift, value.height >> shift, value.depth >> shift}; -} - -inline Extent3D max(Extent3D a, Extent3D b) { return {max(a.width, b.width), max(a.height, b.height), max(a.depth, b.depth)}; } - -inline Extent3D min(Extent3D a, Extent3D b) { return {min(a.width, b.width), min(a.height, b.height), min(a.depth, b.depth)}; } - -inline Extent3D mip_extent(Extent3D value, uint32_t mip) { return max(value >> mip, {1, 1, 1}); } - -struct Offset2D -{ - int32_t x; - int32_t y; -}; - -inline bool operator==(Offset2D l, Offset2D r) { return l.x == r.x && l.y == r.y; } - -inline bool operator!=(Offset2D l, Offset2D r) { return !(l == r); } - -struct Offset3D -{ - int32_t x; - int32_t y; - int32_t z; - - explicit operator Offset2D() { return {x, y}; } -}; - -inline bool operator==(Offset3D l, Offset3D r) { return l.z == r.z && static_cast(l) == static_cast(r); } - -inline bool operator!=(Offset3D l, Offset3D r) { return !(l == r); } - 
-inline Extent3D operator+(Extent3D ext, Offset3D ofs) { return {ext.width + ofs.x, ext.height + ofs.y, ext.depth + ofs.z}; } - -inline D3D12_RECT clamp_rect(D3D12_RECT rect, Extent2D ext) -{ - rect.left = clamp(rect.left, 0, ext.width); - rect.right = clamp(rect.right, 0, ext.width); - rect.top = clamp(rect.top, 0, ext.height); - rect.bottom = clamp(rect.bottom, 0, ext.height); - return rect; -} - -struct ViewportState -{ - int x; - int y; - int width; - int height; - float minZ; - float maxZ; - - ViewportState() = default; - - ViewportState(const D3D12_VIEWPORT &vp) - { - x = vp.TopLeftX; - y = vp.TopLeftY; - width = vp.Width; - height = vp.Height; - minZ = vp.MinDepth; - maxZ = vp.MaxDepth; - } - - D3D12_RECT asRect() const - { - D3D12_RECT result; - result.left = x; - result.top = y; - result.right = x + width; - result.bottom = y + height; - - return result; - } - - operator D3D12_VIEWPORT() const - { - D3D12_VIEWPORT result; - result.TopLeftX = x; - result.TopLeftY = y; - result.Width = width; - result.Height = height; - result.MinDepth = minZ; - result.MaxDepth = maxZ; - return result; - } -}; - -inline bool operator==(const ViewportState &l, const ViewportState &r) -{ -#define CMP_P(n) (l.n == r.n) - return CMP_P(x) && CMP_P(y) && CMP_P(width) && CMP_P(height) && CMP_P(minZ) && CMP_P(maxZ); -#undef CMP_P -} -inline bool operator!=(const ViewportState &l, const ViewportState &r) { return !(l == r); } -enum class RegionDifference -{ - EQUAL, - SUBSET, - SUPERSET -}; -inline RegionDifference classify_viewport_diff(const ViewportState &from, const ViewportState &to) -{ - const int32_t dX = to.x - from.x; - const int32_t dY = to.y - from.y; - const int32_t dW = (to.width + to.x) - (from.width + from.x); - const int32_t dH = (to.height + to.y) - (from.height + from.y); - - RegionDifference rectDif = RegionDifference::EQUAL; - // if all zero, then they are the same - if (dX | dY | dW | dH) - { - // can be either subset or completely different - if (dX >= 0 && dY 
>= 0 && dW <= 0 && dH <= 0) - { - rectDif = RegionDifference::SUBSET; - } - else - { - rectDif = RegionDifference::SUPERSET; - } - } - - if (RegionDifference::SUPERSET != rectDif) - { - // min/max z only affect viewport but not render regions, so it is always a subset if it has - // changed - if (to.maxZ != from.maxZ || to.minZ != from.minZ) - { - return RegionDifference::SUBSET; - } - } - return rectDif; -} - - namespace drv3d_dx12 { -struct ConstRegisterType -{ - uint32_t components[SHADER_REGISTER_ELEMENTS]; -}; -inline bool operator==(const ConstRegisterType &l, const ConstRegisterType &r) -{ - return eastl::equal(eastl::begin(l.components), eastl::end(l.components), eastl::begin(r.components)); -} - -template -class TaggedIndexType -{ - I value{}; - - constexpr TaggedIndexType(I v) : value{v} {} - -public: - using ValueType = I; - - constexpr TaggedIndexType() = default; - ~TaggedIndexType() = default; - - TaggedIndexType(const TaggedIndexType &) = default; - TaggedIndexType &operator=(const TaggedIndexType &) = default; - - static constexpr TaggedIndexType make(I v) { return {v}; } - - constexpr I index() const { return value; } - - friend bool operator==(const TaggedIndexType &l, const TaggedIndexType &r) { return l.value == r.value; } - - friend bool operator!=(const TaggedIndexType &l, const TaggedIndexType &r) { return l.value != r.value; } - - friend bool operator<(const TaggedIndexType &l, const TaggedIndexType &r) { return l.value < r.value; } - - friend bool operator>(const TaggedIndexType &l, const TaggedIndexType &r) { return l.value > r.value; } - - friend bool operator<=(const TaggedIndexType &l, const TaggedIndexType &r) { return l.value <= r.value; } - - friend bool operator>=(const TaggedIndexType &l, const TaggedIndexType &r) { return l.value >= r.value; } - - friend int operator-(const TaggedIndexType &l, const TaggedIndexType &r) { return l.value - r.value; } - - friend TaggedIndexType operator+(const TaggedIndexType &l, I r) { return 
{I(l.value + r)}; } - - friend TaggedIndexType operator-(const TaggedIndexType &l, I r) { return {I(l.value - r)}; } - - TaggedIndexType &operator+=(I r) - { - value += r; - return *this; - } - - TaggedIndexType &operator-=(I r) - { - value -= r; - return *this; - } - - template - TaggedIndexType &operator+=(C r) - { - *this = *this + r; - return *this; - } - - template - TaggedIndexType &operator-=(C r) - { - *this = *this - r; - return *this; - } - - TaggedIndexType &operator++() - { - ++value; - return *this; - } - - TaggedIndexType &operator--() - { - --value; - return *this; - } - - TaggedIndexType operator++(int) const - { - auto copy = *this; - return ++copy; - } - - TaggedIndexType operator--(int) const - { - auto copy = *this; - return --copy; - } - - operator DagorSafeArg() const { return {index()}; } -}; - -template -class TaggedRangeType : private ValueRange -{ - using RangeType = ValueRange; - -public: - using ValueType = typename ValueRange::ValueType; - using RangeType::begin; - using RangeType::end; - using RangeType::isInside; - using RangeType::isValidRange; - using RangeType::size; - using RangeType::ValueRange; - - constexpr IT front() const { return RangeType::front(); } - constexpr IT back() const { return RangeType::back(); } - - constexpr TaggedRangeType front(ValueType offset) const { return {IT(this->start + offset), this->stop}; } - constexpr TaggedRangeType back(ValueType offset) const { return {IT(this->stop - offset), this->stop}; } - - void resize(uint32_t count) { this->stop = this->start + count; } - - constexpr TaggedRangeType subRange(IT offset, uint32_t count) const { return make(this->start + offset, count); } - - constexpr TaggedRangeType subRange(uint32_t offset, uint32_t count) const { return make(this->start + offset, count); } - - static constexpr TaggedRangeType make(IT base, uint32_t count) { return {base, base + count}; } - - static constexpr TaggedRangeType make(uint32_t base, uint32_t count) { return {IT::make(base), 
IT::make(base + count)}; } - - static constexpr TaggedRangeType make_invalid() { return {IT::make(1), IT::make(0)}; } -}; - -template -class TaggedCountType -{ -public: - using ValueType = IT; - using IndexValueType = typename ValueType::ValueType; - using RangeType = TaggedRangeType; - -private: - IndexValueType value{}; - - constexpr TaggedCountType(IndexValueType v) : value{v} {} - -public: - struct Iterator - { - IndexValueType at{}; - - constexpr Iterator() = default; - ~Iterator() = default; - constexpr Iterator(const Iterator &) = default; - Iterator &operator=(const Iterator &) = default; - constexpr Iterator(IndexValueType v) : at(v) {} - constexpr ValueType operator*() const { return ValueType::make(at); } - Iterator &operator++() - { - ++at; - return *this; - } - Iterator operator++(int) { return at++; } - Iterator &operator--() - { - --at; - return *this; - } - Iterator operator--(int) { return at--; } - friend constexpr bool operator==(Iterator l, Iterator r) { return l.at == r.at; } - friend constexpr bool operator!=(Iterator l, Iterator r) { return l.at != r.at; } - }; - constexpr Iterator begin() const { return {0}; } - constexpr Iterator end() const { return {value}; } - - constexpr TaggedCountType() = default; - ~TaggedCountType() = default; - - TaggedCountType(const TaggedCountType &) = default; - TaggedCountType &operator=(const TaggedCountType &) = default; - - static constexpr TaggedCountType make(IndexValueType v) { return {v}; } - - constexpr IndexValueType count() const { return value; } - - constexpr RangeType asRange() const { return RangeType::make(0, value); } - // Allow implicit conversion to range as count is a specialized range - constexpr operator RangeType() const { return asRange(); } - - constexpr RangeType front(ValueType offset) const { return RangeType::make(offset, value - offset.index()); } - constexpr RangeType back(ValueType offset) const { return RangeType::make(value - offset.index(), offset.index()); } - - operator 
DagorSafeArg() const { return {count()}; } - - friend bool operator==(const TaggedCountType &l, const TaggedCountType &r) { return l.value == r.value; } - - friend bool operator!=(const TaggedCountType &l, const TaggedCountType &r) { return l.value != r.value; } - - friend bool operator<=(const TaggedCountType &l, const TaggedCountType &r) { return l.value <= r.value; } - - friend bool operator>=(const TaggedCountType &l, const TaggedCountType &r) { return l.value >= r.value; } - - friend bool operator<(const TaggedCountType &l, const TaggedCountType &r) { return l.value < r.value; } - - friend bool operator>(const TaggedCountType &l, const TaggedCountType &r) { return l.value > r.value; } - - friend bool operator==(const RangeType &l, const TaggedCountType &r) { return 0 == l.front().index() && l.size() == r.value; } - - friend bool operator!=(const RangeType &l, const TaggedCountType &r) { return 0 != l.front().index() && l.size() != r.value; } - - friend bool operator==(const TaggedCountType &l, const RangeType &r) { return 0 == r.front().index() && r.size() == l.value; } - - friend bool operator!=(const TaggedCountType &l, const RangeType &r) { return 0 != r.front().index() && r.size() != l.value; } -}; - -template -inline constexpr bool operator==(const TaggedCountType &l, const typename TaggedCountType::ValueType &r) -{ - return l.count() == r.index(); -} - -template -inline constexpr bool operator!=(const TaggedCountType &l, const typename TaggedCountType::ValueType &r) -{ - return l.count() != r.index(); -} - -template -inline constexpr bool operator<=(const TaggedCountType &l, const typename TaggedCountType::ValueType &r) -{ - return l.count() <= r.index(); -} - -template -inline constexpr bool operator>=(const TaggedCountType &l, const typename TaggedCountType::ValueType &r) -{ - return l.count() >= r.index(); -} - -template -inline constexpr bool operator<(const TaggedCountType &l, const typename TaggedCountType::ValueType &r) -{ - return l.count() < 
r.index(); -} - -template -inline constexpr bool operator>(const TaggedCountType &l, const typename TaggedCountType::ValueType &r) -{ - return l.count() > r.index(); -} - -template -inline constexpr bool operator==(const typename TaggedCountType::ValueType &l, const TaggedCountType &r) -{ - return l.index() == r.count(); -} - -template -inline constexpr bool operator!=(const typename TaggedCountType::ValueType &l, const TaggedCountType &r) -{ - return l.index() != r.count(); -} - -template -inline constexpr bool operator<=(const typename TaggedCountType::ValueType &l, const TaggedCountType &r) -{ - return l.index() <= r.count(); -} - -template -inline constexpr bool operator>=(const typename TaggedCountType::ValueType &l, const TaggedCountType &r) -{ - return l.index() >= r.count(); -} - -template -inline constexpr bool operator<(const typename TaggedCountType::ValueType &l, const TaggedCountType &r) -{ - return l.index() < r.count(); -} - -struct MipMapIndexTag; -using MipMapIndex = TaggedIndexType; -using MipMapRange = TaggedRangeType; -using MipMapCount = TaggedCountType; - -struct SubresourceIndexTag; -using SubresourceIndex = TaggedIndexType; -using SubresourceRange = TaggedRangeType; -using SubresourceCount = TaggedCountType; - -struct ArrayLayerIndexTag; -using ArrayLayerIndex = TaggedIndexType; -using ArrayLayerRange = TaggedRangeType; -using ArrayLayerCount = TaggedCountType; - -struct FormatPlaneIndexTag; -using FormatPlaneIndex = TaggedIndexType; -using FormatPlaneRange = TaggedRangeType; -using FormatPlaneCount = TaggedCountType; - -class SubresourcePerFormatPlaneCount -{ -public: - using ValueType = SubresourceIndex::ValueType; - -private: - ValueType value{}; - - constexpr SubresourcePerFormatPlaneCount(ValueType v) : value{v} {} - -public: - constexpr SubresourcePerFormatPlaneCount() = default; - ~SubresourcePerFormatPlaneCount() = default; - - SubresourcePerFormatPlaneCount(const SubresourcePerFormatPlaneCount &) = default; - 
SubresourcePerFormatPlaneCount &operator=(const SubresourcePerFormatPlaneCount &) = default; - - static constexpr SubresourcePerFormatPlaneCount make(ValueType v) { return {v}; } - static constexpr SubresourcePerFormatPlaneCount make(MipMapCount mip, ArrayLayerCount layers) - { - return {ValueType(mip.count() * layers.count())}; - } - static constexpr SubresourcePerFormatPlaneCount make(ArrayLayerCount layers, MipMapCount mip) - { - return {ValueType(mip.count() * layers.count())}; - } - - constexpr ValueType count() const { return value; } - - operator DagorSafeArg() const { return {count()}; } -}; - -inline SubresourcePerFormatPlaneCount operator*(const MipMapCount &l, const ArrayLayerCount &r) -{ - return SubresourcePerFormatPlaneCount::make(l, r); -} - -inline SubresourcePerFormatPlaneCount operator*(const ArrayLayerCount &l, const MipMapCount &r) -{ - return SubresourcePerFormatPlaneCount::make(l, r); -} - -// Per plane subres count times the plane index yields the subres range of the plane index -inline SubresourceRange operator*(const SubresourcePerFormatPlaneCount &l, const FormatPlaneIndex &r) -{ - return SubresourceRange::make(l.count() * r.index(), l.count()); -} - -// To keep it associative index times per plane yields also the subres range of the plane index -inline SubresourceRange operator*(const FormatPlaneIndex &l, const SubresourcePerFormatPlaneCount &r) -{ - return SubresourceRange::make(r.count() * l.index(), r.count()); -} - -// Per plane subres count times the plane count yields the total subres count -inline SubresourceCount operator*(const SubresourcePerFormatPlaneCount &l, const FormatPlaneCount &r) -{ - return SubresourceCount::make(l.count() * r.count()); -} - -inline SubresourceCount operator*(const FormatPlaneCount &r, const SubresourcePerFormatPlaneCount &l) -{ - return SubresourceCount::make(l.count() * r.count()); -} -inline SubresourceIndex calculate_subresource_index(MipMapIndex mip, ArrayLayerIndex array, MipMapCount 
mips_per_array) -{ - return SubresourceIndex::make(mip.index() + (array.index() * mips_per_array.count())); -} +class Device; +Device &get_device(); -inline SubresourceIndex operator+(const SubresourceIndex &l, const SubresourcePerFormatPlaneCount &r) -{ - return SubresourceIndex::make(l.index() + r.count()); -} } // namespace drv3d_dx12 -#include "format_store.h" - -namespace drv3d_dx12 -{ -BEGIN_BITFIELD_TYPE(ImageViewState, uint64_t) - enum Type - { - INVALID, // 0! - SRV, - UAV, - RTV, - DSV_RW, - DSV_CONST - }; - enum - { - WORD_SIZE = sizeof(uint64_t) * 8, - SAMPLE_STENCIL_BITS = 1, - SAMPLE_STENCIL_SHIFT = 0, - TYPE_BITS = 3, - TYPE_SHIFT = SAMPLE_STENCIL_BITS + SAMPLE_STENCIL_SHIFT, - IS_CUBEMAP_BITS = 1, - IS_CUBEMAP_SHIFT = TYPE_BITS + TYPE_SHIFT, - IS_ARRAY_BITS = 1, - IS_ARRAY_SHIFT = IS_CUBEMAP_BITS + IS_CUBEMAP_SHIFT, - FORMAT_BITS = FormatStore::BITS + 1, - FORMAT_SHIFT = IS_ARRAY_SHIFT + IS_ARRAY_BITS, - MIPMAP_OFFSET_BITS = BitsNeeded<15>::VALUE, - MIPMAP_OFFSET_SHIFT = FORMAT_SHIFT + FORMAT_BITS, - MIPMAP_RANGE_OFFSET = 1, - MIPMAP_RANGE_BITS = BitsNeeded<16 - MIPMAP_RANGE_OFFSET>::VALUE, - MIPMAP_RANGE_SHIFT = MIPMAP_OFFSET_SHIFT + MIPMAP_OFFSET_BITS, - // automatic assign left over space to array range def - ARRAY_DATA_SIZE = WORD_SIZE - MIPMAP_RANGE_SHIFT - MIPMAP_RANGE_BITS, - ARRAY_OFFSET_BITS = (ARRAY_DATA_SIZE / 2) + (ARRAY_DATA_SIZE % 2), - ARRAY_OFFSET_SHIFT = MIPMAP_RANGE_SHIFT + MIPMAP_RANGE_BITS, - ARRAY_RANGE_OFFSET = 1, - ARRAY_RANGE_BITS = ARRAY_DATA_SIZE / 2, - ARRAY_RANGE_SHIFT = (ARRAY_OFFSET_SHIFT + ARRAY_OFFSET_BITS) - }; - ADD_BITFIELD_MEMBER(sampleStencil, SAMPLE_STENCIL_SHIFT, SAMPLE_STENCIL_BITS) - ADD_BITFIELD_MEMBER(type, TYPE_SHIFT, TYPE_BITS) - ADD_BITFIELD_MEMBER(isCubemap, IS_CUBEMAP_SHIFT, IS_CUBEMAP_BITS) - ADD_BITFIELD_MEMBER(isArray, IS_ARRAY_SHIFT, IS_ARRAY_BITS) - ADD_BITFIELD_MEMBER(format, FORMAT_SHIFT, FORMAT_BITS) - ADD_BITFIELD_MEMBER(mipmapOffset, MIPMAP_OFFSET_SHIFT, MIPMAP_OFFSET_BITS); - 
ADD_BITFIELD_MEMBER(mipmapRange, MIPMAP_RANGE_SHIFT, MIPMAP_RANGE_BITS) - ADD_BITFIELD_MEMBER(arrayOffset, ARRAY_OFFSET_SHIFT, ARRAY_OFFSET_BITS) - ADD_BITFIELD_MEMBER(arrayRange, ARRAY_RANGE_SHIFT, ARRAY_RANGE_BITS) - bool isValid() const - { - return uint64_t(*this) != 0; - } // since type can't be 0/UNKNOWN, all bits can't be 0. comparing whole machine word is faster than extract type - explicit operator bool() const { return isValid(); } - void setType(Type tp) { type = tp; } - Type getType() const { return static_cast(static_cast(type)); } - void setRTV() { setType(RTV); } - void setDSV(bool as_const) { setType(as_const ? DSV_CONST : DSV_RW); } - void setSRV() { setType(SRV); } - void setUAV() { setType(UAV); } - bool isRTV() const { return static_cast(type) == RTV; } - bool isDSV() const { return static_cast(type) == DSV_RW || static_cast(type) == DSV_CONST; } - bool isSRV() const { return static_cast(type) == SRV; } - bool isUAV() const { return static_cast(type) == UAV; } - - // TODO check cube/array handling - - // TODO: is d24/d32 always planar? - D3D12_SHADER_RESOURCE_VIEW_DESC asSRVDesc(D3D12_RESOURCE_DIMENSION dim) const - { - D3D12_SHADER_RESOURCE_VIEW_DESC result; - const auto fmt = getFormat(); - result.Format = fmt.asDxGiFormat(); - uint32_t planeSlice = 0; - if (DXGI_FORMAT_D24_UNORM_S8_UINT == result.Format) - { - if (0 == sampleStencil) - { - result.Format = DXGI_FORMAT_R24_UNORM_X8_TYPELESS; - } - else - { - result.Format = DXGI_FORMAT_X24_TYPELESS_G8_UINT; - planeSlice = fmt.getPlanes().count() > 1 ? 1 : 0; - } - } - else if (DXGI_FORMAT_D32_FLOAT_S8X24_UINT == result.Format) - { - if (0 == sampleStencil) - { - result.Format = DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS; - } - else - { - result.Format = DXGI_FORMAT_X32_TYPELESS_G8X24_UINT; - planeSlice = fmt.getPlanes().count() > 1 ? 
1 : 0; - } - } - else if (DXGI_FORMAT_D16_UNORM == result.Format) - { - result.Format = DXGI_FORMAT_R16_UNORM; - } - else if (DXGI_FORMAT_D32_FLOAT == result.Format) - { - result.Format = DXGI_FORMAT_R32_FLOAT; - } - result.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING; - switch (dim) - { - case D3D12_RESOURCE_DIMENSION_BUFFER: - case D3D12_RESOURCE_DIMENSION_UNKNOWN: fatal("Usage error!"); return {}; - case D3D12_RESOURCE_DIMENSION_TEXTURE1D: - if (isArray) - { - result.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE1DARRAY; - auto &target = result.Texture1DArray; - target.MostDetailedMip = getMipBase().index(); - target.MipLevels = getMipCount(); - target.FirstArraySlice = getArrayBase().index(); - target.ArraySize = getArrayCount(); - target.ResourceMinLODClamp = 0.f; - } - else - { - result.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE1D; - auto &target = result.Texture1D; - target.MostDetailedMip = getMipBase().index(); - target.MipLevels = getMipCount(); - target.ResourceMinLODClamp = 0.f; - } - break; - case D3D12_RESOURCE_DIMENSION_TEXTURE2D: - if (isCubemap) - { - if (isArray) - { - result.ViewDimension = D3D12_SRV_DIMENSION_TEXTURECUBEARRAY; - auto &target = result.TextureCubeArray; - target.MostDetailedMip = getMipBase().index(); - target.MipLevels = getMipCount(); - target.First2DArrayFace = getArrayBase().index(); - target.NumCubes = getArrayCount() / 6; - target.ResourceMinLODClamp = 0.f; - } - else - { - result.ViewDimension = D3D12_SRV_DIMENSION_TEXTURECUBE; - auto &target = result.TextureCube; - target.MostDetailedMip = getMipBase().index(); - target.MipLevels = getMipCount(); - target.ResourceMinLODClamp = 0.f; - } - } - else - { - if (isArray) - { - result.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2DARRAY; - auto &target = result.Texture2DArray; - target.MostDetailedMip = getMipBase().index(); - target.MipLevels = getMipCount(); - target.FirstArraySlice = getArrayBase().index(); - target.ArraySize = getArrayCount(); - 
target.PlaneSlice = planeSlice; - target.ResourceMinLODClamp = 0.f; - } - else - { - result.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D; - auto &target = result.Texture2D; - target.MostDetailedMip = getMipBase().index(); - target.MipLevels = getMipCount(); - target.PlaneSlice = planeSlice; - target.ResourceMinLODClamp = 0.f; - } - } - break; - case D3D12_RESOURCE_DIMENSION_TEXTURE3D: - result.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE3D; - auto &target = result.Texture3D; - target.MostDetailedMip = getMipBase().index(); - target.MipLevels = getMipCount(); - target.ResourceMinLODClamp = 0.f; - break; - } - return result; - } - - D3D12_UNORDERED_ACCESS_VIEW_DESC asUAVDesc(D3D12_RESOURCE_DIMENSION dim) const - { - D3D12_UNORDERED_ACCESS_VIEW_DESC result; - result.Format = getFormat().asDxGiFormat(); - switch (dim) - { - case D3D12_RESOURCE_DIMENSION_BUFFER: - case D3D12_RESOURCE_DIMENSION_UNKNOWN: fatal("Usage error!"); return {}; - case D3D12_RESOURCE_DIMENSION_TEXTURE1D: - if (isArray) - { - result.ViewDimension = D3D12_UAV_DIMENSION_TEXTURE1DARRAY; - auto &target = result.Texture1DArray; - target.MipSlice = getMipBase().index(); - target.FirstArraySlice = getArrayBase().index(); - target.ArraySize = getArrayCount(); - } - else - { - result.ViewDimension = D3D12_UAV_DIMENSION_TEXTURE1D; - auto &target = result.Texture1D; - target.MipSlice = getMipBase().index(); - } - break; - case D3D12_RESOURCE_DIMENSION_TEXTURE2D: - // Array and cube are the same for UAV - if (isArray || isCubemap) - { - result.ViewDimension = D3D12_UAV_DIMENSION_TEXTURE2DARRAY; - auto &target = result.Texture2DArray; - target.MipSlice = getMipBase().index(); - target.FirstArraySlice = getArrayBase().index(); - target.ArraySize = getArrayCount(); - target.PlaneSlice = 0; - } - else - { - result.ViewDimension = D3D12_UAV_DIMENSION_TEXTURE2D; - auto &target = result.Texture2D; - target.MipSlice = getMipBase().index(); - target.PlaneSlice = 0; - } - break; - case 
D3D12_RESOURCE_DIMENSION_TEXTURE3D: - result.ViewDimension = D3D12_UAV_DIMENSION_TEXTURE3D; - auto &target = result.Texture3D; - target.MipSlice = getMipBase().index(); - target.FirstWSlice = getArrayBase().index(); - target.WSize = getArrayCount(); - break; - } - return result; - } - - D3D12_RENDER_TARGET_VIEW_DESC asRTVDesc(D3D12_RESOURCE_DIMENSION dim) const - { - D3D12_RENDER_TARGET_VIEW_DESC result; - result.Format = getFormat().asDxGiFormat(); - switch (dim) - { - case D3D12_RESOURCE_DIMENSION_BUFFER: - case D3D12_RESOURCE_DIMENSION_UNKNOWN: fatal("Usage error!"); return {}; - case D3D12_RESOURCE_DIMENSION_TEXTURE1D: - if (isArray) - { - result.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE1DARRAY; - auto &target = result.Texture1DArray; - target.MipSlice = getMipBase().index(); - target.FirstArraySlice = getArrayBase().index(); - target.ArraySize = getArrayCount(); - } - else - { - result.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE1D; - auto &target = result.Texture1D; - target.MipSlice = getMipBase().index(); - } - break; - case D3D12_RESOURCE_DIMENSION_TEXTURE2D: - if (isArray || isCubemap) - { - result.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE2DARRAY; - auto &target = result.Texture2DArray; - target.MipSlice = getMipBase().index(); - target.FirstArraySlice = getArrayBase().index(); - target.ArraySize = getArrayCount(); - target.PlaneSlice = 0; - } - else - { - result.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE2D; - auto &target = result.Texture2D; - target.MipSlice = getMipBase().index(); - target.PlaneSlice = 0; - } - break; - case D3D12_RESOURCE_DIMENSION_TEXTURE3D: - result.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE3D; - auto &target = result.Texture3D; - target.MipSlice = getMipBase().index(); - target.FirstWSlice = getArrayBase().index(); - target.WSize = getArrayCount(); - break; - } - - return result; - } - - D3D12_DEPTH_STENCIL_VIEW_DESC asDSVDesc(D3D12_RESOURCE_DIMENSION dim) const - { - D3D12_DEPTH_STENCIL_VIEW_DESC result; - auto fmt = 
getFormat(); - result.Format = fmt.asDxGiFormat(); - result.Flags = D3D12_DSV_FLAG_NONE; - if (getType() == DSV_CONST) - { - if (fmt.isDepth()) - result.Flags |= D3D12_DSV_FLAG_READ_ONLY_DEPTH; - if (fmt.isStencil()) - result.Flags |= D3D12_DSV_FLAG_READ_ONLY_STENCIL; - } - switch (dim) - { - case D3D12_RESOURCE_DIMENSION_BUFFER: - case D3D12_RESOURCE_DIMENSION_UNKNOWN: fatal("Usage error!"); return {}; - case D3D12_RESOURCE_DIMENSION_TEXTURE1D: - if (isArray) - { - result.ViewDimension = D3D12_DSV_DIMENSION_TEXTURE1DARRAY; - auto &target = result.Texture1DArray; - target.MipSlice = getMipBase().index(); - target.FirstArraySlice = getArrayBase().index(); - target.ArraySize = getArrayCount(); - } - else - { - result.ViewDimension = D3D12_DSV_DIMENSION_TEXTURE1D; - auto &target = result.Texture1D; - target.MipSlice = getMipBase().index(); - } - break; - case D3D12_RESOURCE_DIMENSION_TEXTURE2D: - if (isArray || isCubemap) - { - result.ViewDimension = D3D12_DSV_DIMENSION_TEXTURE2DARRAY; - auto &target = result.Texture2DArray; - target.MipSlice = getMipBase().index(); - target.FirstArraySlice = getArrayBase().index(); - target.ArraySize = getArrayCount(); - } - else - { - result.ViewDimension = D3D12_DSV_DIMENSION_TEXTURE2D; - auto &target = result.Texture2D; - target.MipSlice = getMipBase().index(); - } - break; - case D3D12_RESOURCE_DIMENSION_TEXTURE3D: fatal("DX12: Volume depth stencil view not supported"); break; - } - return result; - } - - void setFormat(FormatStore fmt) { format = fmt.index; } - FormatStore getFormat() const { return FormatStore(format); } - void setMipBase(MipMapIndex u) { mipmapOffset = u.index(); } - MipMapIndex getMipBase() const { return MipMapIndex::make(mipmapOffset); } - void setMipCount(uint8_t u) { mipmapRange = u - MIPMAP_RANGE_OFFSET; } - uint8_t getMipCount() const { return MIPMAP_RANGE_OFFSET + mipmapRange; } - void setSingleMipMapRange(MipMapIndex index) - { - setMipBase(index); - G_STATIC_ASSERT(1 == MIPMAP_RANGE_OFFSET); - 
mipmapRange = 0; - } - void setMipMapRange(MipMapIndex index, uint32_t count) - { - setMipBase(index); - setMipCount(count); - } - void setMipMapRange(MipMapRange range) - { - setMipBase(range.front()); - setMipCount(range.size()); - } - MipMapRange getMipRange() const { return MipMapRange::make(getMipBase(), getMipCount()); } - void setArrayBase(ArrayLayerIndex u) { arrayOffset = u.index(); } - ArrayLayerIndex getArrayBase() const { return ArrayLayerIndex::make(arrayOffset); } - void setArrayCount(uint16_t u) { arrayRange = u - ARRAY_RANGE_OFFSET; } - uint16_t getArrayCount() const { return ARRAY_RANGE_OFFSET + (uint32_t)arrayRange; } - void setArrayRange(ArrayLayerRange range) - { - setArrayBase(range.front()); - setArrayCount(range.size()); - } - void setSingleArrayRange(ArrayLayerIndex index) - { - setArrayBase(index); - G_STATIC_ASSERT(1 == ARRAY_RANGE_OFFSET); - arrayRange = 0; - } - ArrayLayerRange getArrayRange() const { return ArrayLayerRange::make(getArrayBase(), getArrayCount()); } - void setSingleDepthLayer(uint16_t base) - { - setArrayBase(ArrayLayerIndex::make(base)); - setArrayCount(1); - } - void setDepthLayerRange(uint16_t base, uint16_t count) - { - setArrayBase(ArrayLayerIndex::make(base)); - setArrayCount(count); - } - FormatPlaneIndex getPlaneIndex() const - { - return FormatPlaneIndex::make((sampleStencil && getFormat().getPlanes().count() > 1) ? 
1 : 0); - } - template - // NOTE: does not take plane index into account - void iterateSubresources(D3D12_RESOURCE_DIMENSION dim, MipMapCount mip_per_array, T clb) - { - if (D3D12_RESOURCE_DIMENSION_TEXTURE3D == dim) - { - for (auto m : getMipRange()) - { - clb(SubresourceIndex::make(m.index())); - } - } - else - { - for (auto a : getArrayRange()) - { - for (auto m : getMipRange()) - { - clb(calculate_subresource_index(m, a, mip_per_array)); - } - } - } - } -END_BITFIELD_TYPE() -inline bool operator==(ImageViewState l, ImageViewState r) { return l.wrapper.value == r.wrapper.value; } -inline bool operator!=(ImageViewState l, ImageViewState r) { return l.wrapper.value != r.wrapper.value; } -inline bool operator<(ImageViewState l, ImageViewState r) { return l.wrapper.value < r.wrapper.value; } - -// We have our own half float converter here, as we may want to tweak it a bit in the future -namespace half_float -{ -static constexpr uint32_t float_sign_mask = 0x80000000U; -static constexpr uint32_t float_exponent_mask = 0x7F800000U; -static constexpr uint32_t float_mantissa_mask = 0x7FFFFFU; -static constexpr uint32_t float_mantiassa_size = 23; -static constexpr int32_t float_mantiassa_bias = 127; -static constexpr int32_t bias = 15; -static constexpr int32_t exponent_size = 5; -static constexpr int32_t mantissa_size = 10; -static constexpr int32_t max_exponent = (1 << exponent_size) - 1; - -inline float convert_to_float(uint16_t v) -{ - int32_t exponent = int32_t(((v & 0x7FFFU)) >> mantissa_size); - bool isNan = exponent >= max_exponent; - uint32_t exponentPart = - exponent <= 0 ? 0U : (isNan ? float_exponent_mask : ((exponent - bias + float_mantiassa_bias) << float_mantiassa_size)); - uint32_t signPart = uint32_t(v & 0x8000U) << 16; - uint32_t fractionPart = isNan ? 
float_mantissa_mask : (uint32_t(v) << (float_mantiassa_size - mantissa_size)) & float_mantissa_mask; - uint32_t floatBits = signPart | exponentPart | fractionPart; - float floatValue; - memcpy(&floatValue, &floatBits, sizeof(float)); - return floatValue; -} - -inline uint16_t convert_from_float(float v) -{ - uint32_t floatBits; - memcpy(&floatBits, &v, sizeof(float)); - - int32_t exponent = ((floatBits & float_exponent_mask) >> float_mantiassa_size) - float_mantiassa_bias + bias; - uint32_t exponentPart = clamp(exponent, 0, max_exponent) << mantissa_size; - uint32_t signPart = ((floatBits & float_sign_mask) >> 16); - uint32_t fractionPart = exponent >= 0 ? ((floatBits & float_mantissa_mask) >> (float_mantiassa_size - mantissa_size)) : 0U; - - return signPart | exponentPart | fractionPart; -} -} // namespace half_float - -BEGIN_BITFIELD_TYPE(SamplerState, uint32_t) - enum - { - BIAS_BITS = 16, - BIAS_OFFSET = 0, - MIP_BITS = 1, - MIP_SHIFT = BIAS_OFFSET + BIAS_BITS, - FILTER_BITS = 1, - FILTER_SHIFT = MIP_SHIFT + MIP_BITS, - // Instead of using N bits per coord, we store all coords in one value, this safes 2 bits - COORD_VALUE_COUNT = (D3D12_TEXTURE_ADDRESS_MODE_MIRROR_ONCE - D3D12_TEXTURE_ADDRESS_MODE_WRAP) + 1, - COORD_MAX_VALUE = COORD_VALUE_COUNT * COORD_VALUE_COUNT * COORD_VALUE_COUNT, - COORD_BITS = BitsNeeded::VALUE, - COORD_SHIFT = FILTER_SHIFT + FILTER_BITS, - ANISO_BITS = 3, - ANISO_SHIFT = COORD_SHIFT + COORD_BITS, - BORDER_BITS = 2, - BORDER_SHIFT = ANISO_SHIFT + ANISO_BITS, - IS_COMPARE_BITS = 1, - IS_COMPARE_SHIFT = BORDER_BITS + BORDER_SHIFT, - FLOAT_EXP_BASE = 127, - FLOAT_EXP_SHIFT = 23, - }; - ADD_BITFIELD_MEMBER(mipMapMode, MIP_SHIFT, MIP_BITS) - ADD_BITFIELD_MEMBER(filterMode, FILTER_SHIFT, FILTER_BITS) - ADD_BITFIELD_MEMBER(borderColorMode, BORDER_SHIFT, BORDER_BITS) - ADD_BITFIELD_MEMBER(anisotropicValue, ANISO_SHIFT, ANISO_BITS) - ADD_BITFIELD_MEMBER(coordModes, COORD_SHIFT, COORD_BITS) - ADD_BITFIELD_MEMBER(biasBits, BIAS_OFFSET, BIAS_BITS) - 
ADD_BITFIELD_MEMBER(isCompare, IS_COMPARE_SHIFT, IS_COMPARE_BITS) - - BEGIN_BITFIELD_TYPE(iee754Float, uint32_t) - float asFloat; - uint32_t asUint; - int32_t asInt; - ADD_BITFIELD_MEMBER(mantissa, 0, 23) - ADD_BITFIELD_MEMBER(exponent, 23, 8) - ADD_BITFIELD_MEMBER(sign, 31, 1) - END_BITFIELD_TYPE() - - D3D12_SAMPLER_DESC asDesc() const - { - D3D12_SAMPLER_DESC result; - - result.MaxAnisotropy = getAnisoInt(); - if (result.MaxAnisotropy > 1) - result.Filter = D3D12_ENCODE_ANISOTROPIC_FILTER(static_cast(isCompare)); - else - result.Filter = D3D12_ENCODE_BASIC_FILTER(getFilter(), getFilter(), getMip(), static_cast(isCompare)); - - result.AddressU = getU(); - result.AddressV = getV(); - result.AddressW = getW(); - result.MipLODBias = getBias(); - result.ComparisonFunc = isCompare ? D3D12_COMPARISON_FUNC_LESS_EQUAL : D3D12_COMPARISON_FUNC_ALWAYS; - result.MinLOD = 0; - result.MaxLOD = FLT_MAX; - result.BorderColor[0] = static_cast(borderColorMode) & 1 ? 1.f : 0.f; - result.BorderColor[1] = static_cast(borderColorMode) & 1 ? 1.f : 0.f; - result.BorderColor[2] = static_cast(borderColorMode) & 1 ? 1.f : 0.f; - result.BorderColor[3] = static_cast(borderColorMode) & 2 ? 
1.f : 0.f; - - return result; - } - - void setMip(D3D12_FILTER_TYPE mip) { mipMapMode = (uint32_t)mip; } - D3D12_FILTER_TYPE getMip() const { return static_cast(static_cast(mipMapMode)); } - void setFilter(D3D12_FILTER_TYPE filter) { filterMode = filter; } - D3D12_FILTER_TYPE getFilter() const { return static_cast(static_cast(filterMode)); } - void setCoordModes(D3D12_TEXTURE_ADDRESS_MODE u, D3D12_TEXTURE_ADDRESS_MODE v, D3D12_TEXTURE_ADDRESS_MODE w) - { - auto rawU = static_cast(u) - D3D12_TEXTURE_ADDRESS_MODE_WRAP; - auto rawV = static_cast(v) - D3D12_TEXTURE_ADDRESS_MODE_WRAP; - auto rawW = static_cast(w) - D3D12_TEXTURE_ADDRESS_MODE_WRAP; - coordModes = rawW * COORD_VALUE_COUNT * COORD_VALUE_COUNT + rawV * COORD_VALUE_COUNT + rawU; - } - void setU(D3D12_TEXTURE_ADDRESS_MODE u) - { - auto oldRawU = coordModes % COORD_VALUE_COUNT; - auto newRawU = static_cast(u) - D3D12_TEXTURE_ADDRESS_MODE_WRAP; - coordModes -= oldRawU; - coordModes += newRawU; - } - void setV(D3D12_TEXTURE_ADDRESS_MODE v) - { - auto oldRawV = (coordModes / COORD_VALUE_COUNT) % COORD_VALUE_COUNT; - auto newRawV = static_cast(v) - D3D12_TEXTURE_ADDRESS_MODE_WRAP; - coordModes -= oldRawV * COORD_VALUE_COUNT; - coordModes += newRawV * COORD_VALUE_COUNT; - } - void setW(D3D12_TEXTURE_ADDRESS_MODE w) - { - auto oldRawW = (coordModes / COORD_VALUE_COUNT / COORD_VALUE_COUNT) % COORD_VALUE_COUNT; - auto newRawW = static_cast(w) - D3D12_TEXTURE_ADDRESS_MODE_WRAP; - coordModes -= oldRawW * COORD_VALUE_COUNT * COORD_VALUE_COUNT; - coordModes += newRawW * COORD_VALUE_COUNT * COORD_VALUE_COUNT; - } - D3D12_TEXTURE_ADDRESS_MODE getU() const - { - auto rawValue = coordModes % COORD_VALUE_COUNT; - return static_cast(D3D12_TEXTURE_ADDRESS_MODE_WRAP + rawValue); - } - D3D12_TEXTURE_ADDRESS_MODE getV() const - { - auto rawValue = (coordModes / COORD_VALUE_COUNT) % COORD_VALUE_COUNT; - return static_cast(D3D12_TEXTURE_ADDRESS_MODE_WRAP + rawValue); - } - D3D12_TEXTURE_ADDRESS_MODE getW() const - { - auto rawValue = 
(coordModes / COORD_VALUE_COUNT) / COORD_VALUE_COUNT; - return static_cast(D3D12_TEXTURE_ADDRESS_MODE_WRAP + rawValue); - } - void setBias(float b) { biasBits = half_float::convert_from_float(b); } - float getBias() const { return half_float::convert_to_float(biasBits); } - void setAniso(float a) - { - // some float magic, falls flat on its face if it is not ieee-754 - // extracts exponent and subtracts the base - // clamps the result into range from 0 to 4 which represents 1,2,4,8 and 16 as floats - // negative values are treated as positive - // values from range 0 - 1 are rounded up - // everything else is rounded down - iee754Float f; - f.asFloat = a; - int32_t value = f.exponent - FLOAT_EXP_BASE; - // clamp from 1 to 16 - value = clamp(value, 0, 4); - anisotropicValue = value; - } - float getAniso() const - { - iee754Float f; - f.exponent = FLOAT_EXP_BASE + anisotropicValue; - return f.asFloat; - } - uint32_t getAnisoInt() const { return 1u << static_cast(anisotropicValue); } - // Same restrictions as with vulkan, either color is white or black and its either fully - // transparent or opaque - void setBorder(E3DCOLOR color) { borderColorMode = ((color.r || color.g || color.b) ? 1 : 0) | (color.a ? 2 : 0); } - E3DCOLOR getBorder() const - { - E3DCOLOR result; - result.r = static_cast(borderColorMode) & 1 ? 0xFF : 0; - result.g = static_cast(borderColorMode) & 1 ? 0xFF : 0; - result.b = static_cast(borderColorMode) & 1 ? 0xFF : 0; - result.a = static_cast(borderColorMode) & 2 ? 
0xFF : 0; - return result; - } - - bool needsBorderColor() const - { - return (D3D12_TEXTURE_ADDRESS_MODE_BORDER == getU()) || (D3D12_TEXTURE_ADDRESS_MODE_BORDER == getV()) || - (D3D12_TEXTURE_ADDRESS_MODE_BORDER == getW()); - } - - bool normalizeSelf() - { - bool wasNormalized = false; - if (!needsBorderColor()) - { - setBorder(0); - wasNormalized = true; - } - return wasNormalized; - } - - SamplerState normalize() const - { - // normalization is when border color is not needed we default to color 0 - SamplerState copy = *this; - copy.normalizeSelf(); - return copy; - } - - static SamplerState fromSamplerInfo(const d3d::SamplerInfo &info); -END_BITFIELD_TYPE() - -struct ImageViewInfo -{ - D3D12_CPU_DESCRIPTOR_HANDLE handle; - ImageViewState state; -}; - - -template -constexpr inline size_t array_size(T (&)[N]) -{ - return N; -} - -inline D3D12_PRIMITIVE_TOPOLOGY translate_primitive_topology_to_dx12(int value) -{ - // G_ASSERTF(value < PRIM_TRIFAN, "primitive topology was %u", value); -#if _TARGET_XBOX - if (value == PRIM_QUADLIST) - return D3D_PRIMITIVE_TOPOLOGY_QUADLIST; -#endif - return static_cast(value); -} - -template -class TaggedHandle -{ - H h; // -V730_NOINIT - -public: - bool interlockedIsNull() const { return interlocked_acquire_load(h) == NullValue; } - void interlockedSet(H v) { interlocked_release_store(h, v); } - H get() const { return h; } - explicit TaggedHandle(H a) : h(a) {} - TaggedHandle() {} - bool operator!() const { return h == NullValue; } - friend bool operator==(TaggedHandle l, TaggedHandle r) { return l.get() == r.get(); } - friend bool operator!=(TaggedHandle l, TaggedHandle r) { return l.get() != r.get(); } - friend bool operator<(TaggedHandle l, TaggedHandle r) { return l.get() < r.get(); } - friend bool operator>(TaggedHandle l, TaggedHandle r) { return l.get() > r.get(); } - friend H operator-(TaggedHandle l, TaggedHandle r) { return l.get() - r.get(); } - static TaggedHandle Null() { return TaggedHandle(NullValue); } - static 
TaggedHandle make(H value) { return TaggedHandle{value}; } -}; - -template -inline TaggedHandle genereate_next_id(TaggedHandle last_id) -{ - auto value = last_id.get() + 1; - if (value == NullValue) - ++value; - G_ASSERT(value != NullValue); - return TaggedHandle{value}; -} - -struct GraphicsProgramIDTag -{}; -struct InputLayoutIDTag -{}; -struct StaticRenderStateIDTag -{}; -struct InternalInputLayoutIDTag -{}; -struct FramebufferLayoutIDTag -{}; -struct BindlessSetIdTag; - -typedef TaggedHandle InputLayoutID; -typedef TaggedHandle StaticRenderStateID; -using InternalInputLayoutID = TaggedHandle; -using FramebufferLayoutID = TaggedHandle; -using BindlessSetId = TaggedHandle; - -// comes from ShaderID::group member having 3 bits -> 0-7 -static constexpr uint32_t max_scripted_shaders_bin_groups = 8; - -class ShaderByteCodeId -{ - union - { - uint32_t value; - struct - { - uint32_t group : 3; - uint32_t index : 29; - }; - }; - -public: - // ShaderByteCodeId() = default; - // explicit ShaderByteCodeId(uint32_t value) : value{value} {} - explicit operator bool() const { return -1 != value; } - bool operator!() const { return -1 == value; } - - uint32_t getGroup() const { return group; } - uint32_t getIndex() const { return index; } - - int32_t exportValue() const { return value; } - - static ShaderByteCodeId Null() - { - ShaderByteCodeId result; - result.value = -1; - return result; - } - - static ShaderByteCodeId make(uint32_t group, uint32_t index) - { - ShaderByteCodeId result; - result.group = group; - result.index = index; - return result; - } - - friend bool operator<(const ShaderByteCodeId l, const ShaderByteCodeId r) { return l.value < r.value; } - friend bool operator>(const ShaderByteCodeId l, const ShaderByteCodeId r) { return l.value > r.value; } - friend bool operator!=(const ShaderByteCodeId l, const ShaderByteCodeId r) { return l.value != r.value; } - friend bool operator==(const ShaderByteCodeId l, const ShaderByteCodeId r) { return l.value == r.value; } 
-}; - -class ShaderID -{ - union - { - int32_t value; - struct - { - uint32_t group : 3; - uint32_t index : 29; - }; - }; - -public: - // ShaderID() = default; - // explicit ShaderID(uint32_t value) : value{value} {} - explicit operator bool() const { return -1 != value; } - bool operator!() const { return -1 == value; } - - uint32_t getGroup() const { return group; } - uint32_t getIndex() const { return index; } - - int32_t exportValue() const { return value; } - - static ShaderID Null() - { - ShaderID result; - result.value = -1; - return result; - } - - static ShaderID importValue(int32_t value) - { - ShaderID result; - result.value = value; - return result; - } - - static ShaderID make(uint32_t group, uint32_t index) - { - ShaderID result; - result.group = group; - result.index = index; - return result; - } - - friend bool operator<(const ShaderID l, const ShaderID r) { return l.value < r.value; } - - friend bool operator>(const ShaderID l, const ShaderID r) { return l.value > r.value; } - - friend bool operator!=(const ShaderID l, const ShaderID r) { return l.value != r.value; } - - friend bool operator==(const ShaderID l, const ShaderID r) { return l.value == r.value; } -}; - -class ProgramID -{ - union - { - int32_t value; - struct - { - uint32_t type : 2; - uint32_t group : 3; - uint32_t index : 27; - }; - }; - -public: - bool operator!() const { return -1 == value; } - explicit operator bool() const { return -1 == value; } - - uint32_t getType() const { return type; } - uint32_t getIndex() const { return index; } - uint32_t getGroup() const { return group; } - - int32_t exportValue() const { return value; } - - static constexpr uint32_t type_graphics = 0; - static constexpr uint32_t type_compute = 1; - static constexpr uint32_t type_raytrace = 2; - - bool isGraphics() const { return type_graphics == type; } - bool isCompute() const { return type_compute == type; } - bool isRaytrace() const { return type_raytrace == type; } - - static ProgramID Null() - { - 
ProgramID result; - result.value = -1; - return result; - } - - static ProgramID importValue(int32_t value) - { - ProgramID result; - result.value = value; - return result; - } - - static ProgramID asGraphicsProgram(uint32_t group, uint32_t index) - { - ProgramID result; - result.type = type_graphics; - result.group = group; - result.index = index; - return result; - } - - static ProgramID asComputeProgram(uint32_t group, uint32_t index) - { - ProgramID result; - result.type = type_compute; - result.group = group; - result.index = index; - return result; - } - - static ProgramID asRaytraceProgram(uint32_t group, uint32_t index) - { - ProgramID result; - result.type = type_raytrace; - result.group = group; - result.index = index; - return result; - } - - friend bool operator<(const ProgramID l, const ProgramID r) { return l.value < r.value; } - - friend bool operator>(const ProgramID l, const ProgramID r) { return l.value > r.value; } - - friend bool operator!=(const ProgramID l, const ProgramID r) { return l.value != r.value; } - - friend bool operator==(const ProgramID l, const ProgramID r) { return l.value == r.value; } -}; - -class GraphicsProgramID -{ - union - { - int32_t value; - struct - { - uint32_t group : 3; - uint32_t index : 29; - }; - }; - -public: - bool operator!() const { return -1 == value; } - explicit operator bool() const { return -1 == value; } - - uint32_t getIndex() const { return index; } - uint32_t getGroup() const { return group; } - - int32_t exportValue() const { return value; } - - static GraphicsProgramID Null() - { - GraphicsProgramID result; - result.value = -1; - return result; - } - - static GraphicsProgramID importValue(int32_t value) - { - GraphicsProgramID result; - result.value = value; - return result; - } - - static GraphicsProgramID make(uint32_t group, uint32_t index) - { - GraphicsProgramID result; - result.group = group; - result.index = index; - return result; - } - - friend bool operator<(const GraphicsProgramID l, 
const GraphicsProgramID r) { return l.value < r.value; } - - friend bool operator>(const GraphicsProgramID l, const GraphicsProgramID r) { return l.value > r.value; } - - friend bool operator!=(const GraphicsProgramID l, const GraphicsProgramID r) { return l.value != r.value; } - - friend bool operator==(const GraphicsProgramID l, const GraphicsProgramID r) { return l.value == r.value; } -}; - - -// TODO rename some of the names to better resemble what they are intended for -enum class DeviceMemoryClass -{ - DEVICE_RESIDENT_IMAGE, - DEVICE_RESIDENT_BUFFER, - // linear cpu cached textures - HOST_RESIDENT_HOST_READ_WRITE_IMAGE, - // linear cpu non-cached textures - HOST_RESIDENT_HOST_WRITE_ONLY_IMAGE, - HOST_RESIDENT_HOST_READ_WRITE_BUFFER, - - HOST_RESIDENT_HOST_READ_ONLY_BUFFER, - HOST_RESIDENT_HOST_WRITE_ONLY_BUFFER, - // special AMD memory type, - // a portion of gpu mem is host - // visible (256mb). - DEVICE_RESIDENT_HOST_WRITE_ONLY_BUFFER, - // we handle memory for push ring buffer differently than any other - PUSH_RING_BUFFER, - TEMPORARY_UPLOAD_BUFFER, - - READ_BACK_BUFFER, - BIDIRECTIONAL_BUFFER, - - RESERVED_RESOURCE, - -#if DX12_USE_ESRAM - ESRAM_RESOURCE, -#endif - -#if D3D_HAS_RAY_TRACING - DEVICE_RESIDENT_ACCELERATION_STRUCTURE, -#endif - - COUNT, - INVALID = COUNT -}; - -inline constexpr UINT calculate_subresource_index(UINT mip_slice, UINT array_slice, UINT plane_slice, UINT mip_size, UINT array_size) -{ - return mip_slice + (array_slice * mip_size) + (plane_slice * mip_size * array_size); -} - -inline constexpr UINT calculate_mip_slice_from_index(UINT index, UINT mip_size) { return index % mip_size; } - -inline constexpr UINT calculate_array_slice_from_index(UINT index, UINT mip_size, UINT array_size) -{ - return (index / mip_size) % array_size; -} - -inline constexpr UINT calculate_plane_slice_from_index(UINT index, UINT mip_size, UINT array_size) -{ - return (index / mip_size) / array_size; -} - -inline D3D12_COMPARISON_FUNC 
translate_compare_func_to_dx12(int cmp) { return static_cast(cmp); } -inline D3D12_STENCIL_OP translate_stencil_op_to_dx12(int so) { return static_cast(so); } -inline D3D12_BLEND translate_alpha_blend_mode_to_dx12(int b) { return static_cast(b); } -inline D3D12_BLEND translate_rgb_blend_mode_to_dx12(int b) { return static_cast(b); } -inline D3D12_BLEND_OP translate_blend_op_to_dx12(int bo) { return static_cast(bo); } -inline D3D12_TEXTURE_ADDRESS_MODE translate_texture_address_mode_to_dx12(int mode) -{ - return static_cast(mode); -} -inline int translate_texture_address_mode_to_engine(D3D12_TEXTURE_ADDRESS_MODE mode) { return static_cast(mode); } -inline D3D12_FILTER_TYPE translate_filter_type_to_dx12(int ft) -{ - return (ft == TEXFILTER_POINT || ft == TEXFILTER_NONE) ? D3D12_FILTER_TYPE_POINT : D3D12_FILTER_TYPE_LINEAR; -} -inline D3D12_FILTER_TYPE translate_mip_filter_type_to_dx12(int ft) -{ - return (ft == TEXMIPMAP_POINT || ft == TEXMIPMAP_NONE) ? D3D12_FILTER_TYPE_POINT : D3D12_FILTER_TYPE_LINEAR; -} - -inline SamplerState SamplerState::fromSamplerInfo(const d3d::SamplerInfo &info) -{ - SamplerState state; - state.isCompare = info.filter_mode == d3d::FilterMode::Compare; - state.setFilter(translate_filter_type_to_dx12(static_cast(info.filter_mode))); - state.setMip(translate_mip_filter_type_to_dx12(static_cast(info.mip_map_mode))); - state.setU(translate_texture_address_mode_to_dx12(static_cast(info.address_mode_u))); - state.setV(translate_texture_address_mode_to_dx12(static_cast(info.address_mode_v))); - state.setW(translate_texture_address_mode_to_dx12(static_cast(info.address_mode_w))); - state.setBias(info.mip_map_bias); - state.setAniso(info.anisotropic_max); - state.setBorder(info.border_color); - return state; -} - -class Device; -Device &get_device(); - -inline const Offset3D &toOffset(const Extent3D &ext) -{ - // sanity checks - G_STATIC_ASSERT(offsetof(Extent3D, width) == offsetof(Offset3D, x)); - G_STATIC_ASSERT(offsetof(Extent3D, height) == 
offsetof(Offset3D, y)); - G_STATIC_ASSERT(offsetof(Extent3D, depth) == offsetof(Offset3D, z)); - return reinterpret_cast(ext); -} -} // namespace drv3d_dx12 - -inline uint32_t nextPowerOfTwo(uint32_t u) -{ - --u; - u |= u >> 1; - u |= u >> 2; - u |= u >> 4; - u |= u >> 8; - u |= u >> 16; - return ++u; -} - -inline bool operator==(D3D12_CPU_DESCRIPTOR_HANDLE l, D3D12_CPU_DESCRIPTOR_HANDLE r) { return l.ptr == r.ptr; } - -inline bool operator!=(D3D12_CPU_DESCRIPTOR_HANDLE l, D3D12_CPU_DESCRIPTOR_HANDLE r) { return !(l == r); } - -inline D3D12_PRIMITIVE_TOPOLOGY pimitive_type_to_primtive_topology(D3D_PRIMITIVE pt, D3D12_PRIMITIVE_TOPOLOGY initial) -{ - if (pt >= D3D_PRIMITIVE_1_CONTROL_POINT_PATCH && pt <= D3D_PRIMITIVE_32_CONTROL_POINT_PATCH) - return static_cast( - D3D_PRIMITIVE_TOPOLOGY_1_CONTROL_POINT_PATCHLIST + pt - D3D_PRIMITIVE_1_CONTROL_POINT_PATCH); - return initial; -} - -inline D3D12_PRIMITIVE_TOPOLOGY_TYPE topology_to_topology_type(D3D12_PRIMITIVE_TOPOLOGY top) -{ - if (D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST <= top && D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP_ADJ >= top) - return D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; - if (D3D_PRIMITIVE_TOPOLOGY_POINTLIST == top) - return D3D12_PRIMITIVE_TOPOLOGY_TYPE_POINT; - if (D3D_PRIMITIVE_TOPOLOGY_LINELIST == top || D3D_PRIMITIVE_TOPOLOGY_LINESTRIP == top) - return D3D12_PRIMITIVE_TOPOLOGY_TYPE_LINE; - if (D3D_PRIMITIVE_TOPOLOGY_UNDEFINED == top) - return D3D12_PRIMITIVE_TOPOLOGY_TYPE_UNDEFINED; -#if _TARGET_XBOX - if (D3D_PRIMITIVE_TOPOLOGY_QUADLIST == top) - return PRIMITIVE_TOPOLOGY_TYPE_QUAD; -#endif - return D3D12_PRIMITIVE_TOPOLOGY_TYPE_PATCH; -} - -inline D3D12_GPU_DESCRIPTOR_HANDLE operator+(D3D12_GPU_DESCRIPTOR_HANDLE l, uint64_t r) { return {l.ptr + r}; } - -inline D3D12_CPU_DESCRIPTOR_HANDLE operator+(D3D12_CPU_DESCRIPTOR_HANDLE l, size_t r) { return {l.ptr + r}; } - -inline wchar_t *lazyToWchar(const char *str, wchar_t *buf, size_t max_len) -{ - auto ed = str + max_len - 1; - auto at = buf; - for (; *str && 
str != ed; ++str, ++at) - *at = *str; - *at = L'\0'; - return buf; -} - - -inline bool operator==(D3D12_GPU_DESCRIPTOR_HANDLE l, D3D12_GPU_DESCRIPTOR_HANDLE r) { return l.ptr == r.ptr; } - -inline bool operator!=(D3D12_GPU_DESCRIPTOR_HANDLE l, D3D12_GPU_DESCRIPTOR_HANDLE r) { return !(l == r); } - -#define D3D12_ERROR_INVALID_HOST_EXE_SDK_VERSION _HRESULT_TYPEDEF_(0x887E0003L) - -namespace drv3d_dx12 -{ -void report_oom_info(); -void set_last_error(HRESULT error); -HRESULT get_last_error_code(); -inline const char *dxgi_error_code_to_string(HRESULT ec) -{ -#define ENUM_CASE(Name) \ - case Name: return #Name - switch (ec) - { - ENUM_CASE(E_FAIL); // returned by init code if a step fails in a fatal way - ENUM_CASE(DXGI_ERROR_INVALID_CALL); - ENUM_CASE(DXGI_ERROR_NOT_FOUND); - ENUM_CASE(DXGI_ERROR_MORE_DATA); - ENUM_CASE(DXGI_ERROR_UNSUPPORTED); - ENUM_CASE(DXGI_ERROR_DEVICE_REMOVED); - ENUM_CASE(DXGI_ERROR_DEVICE_HUNG); - ENUM_CASE(DXGI_ERROR_DEVICE_RESET); - ENUM_CASE(DXGI_ERROR_WAS_STILL_DRAWING); - ENUM_CASE(DXGI_ERROR_FRAME_STATISTICS_DISJOINT); - ENUM_CASE(DXGI_ERROR_GRAPHICS_VIDPN_SOURCE_IN_USE); - ENUM_CASE(DXGI_ERROR_DRIVER_INTERNAL_ERROR); - ENUM_CASE(DXGI_ERROR_NONEXCLUSIVE); - ENUM_CASE(DXGI_ERROR_NOT_CURRENTLY_AVAILABLE); - ENUM_CASE(DXGI_ERROR_REMOTE_CLIENT_DISCONNECTED); - ENUM_CASE(DXGI_ERROR_REMOTE_OUTOFMEMORY); - ENUM_CASE(DXGI_ERROR_ACCESS_LOST); - ENUM_CASE(DXGI_ERROR_WAIT_TIMEOUT); - ENUM_CASE(DXGI_ERROR_SESSION_DISCONNECTED); - ENUM_CASE(DXGI_ERROR_RESTRICT_TO_OUTPUT_STALE); - ENUM_CASE(DXGI_ERROR_CANNOT_PROTECT_CONTENT); - ENUM_CASE(DXGI_ERROR_ACCESS_DENIED); - ENUM_CASE(DXGI_ERROR_NAME_ALREADY_EXISTS); - ENUM_CASE(DXGI_STATUS_UNOCCLUDED); - ENUM_CASE(DXGI_STATUS_DDA_WAS_STILL_DRAWING); - ENUM_CASE(DXGI_ERROR_MODE_CHANGE_IN_PROGRESS); - ENUM_CASE(E_INVALIDARG); - ENUM_CASE(E_OUTOFMEMORY); -#if _TARGET_PC_WIN - ENUM_CASE(D3D12_ERROR_ADAPTER_NOT_FOUND); - ENUM_CASE(D3D12_ERROR_DRIVER_VERSION_MISMATCH); - 
ENUM_CASE(D3D12_ERROR_INVALID_HOST_EXE_SDK_VERSION); -#endif - } -#undef ENUM_CASE - - return ""; -} - -inline HRESULT dx12_check_result_no_oom_report(HRESULT result, const char *DAGOR_HAS_LOGS(expr), const char *DAGOR_HAS_LOGS(file), - int DAGOR_HAS_LOGS(line)) -{ - if (SUCCEEDED(result)) - return result; - - set_last_error(result); - - auto resultStr = dxgi_error_code_to_string(result); - if ('\0' == resultStr[0]) - { - logerr("%s returned unknown return code %u, %s %u", expr, result, file, line); - } - else - { - logerr("%s returned %s, %s %u", expr, resultStr, file, line); - } - - return result; -} - -inline bool is_oom_error_code(HRESULT result) { return E_OUTOFMEMORY == result; } - -inline HRESULT dx12_check_result(HRESULT result, const char *DAGOR_HAS_LOGS(expr), const char *DAGOR_HAS_LOGS(file), - int DAGOR_HAS_LOGS(line)) -{ - if (SUCCEEDED(result)) - return result; - - if (is_oom_error_code(result)) - { - report_oom_info(); - } - - set_last_error(result); - - auto resultStr = dxgi_error_code_to_string(result); - if ('\0' == resultStr[0]) - { - logerr("%s returned unknown return code %u, %s %u", expr, result, file, line); - } - else - { - logerr("%s returned %s, %s %u", expr, resultStr, file, line); - } - - return result; -} - -inline bool is_recoverable_error(HRESULT error) -{ - switch (error) - { - default: return true; - // any device error is not recoverable - case DXGI_ERROR_DEVICE_REMOVED: - case DXGI_ERROR_DEVICE_HUNG: - case DXGI_ERROR_DEVICE_RESET: return false; - } -} - -inline HRESULT dx12_debug_result(HRESULT result, const char *DAGOR_HAS_LOGS(expr), const char *DAGOR_HAS_LOGS(file), - int DAGOR_HAS_LOGS(line)) -{ - if (SUCCEEDED(result)) - return result; - - set_last_error(result); - - auto resultStr = dxgi_error_code_to_string(result); - if ('\0' == resultStr[0]) - { - debug("%s returned unknown return code %u, %s %u", expr, result, file, line); - } - else - { - debug("%s returned %s, %s %u", expr, resultStr, file, line); - } - - return 
result; -} -} // namespace drv3d_dx12 - -#define DX12_DEBUG_RESULT(r) drv3d_dx12::dx12_debug_result(r, #r, __FILE__, __LINE__) -#define DX12_DEBUG_OK(r) SUCCEEDED(DX12_DEBUG_RESULT(r)) -#define DX12_DEBUG_FAIL(r) FAILED(DX12_DEBUG_RESULT(r)) - -#define DX12_CHECK_RESULT(r) drv3d_dx12::dx12_check_result(r, #r, __FILE__, __LINE__) -#define DX12_CHECK_OK(r) SUCCEEDED(DX12_CHECK_RESULT(r)) -#define DX12_CHECK_FAIL(r) FAILED(DX12_CHECK_RESULT(r)) -#define DX12_EXIT_ON_FAIL(r) \ - if (DX12_CHECK_FAIL(r)) \ - { \ - /* no-op */ \ - } - -#define DX12_CHECK_RESULT_NO_OOM_CHECK(r) drv3d_dx12::dx12_check_result_no_oom_report(r, #r, __FILE__, __LINE__) - -struct UnloadLibHandler -{ - typedef HMODULE pointer; - void operator()(HMODULE lib) - { - if (lib) - { - FreeLibrary(lib); - } - } -}; - -using LibPointer = eastl::unique_ptr; - -struct GenericHandleHandler -{ - typedef HANDLE pointer; - void operator()(pointer h) - { - if (h != nullptr && h != INVALID_HANDLE_VALUE) - CloseHandle(h); - } -}; - -using HandlePointer = eastl::unique_ptr; -using EventPointer = eastl::unique_ptr; - -struct VirtaulAllocMemoryHandler -{ - void operator()(void *ptr) { VirtualFree(ptr, 0, MEM_RELEASE); } -}; - -template -using VirtualAllocPtr = eastl::unique_ptr; - -// TODO: move this to utils -inline D3D12_RECT asRect(const D3D12_VIEWPORT &vp) -{ - D3D12_RECT rect; - rect.left = vp.TopLeftX; - rect.top = vp.TopLeftY; - rect.right = vp.TopLeftX + vp.Width; - rect.bottom = vp.TopLeftY + vp.Height; - return rect; -} -#if !_TARGET_XBOXONE -inline bool operator==(const D3D12_RECT &l, const D3D12_RECT &r) -{ - return l.left == r.left && l.top == r.top && l.right == r.right && l.bottom == r.bottom; -} -inline bool operator!=(const D3D12_RECT &l, const D3D12_RECT &r) { return !(l == r); } -#endif - -template -inline D3D12_RANGE asDx12Range(ValueRange range) -{ - return {static_cast(range.front()), static_cast(range.back() + 1)}; -} - -struct MemoryStatus -{ - size_t allocatedBytes = 0; - size_t freeBytes = 
0; - // memory still allocated that is kept for fast allocation or freed when another chunk is - // freed. eg. zombie heaps. - size_t reservedBytes = 0; -}; - -inline MemoryStatus operator+(const MemoryStatus &l, const MemoryStatus &r) -{ - MemoryStatus f = l; - f.allocatedBytes += r.allocatedBytes; - f.freeBytes += r.freeBytes; - f.reservedBytes += r.reservedBytes; - return f; -} - -#if !_TARGET_XBOXONE -inline D3D12_SHADING_RATE_COMBINER map_shading_rate_combiner_to_dx12(VariableRateShadingCombiner combiner) -{ - G_STATIC_ASSERT(D3D12_SHADING_RATE_COMBINER_PASSTHROUGH == static_cast(VariableRateShadingCombiner::VRS_PASSTHROUGH)); - G_STATIC_ASSERT(D3D12_SHADING_RATE_COMBINER_OVERRIDE == static_cast(VariableRateShadingCombiner::VRS_OVERRIDE)); - G_STATIC_ASSERT(D3D12_SHADING_RATE_COMBINER_MIN == static_cast(VariableRateShadingCombiner::VRS_MIN)); - G_STATIC_ASSERT(D3D12_SHADING_RATE_COMBINER_MAX == static_cast(VariableRateShadingCombiner::VRS_MAX)); - G_STATIC_ASSERT(D3D12_SHADING_RATE_COMBINER_SUM == static_cast(VariableRateShadingCombiner::VRS_SUM)); - return static_cast(combiner); -} - -inline D3D12_SHADING_RATE make_shading_rate_from_int_values(unsigned x, unsigned y) -{ - G_ASSERTF_RETURN(x <= 4 && y <= 4, D3D12_SHADING_RATE_1X1, "Variable Shading Rate can not exceed 4"); - G_ASSERTF_RETURN(x != 3 && y != 3, D3D12_SHADING_RATE_1X1, "Variable Shading Rate can not be 3"); - G_ASSERTF_RETURN(abs(int(x / 2) - int(y / 2)) < 2, D3D12_SHADING_RATE_1X1, - "Variable Shading Rate invalid combination of x=%u and y=%u shading rates", x, y); - G_STATIC_ASSERT(D3D12_SHADING_RATE_X_AXIS_SHIFT == 2); - G_STATIC_ASSERT(D3D12_SHADING_RATE_VALID_MASK == 3); - // simple formula (x-rate / 2) << 2 | (y-rage / 2) - // valid range for x and y are 1, 2 and 4 - return static_cast(((x >> 1) << D3D12_SHADING_RATE_X_AXIS_SHIFT) | (y >> 1)); -} -#endif - -template -inline eastl::bitset &or_bit(eastl::bitset &s, size_t i, bool b = true) -{ - return s.set(i, s.test(i) || b); -} - -// NOTE: 
This is intended for debug only, this is possibly slow, so use with care! -template -inline char *get_resource_name(ID3D12Resource *res, char (&cbuf)[N]) -{ -#if !_TARGET_XBOXONE - wchar_t wcbuf[N]; - UINT cnt = sizeof(wcbuf); - res->GetPrivateData(WKPDID_D3DDebugObjectNameW, &cnt, wcbuf); - eastl::copy(wcbuf, wcbuf + cnt / sizeof(wchar_t), cbuf); - cbuf[min(cnt, N - 1)] = '\0'; -#else - G_UNUSED(res); - cbuf[0] = 0; -#endif - return cbuf; -} - -template -inline char *append_literal(char *at, char *ed, const char (&lit)[N]) -{ - auto d = static_cast(ed - at); - auto c = min(d, N - 1); - memcpy(at, lit, c); - return at + c; -} - -template -inline char *append_or_mask_value_name(char *beg, char *at, char *ed, const char (&name)[N]) -{ - if (beg != at) - { - at = append_literal(at, ed, " | "); - } - return append_literal(at, ed, name); -} - -template -inline char *resource_state_mask_as_string(D3D12_RESOURCE_STATES mask, char (&cbuf)[N]) -{ - auto at = cbuf; - auto ed = cbuf + N - 1; - if (mask == D3D12_RESOURCE_STATE_COMMON) - { - at = append_literal(at, ed, "COMMON"); - } - else - { -#define CHECK_MASK(name) \ - if (D3D12_RESOURCE_STATE_##name == (mask & D3D12_RESOURCE_STATE_##name)) \ - { \ - at = append_or_mask_value_name(cbuf, at, ed, #name); \ - mask ^= D3D12_RESOURCE_STATE_##name; \ - } - // combined state, has to be first - CHECK_MASK(GENERIC_READ) - // single state - CHECK_MASK(VERTEX_AND_CONSTANT_BUFFER) - CHECK_MASK(INDEX_BUFFER) - CHECK_MASK(RENDER_TARGET) - CHECK_MASK(UNORDERED_ACCESS) - CHECK_MASK(DEPTH_WRITE) - CHECK_MASK(DEPTH_READ) - CHECK_MASK(NON_PIXEL_SHADER_RESOURCE) - CHECK_MASK(PIXEL_SHADER_RESOURCE) - CHECK_MASK(STREAM_OUT) - CHECK_MASK(INDIRECT_ARGUMENT) - CHECK_MASK(COPY_DEST) - CHECK_MASK(COPY_SOURCE) - CHECK_MASK(RESOLVE_DEST) - CHECK_MASK(RESOLVE_SOURCE) -#if !_TARGET_XBOXONE - CHECK_MASK(RAYTRACING_ACCELERATION_STRUCTURE) - CHECK_MASK(SHADING_RATE_SOURCE) -#endif - CHECK_MASK(PREDICATION) - CHECK_MASK(VIDEO_DECODE_READ) - 
CHECK_MASK(VIDEO_DECODE_WRITE) - CHECK_MASK(VIDEO_PROCESS_READ) - CHECK_MASK(VIDEO_PROCESS_WRITE) - CHECK_MASK(VIDEO_ENCODE_READ) - CHECK_MASK(VIDEO_ENCODE_WRITE) -#undef CHECK_MASK - } - *at = '\0'; - return cbuf; -} - -inline const char *to_string(D3D12_RESOURCE_DIMENSION dim) -{ - switch (dim) - { - default: return ""; - case D3D12_RESOURCE_DIMENSION_UNKNOWN: return "unknown"; - case D3D12_RESOURCE_DIMENSION_BUFFER: return "buffer"; - case D3D12_RESOURCE_DIMENSION_TEXTURE1D: return "texture 1D"; - case D3D12_RESOURCE_DIMENSION_TEXTURE2D: return "texture 2D"; - case D3D12_RESOURCE_DIMENSION_TEXTURE3D: return "texture 3D"; - } -} - -inline const char *to_string(D3D12_HEAP_TYPE type) -{ - switch (type) - { - case D3D12_HEAP_TYPE_DEFAULT: return "default"; - case D3D12_HEAP_TYPE_UPLOAD: return "upload"; - case D3D12_HEAP_TYPE_READBACK: return "read back"; - case D3D12_HEAP_TYPE_CUSTOM: return "custom"; - } - return "??"; -} - -inline const char *to_string(D3D12_CPU_PAGE_PROPERTY property) -{ - switch (property) - { - case D3D12_CPU_PAGE_PROPERTY_UNKNOWN: return "unknown"; - case D3D12_CPU_PAGE_PROPERTY_NOT_AVAILABLE: return "not available"; - case D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE: return "write combine"; - case D3D12_CPU_PAGE_PROPERTY_WRITE_BACK: return "write back"; - } - return "??"; -} - -inline const char *to_string(D3D12_MEMORY_POOL pool) -{ - switch (pool) - { - case D3D12_MEMORY_POOL_UNKNOWN: return "unknown"; - case D3D12_MEMORY_POOL_L0: return "L0"; - case D3D12_MEMORY_POOL_L1: return "L1"; - } - return "??"; -} - -inline const char *get_unit_name(uint32_t index) -{ - static const char *unitTable[] = {"Bytes", "KiBytes", "MiBytes", "GiBytes"}; - return unitTable[index]; -} - -inline uint32_t size_to_unit_table(uint64_t sz) -{ - uint32_t unitIndex = 0; - unitIndex += sz >= (1024 * 1024 * 1024); - unitIndex += sz >= (1024 * 1024); - unitIndex += sz >= (1024); - return unitIndex; -} - -inline float compute_unit_type_size(uint64_t sz, uint32_t unit_index) 
{ return static_cast(sz) / (powf(1024, unit_index)); } - -class ByteUnits -{ - uint64_t size = 0; - -public: - ByteUnits() = default; - - ByteUnits(const ByteUnits &) = default; - ByteUnits &operator=(const ByteUnits &) = default; - - - ByteUnits(uint64_t v) : size{v} {} - ByteUnits &operator=(uint64_t v) - { - size = v; - return *this; - } - - ByteUnits &operator+=(ByteUnits o) - { - size += o.size; - return *this; - } - ByteUnits &operator-=(ByteUnits o) - { - size -= o.size; - return *this; - } - - friend ByteUnits operator+(ByteUnits l, ByteUnits r) { return {l.size + r.size}; } - friend ByteUnits operator-(ByteUnits l, ByteUnits r) { return {l.size - r.size}; } - - uint64_t value() const { return size; } - float units() const { return compute_unit_type_size(size, size_to_unit_table(size)); } - const char *name() const { return get_unit_name(size_to_unit_table(size)); } -}; - -template -inline T align_value(T value, T alignment) -{ - return (value + alignment - 1) & ~(alignment - 1); -} - -inline Extent3D align_value(const Extent3D &value, const Extent3D &alignment) -{ - return // - {align_value(value.width, alignment.width), align_value(value.height, alignment.height), - align_value(value.depth, alignment.depth)}; -} - -namespace drv3d_dx12 -{ -union HeapID -{ - using ValueType = uint32_t; - static constexpr uint32_t alias_bits = 1; - static constexpr uint32_t group_bits = 4; - static constexpr uint32_t index_bits = (8 * sizeof(ValueType)) - group_bits - alias_bits; - - ValueType raw = 0; - struct - { - ValueType isAlias : alias_bits; - ValueType group : group_bits; - ValueType index : index_bits; - }; -}; - -#if _TARGET_XBOX -struct VirtualFreeCaller -{ - void operator()(void *pointer) { VirtualFree(pointer, 0, MEM_RELEASE); } -}; - -class ResourceMemory -{ - uint8_t *heap = nullptr; - uint32_t sz = 0; - HeapID heapID; - -public: - ResourceMemory() = default; - ~ResourceMemory() = default; - - ResourceMemory(const ResourceMemory &) = default; - ResourceMemory 
&operator=(const ResourceMemory &) = default; - - ResourceMemory(ResourceMemory &&) = default; - ResourceMemory &operator=(ResourceMemory &&) = default; - - ResourceMemory(uint8_t *h, uint32_t s, HeapID heap_id) : heap{h}, sz{s}, heapID{heap_id} {} - - explicit operator bool() const { return heap != nullptr; } - - uint32_t size() const { return sz; } - - uintptr_t getAddress() const { return reinterpret_cast(heap); } - - uint8_t *asPointer() const { return heap; } - - ResourceMemory subRange(uint32_t offset, uint32_t o_size) const - { - G_ASSERT(offset + o_size <= size()); - return {heap + offset, o_size, heapID}; - } - - ResourceMemory aliasSubRange(uint32_t new_index, uint32_t offset, uint32_t o_size) const - { - G_ASSERT(offset + o_size <= size()); - HeapID newHeapID = heapID; - newHeapID.isAlias = 1; - newHeapID.index = new_index; - return {heap + offset, o_size, newHeapID}; - } - - bool isSubRangeOf(const ResourceMemory &mem) const - { - // NOTE: this can not check heapID as aliasing may change the heap id (from a real heap to a aliasing heap). 
- return make_value_range(heap, size()).isSubRangeOf(make_value_range(mem.heap, mem.size())); - } - - bool intersectsWith(const ResourceMemory &mem) const - { - return make_value_range(heap, size()).overlaps(make_value_range(mem.heap, mem.size())); - } - - uint32_t calculateOffset(const ResourceMemory &sub) const { return sub.heap - heap; } - - HeapID getHeapID() const { return heapID; } -}; -#else -class ResourceMemory -{ - ID3D12Heap *heap = nullptr; - ValueRange range; - HeapID heapID; - -public: - ResourceMemory() = default; - ~ResourceMemory() = default; - - ResourceMemory(const ResourceMemory &) = default; - ResourceMemory &operator=(const ResourceMemory &) = default; - - ResourceMemory(ResourceMemory &&) = default; - ResourceMemory &operator=(ResourceMemory &&) = default; - - ResourceMemory(ID3D12Heap *h, ValueRange r, HeapID heap_id) : heap{h}, range{r}, heapID{heap_id} {} - - ID3D12Heap *getHeap() const { return heap; } - - ValueRange getRange() const { return range; } - - explicit operator bool() const { return heap != nullptr; } - - uint32_t size() const { return range.size(); } - - uintptr_t getOffset() const { return range.front(); } - - ResourceMemory subRange(uint32_t offset, uint32_t o_size) const - { - G_ASSERT(offset + o_size <= range.size()); - ResourceMemory r; - r.heap = heap; - r.range = make_value_range(getOffset() + offset, o_size); - r.heapID = heapID; - return r; - } - - ResourceMemory aliasSubRange(uint32_t new_index, uint32_t offset, uint32_t o_size) const - { - G_ASSERT(offset + o_size <= range.size()); - ResourceMemory r; - r.heap = heap; - r.range = make_value_range(getOffset() + offset, o_size); - r.heapID = heapID; - r.heapID.isAlias = 1; - r.heapID.index = new_index; - return r; - } - - bool isSubRangeOf(const ResourceMemory &mem) const - { - // NOTE: this can not check heapID as aliasing may change the heap id (from a real heap to a aliasing heap). 
- if (mem.heap != heap) - { - return false; - } - return range.isSubRangeOf(mem.range); - } - - bool intersectsWith(const ResourceMemory &mem) const { return (heap == mem.heap) && range.overlaps(mem.range); } - - uint32_t calculateOffset(const ResourceMemory &sub) const { return sub.range.start - range.start; } - - HeapID getHeapID() const { return heapID; } -}; -#endif - -#if DX12_ENABLE_MT_VALIDATION -class DriverMutex -{ - WinCritSec mutex; - bool enabled = false; - uint32_t recursionCount = 0; - -public: - void lock() - { -#if DX12_ENABLE_PEDANTIC_MT_VALIDATION - // NOTE: During startup this will fire at least once as the work cycle just takes the lock to check - // the d3d context device state. - G_ASSERTF(true == enabled, "DX12: Trying to lock the driver context without enabling " - "multithreading first"); -#endif - mutex.lock(); - ++recursionCount; - } - - void unlock() - { -#if DX12_ENABLE_PEDANTIC_MT_VALIDATION - G_ASSERTF(true == enabled, "DX12: Trying to unlock the driver context without enabling " - "multithreading first"); -#endif - --recursionCount; - mutex.unlock(); - } - - bool validateOwnership() - { - if (mutex.tryLock()) - { -#if DX12_ENABLE_PEDANTIC_MT_VALIDATION - bool owns = false; - if (enabled) - { - // If we end up here the two thing can either be true: - // 1) this thread has taken the lock before this, then recustionCount is greater than 0 - // 2) no thread has taken the lock before this, then recursionCount is equal to 0 - owns = recursionCount > 0; - } - else - { - // if MT is not enabled, only the main thread can access it without taking a lock. - owns = owns || (0 == recursionCount) && is_main_thread(); - } -#else - // DX11 behavior replicating expression, it has a flaw, which allows a race between the main - // thread and a different thread as long as MT is disabled. 
- bool owns = (!enabled && (0 == recursionCount) && is_main_thread()) || (recursionCount > 0); -#endif - mutex.unlock(); - return owns; - } - // Failed to take the mutex, some other thread has it, so we don't own it. - return false; - } - - void enableMT() - { -#if DX12_ENABLE_PEDANTIC_MT_VALIDATION - // NOTE: This locking will show ordering issues during startup, esp with splash screen. - mutex.lock(); - G_ASSERT(false == enabled); -#endif - enabled = true; -#if DX12_ENABLE_PEDANTIC_MT_VALIDATION - mutex.unlock(); -#endif - } - - void disableMT() - { -#if DX12_ENABLE_PEDANTIC_MT_VALIDATION - mutex.lock(); - G_ASSERT(true == enabled); -#endif - enabled = false; -#if DX12_ENABLE_PEDANTIC_MT_VALIDATION - mutex.unlock(); -#endif - } -}; -#else -class DriverMutex -{ - WinCritSec mutex; - -public: - void lock() { mutex.lock(); } - - void unlock() { mutex.unlock(); } - - bool validateOwnership() { return true; } - - void enableMT() {} - - void disableMT() {} -}; -#endif - -struct HostDeviceSharedMemoryRegion -{ - enum class Source - { - TEMPORARY, // TODO: may split into temporary upload and readback (bidirectional makes no sense) - PERSISTENT_UPLOAD, - PERSISTENT_READ_BACK, - PERSISTENT_BIDIRECTIONAL, - PUSH_RING, // illegal to free - }; - // buffer object that supplies the memory - ID3D12Resource *buffer = nullptr; -#if _TARGET_XBOX - // on xbox gpu and cpu pointer are the same - union - { - D3D12_GPU_VIRTUAL_ADDRESS gpuPointer = 0; - uint8_t *pointer; - }; -#else - // offset into gpu virtual memory, including offset! 
- D3D12_GPU_VIRTUAL_ADDRESS gpuPointer = 0; - // pointer into cpu visible memory, offset is already applied (pointer - offset yields address - // base of the buffer) - uint8_t *pointer = nullptr; -#endif - // offset range of this allocation - // gpuPointer and pointer already have the start of the range added to it - ValueRange range; - Source source = Source::TEMPORARY; - - explicit constexpr operator bool() const { return nullptr != buffer; } - constexpr bool isTemporary() const { return Source::TEMPORARY == source; } - void flushRegion(ValueRange sub_range) const - { - D3D12_RANGE r = {}; - uint8_t *ptr = nullptr; - buffer->Map(0, &r, reinterpret_cast(&ptr)); - G_ASSERT(ptr + range.front() == pointer); - r = asDx12Range(sub_range.shiftBy(range.front())); - buffer->Unmap(0, &r); - } - void invalidateRegion(ValueRange sub_range) const - { - D3D12_RANGE r = asDx12Range(sub_range.shiftBy(range.front())); - uint8_t *ptr = nullptr; - buffer->Map(0, &r, reinterpret_cast(&ptr)); - G_ASSERT(pointer + sub_range.front() == ptr + range.front()); - r.Begin = r.End = 0; - buffer->Unmap(0, &r); - } - void flush() const - { - D3D12_RANGE r = {}; - uint8_t *ptr = nullptr; - buffer->Map(0, &r, reinterpret_cast(&ptr)); - G_ASSERT(ptr + range.front() == pointer); - r = asDx12Range(range); - buffer->Unmap(0, &r); - } - void invalidate() const - { - D3D12_RANGE r = asDx12Range(range); - uint8_t *ptr = nullptr; - buffer->Map(0, &r, reinterpret_cast(&ptr)); - G_ASSERT(ptr + range.front() == pointer); - r.Begin = r.End = 0; - buffer->Unmap(0, &r); - } - template - T *as() const - { - return reinterpret_cast(pointer); - } -}; -} // namespace drv3d_dx12 - -template (T::COUNT)> -class TypedBitSet : private eastl::bitset -{ - using BaseType = eastl::bitset; - -public: - using BaseType::all; - using BaseType::any; - using BaseType::count; - using BaseType::flip; - using BaseType::none; - // using BaseType::to_string; - using BaseType::to_ulong; - // using BaseType::to_ullong; - using 
BaseType::size; - using typename BaseType::reference; - - TypedBitSet() = default; - TypedBitSet(const TypedBitSet &) = default; - ~TypedBitSet() = default; - - TypedBitSet &operator=(const TypedBitSet &) = default; - - bool operator[](T index) const { return (*this)[static_cast(index)]; } - typename BaseType::reference operator[](T index) { return (*this)[static_cast(index)]; } - bool test(T index) const { return BaseType::test(static_cast(index)); } - - TypedBitSet &set() - { - BaseType::set(); - return *this; - } - - TypedBitSet &set(T index, bool value = true) - { - BaseType::set(static_cast(index), value); - return *this; - } - - TypedBitSet &reset() - { - BaseType::reset(); - return *this; - } - - TypedBitSet &reset(T index) - { - BaseType::reset(static_cast(index)); - return *this; - } - - TypedBitSet operator~() const - { - auto cpy = *this; - cpy.flip(); - return cpy; - } - - bool operator==(const TypedBitSet &other) const { return BaseType::operator==(other); } - - bool operator!=(const TypedBitSet &other) const { return BaseType::operator!=(other); } - - // extended stuff - template - TypedBitSet &set(T0 v0, T1 v1, Ts... vs) - { - set(v0); - set(v1, vs...); - return *this; - } - - template - TypedBitSet &reset(T0 v0, T1 v1, Ts... vs) - { - reset(v0); - reset(v1, vs...); - return *this; - } - - template - TypedBitSet(T0 v0, Ts... 
vs) - { - set(v0, vs...); - } -}; - -#if _TARGET_PC_WIN -inline DXGI_QUERY_VIDEO_MEMORY_INFO max(const DXGI_QUERY_VIDEO_MEMORY_INFO &l, const DXGI_QUERY_VIDEO_MEMORY_INFO &r) -{ - DXGI_QUERY_VIDEO_MEMORY_INFO result; - result.Budget = max(l.Budget, r.Budget); - result.CurrentUsage = max(l.CurrentUsage, r.CurrentUsage); - result.AvailableForReservation = max(l.AvailableForReservation, r.AvailableForReservation); - result.CurrentReservation = max(l.CurrentReservation, r.CurrentReservation); - return result; -} -#endif - -template -inline eastl::span string_literal_span(const char (&sl)[N]) -{ - return {sl, N - 1}; -} - -template -inline eastl::span string_literal_span(const wchar_t (&sl)[N]) -{ - return {sl, N - 1}; -} - -inline bool is_valid_allocation_info(const D3D12_RESOURCE_ALLOCATION_INFO &info) -{ - // On error DX12 returns ~0 in the SizeInBytes member. - return 0 != ~info.SizeInBytes; -} - -inline uint64_t get_next_resource_alignment(uint64_t alignment, uint32_t samples) -{ - if (D3D12_SMALL_RESOURCE_PLACEMENT_ALIGNMENT == alignment) - { - return D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT; - } - if (samples > 1 && (D3D12_SMALL_MSAA_RESOURCE_PLACEMENT_ALIGNMENT == alignment)) - { - return D3D12_DEFAULT_MSAA_RESOURCE_PLACEMENT_ALIGNMENT; - } - - return alignment; -} - -// NOTE: may adjust desc.Alignment if the requested alignment could not be used -inline D3D12_RESOURCE_ALLOCATION_INFO get_resource_allocation_info(ID3D12Device *device, D3D12_RESOURCE_DESC &desc) -{ - G_ASSERTF(desc.Alignment != 0, "DX12: desc.Alignment should not be 0!"); - auto result = device->GetResourceAllocationInfo(0, 1, &desc); - if (!is_valid_allocation_info(result)) - { - auto nextAlignment = get_next_resource_alignment(desc.Alignment, desc.SampleDesc.Count); - if (nextAlignment != desc.Alignment) - { - desc.Alignment = nextAlignment; - result = device->GetResourceAllocationInfo(0, 1, &desc); - } - } - return result; -} - -inline void report_resource_alloc_info_error(const 
D3D12_RESOURCE_DESC &desc) -{ - logerr("DX12: Error while querying resource allocation info, resource desc: %s, %u, %u x %u x " - "%u, %u, %s, %u by %u, %u, %08X", - to_string(desc.Dimension), desc.Alignment, desc.Width, desc.Height, desc.DepthOrArraySize, desc.MipLevels, - drv3d_dx12::dxgi_format_name(desc.Format), desc.SampleDesc.Count, desc.SampleDesc.Quality, desc.Layout, desc.Flags); -} - -inline uint64_t calculate_texture_alignment(uint64_t width, uint32_t height, uint32_t depth, uint32_t samples, - D3D12_TEXTURE_LAYOUT layout, D3D12_RESOURCE_FLAGS flags, drv3d_dx12::FormatStore format) -{ - if (D3D12_TEXTURE_LAYOUT_UNKNOWN != layout) - { - if (samples > 1) - { - return D3D12_DEFAULT_MSAA_RESOURCE_PLACEMENT_ALIGNMENT; - } - else - { - return D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT; - } - } - - if ((1 == samples) && ((D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET | D3D12_RESOURCE_FLAG_ALLOW_DEPTH_STENCIL) & flags)) - { - return D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT; - } - - uint32_t blockSizeX = 1, blockSizeY = 1; - auto bytesPerBlock = format.getBytesPerPixelBlock(&blockSizeX, &blockSizeY); - const uint32_t textureWidthInBlocks = (width + blockSizeX - 1) / blockSizeX; - const uint32_t textureHeightInBlocks = (height + blockSizeY - 1) / blockSizeY; - - const uint32_t TILE_MEM_SIZE = 4 * 1024; - const uint32_t blocksInTile = TILE_MEM_SIZE / bytesPerBlock; - // MSDN documentation says about "near-equilateral" size for the tile - const uint32_t blocksInTileX = nextPowerOfTwo(sqrt(blocksInTile)); - const uint32_t blocksInTileY = nextPowerOfTwo(blocksInTile / blocksInTileX); - const uint32_t MAX_TILES_COUNT_FOR_SMALL_RES = 16; - const uint32_t tilesCount = ((textureWidthInBlocks + blocksInTileX - 1) / blocksInTileX) * - ((textureHeightInBlocks + blocksInTileY - 1) / blocksInTileY) * depth; - // This check is neccessary according to debug layer and dx12 documentation: - // 
https://docs.microsoft.com/en-us/windows/win32/api/d3d12/ns-d3d12-d3d12_resource_desc#alignment - const bool smallAmountOfTiles = tilesCount <= MAX_TILES_COUNT_FOR_SMALL_RES; - - if (samples > 1) - { - if (smallAmountOfTiles) - { - return D3D12_SMALL_MSAA_RESOURCE_PLACEMENT_ALIGNMENT; - } - else - { - return D3D12_DEFAULT_MSAA_RESOURCE_PLACEMENT_ALIGNMENT; - } - } - else - { - if (smallAmountOfTiles) - { - return D3D12_SMALL_RESOURCE_PLACEMENT_ALIGNMENT; - } - else - { - return D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT; - } - } -} - -// Applies function object 'f' to each bit index of each set bit in bit mask 'm'. -template -inline void for_each_set_bit(uint32_t m, F f) -{ - while (0 != m) - { - uint32_t i = __bsf_unsafe(m); - m ^= 1u << i; - f(i); - } -} - -// Very simple wrapper to make a non thread safe container thread safe with the help of a paired mutex. -// Access is grated with a AccessToken type, which grants access to the containers interface. -template -class ContainerMutexWrapper -{ - MTX mtx; - T container; - - void lock() { mtx.lock(); } - - void unlock() { mtx.unlock(); } - - T &data() { return container; } - -public: - ContainerMutexWrapper() = default; - ~ContainerMutexWrapper() = default; - - ContainerMutexWrapper(const ContainerMutexWrapper &) = delete; - ContainerMutexWrapper &operator=(const ContainerMutexWrapper &) = delete; - - ContainerMutexWrapper(ContainerMutexWrapper &&) = delete; - ContainerMutexWrapper &operator=(ContainerMutexWrapper &&) = delete; - - class AccessToken - { - ContainerMutexWrapper *parent = nullptr; - - public: - AccessToken() = default; - ~AccessToken() - { - if (parent) - { - parent->unlock(); - } - } - - AccessToken(ContainerMutexWrapper &p) : parent{&p} { parent->lock(); } - - AccessToken(const AccessToken &) = delete; - AccessToken &operator=(const AccessToken &) = delete; - - AccessToken(AccessToken &&other) : parent{other.parent} { other.parent = nullptr; } - AccessToken &operator=(AccessToken &&other) - { - 
eastl::swap(parent, other.parent); - return *this; - } - - T &operator*() { return parent->data(); } - T *operator->() { return &parent->data(); } - }; - - AccessToken access() { return {*this}; } -}; - -// Input is 4x8 bits color channel mask and output will be a 8bit mask of render targets -inline uint32_t color_channel_mask_to_render_target_mask(uint32_t mask) -{ - // For each color chanel generate the used bit - const uint32_t channel0 = mask >> 0; - const uint32_t channel1 = mask >> 1; - const uint32_t channel2 = mask >> 2; - const uint32_t channel3 = mask >> 3; - // At this point the lower bit of each 4 bit block is now indicating if a target is used or not - const uint32_t channelsToSpacedTargetMask = channel0 | channel1 | channel2 | channel3; - // This erases the top 3 bits of each 4 bit block to compress from 4x8 bits to 8 bits. - const uint32_t t0 = (channelsToSpacedTargetMask >> 0) & 0x00000001; - const uint32_t t1 = (channelsToSpacedTargetMask >> 3) & 0x00000002; - const uint32_t t2 = (channelsToSpacedTargetMask >> 6) & 0x00000004; - const uint32_t t3 = (channelsToSpacedTargetMask >> 9) & 0x00000008; - const uint32_t t4 = (channelsToSpacedTargetMask >> 12) & 0x00000010; - const uint32_t t5 = (channelsToSpacedTargetMask >> 15) & 0x00000020; - const uint32_t t6 = (channelsToSpacedTargetMask >> 18) & 0x00000040; - const uint32_t t7 = (channelsToSpacedTargetMask >> 21) & 0x00000080; - const uint32_t combinedTargetMask = t0 | t1 | t2 | t3 | t4 | t5 | t6 | t7; - return combinedTargetMask; -} - -// Inputs a 8 bit mask of render targets and outputs a 4x8 channel mask, where if a target bit is -// set all corresponding channel bits will be set -inline uint32_t render_target_mask_to_color_channel_mask(uint32_t mask) -{ - // Spread out the individual target bits into the lowest bit of each corresponding 4 bit block, - // which is the indicator bit for the first channel (r) - const uint32_t t0 = (mask & 0x00000001) << 0; - const uint32_t t1 = (mask & 0x00000002) << 
3; - const uint32_t t2 = (mask & 0x00000004) << 6; - const uint32_t t3 = (mask & 0x00000008) << 9; - const uint32_t t4 = (mask & 0x00000010) << 12; - const uint32_t t5 = (mask & 0x00000020) << 15; - const uint32_t t6 = (mask & 0x00000040) << 18; - const uint32_t t7 = (mask & 0x00000080) << 21; - const uint32_t r = t0 | t1 | t2 | t3 | t4 | t5 | t6 | t7; - // Replicate indicator bits from first channel (r) to all others (g, b and a) - const uint32_t g = r << 1; - const uint32_t b = r << 2; - const uint32_t a = r << 3; - return r | g | b | a; -} - -// Takes a 4x8 bit render target output channel mask and turns it into a 4x8 render target ouput mask -// where if any channel of a target is enabled all channels of the result are enabled. -// Simply speaking it turns all non 0 hex digits in the mask into F and all 0 are keept as 0. -inline uint32_t spread_color_chanel_mask_to_render_target_color_channel_mask(uint32_t mask) -{ - const uint32_t r = mask & 0x11111111; - const uint32_t g = mask & 0x22222222; - const uint32_t b = mask & 0x44444444; - const uint32_t a = mask & 0x88888888; - const uint32_t r1 = r | (r << 1) | (r << 2) | (r << 3); - const uint32_t g1 = g | (g << 1) | (g << 2) | (g >> 1); - const uint32_t b1 = b | (b << 1) | (b >> 1) | (b >> 2); - const uint32_t a1 = a | (a >> 1) | (a >> 2) | (a >> 3); - return r1 | g1 | b1 | a1; -} - -// used for de and encoding into blk's -#define DX12_D3D_CAP_SET \ - DX12_D3D_CAP(hasAnisotropicFilter); \ - DX12_D3D_CAP(hasDepthReadOnly); \ - DX12_D3D_CAP(hasStructuredBuffers); \ - DX12_D3D_CAP(hasNoOverwriteOnShaderResourceBuffers); \ - DX12_D3D_CAP(hasForcedSamplerCount); \ - DX12_D3D_CAP(hasVolMipMap); \ - DX12_D3D_CAP(hasAsyncCompute); \ - DX12_D3D_CAP(hasOcclusionQuery); \ - DX12_D3D_CAP(hasConstBufferOffset); \ - DX12_D3D_CAP(hasDepthBoundsTest); \ - DX12_D3D_CAP(hasConditionalRender); \ - DX12_D3D_CAP(hasResourceCopyConversion); \ - DX12_D3D_CAP(hasAsyncCopy); \ - DX12_D3D_CAP(hasReadMultisampledDepth); \ - 
DX12_D3D_CAP(hasInstanceID); \ - DX12_D3D_CAP(hasConservativeRassterization); \ - DX12_D3D_CAP(hasQuadTessellation); \ - DX12_D3D_CAP(hasGather4); \ - DX12_D3D_CAP(hasAlphaCoverage); \ - DX12_D3D_CAP(hasWellSupportedIndirect); \ - DX12_D3D_CAP(hasRaytracing); \ - DX12_D3D_CAP(hasRaytracingT11); \ - DX12_D3D_CAP(hasBindless); \ - DX12_D3D_CAP(hasNVApi); \ - DX12_D3D_CAP(hasATIApi); \ - DX12_D3D_CAP(hasVariableRateShading); \ - DX12_D3D_CAP(hasVariableRateShadingTexture); \ - DX12_D3D_CAP(hasVariableRateShadingShaderOutput); \ - DX12_D3D_CAP(hasVariableRateShadingCombiners); \ - DX12_D3D_CAP(hasVariableRateShadingBy4); \ - DX12_D3D_CAP(hasAliasedTextures); \ - DX12_D3D_CAP(hasResourceHeaps); \ - DX12_D3D_CAP(hasBufferOverlapCopy); \ - DX12_D3D_CAP(hasBufferOverlapRegionsCopy); \ - DX12_D3D_CAP(hasUAVOnlyForcedSampleCount); \ - DX12_D3D_CAP(hasShader64BitIntegerResources); \ - DX12_D3D_CAP(hasNativeRenderPassSubPasses); \ - DX12_D3D_CAP(hasTiled2DResources); \ - DX12_D3D_CAP(hasTiled3DResources); \ - DX12_D3D_CAP(hasTiledSafeResourcesAccess); \ - DX12_D3D_CAP(hasTiledMemoryAliasing); \ - DX12_D3D_CAP(hasDLSS); \ - DX12_D3D_CAP(hasXESS); \ - DX12_D3D_CAP(hasDrawID); \ - DX12_D3D_CAP(hasMeshShader); \ - DX12_D3D_CAP(hasBasicViewInstancing); \ - DX12_D3D_CAP(hasOptimizedViewInstancing); \ - DX12_D3D_CAP(hasAcceleratedViewInstancing); \ - DX12_D3D_CAP(hasRenderPassDepthResolve); \ - DX12_D3D_CAP(hasStereoExpansion); \ - DX12_D3D_CAP(hasTileBasedArchitecture); \ - DX12_D3D_CAP(hasLazyMemory); \ - DX12_D3D_CAP(hasIndirectSupport); \ - DX12_D3D_CAP(hasCompareSampler); - -template -class DynamicArray -{ - eastl::unique_ptr ptr; - size_t count = 0; - -public: - DynamicArray() = default; - DynamicArray(const DynamicArray &) = delete; - DynamicArray(DynamicArray &&) = default; - DynamicArray(T *p, size_t sz) : ptr{p}, count{sz} {} - DynamicArray(eastl::unique_ptr &&p, size_t sz) : ptr{eastl::forward>(p)}, count{sz} {} - explicit DynamicArray(size_t sz) : 
DynamicArray{eastl::make_unique(sz), sz} {} - DynamicArray &operator=(const DynamicArray &) = delete; - DynamicArray &operator=(DynamicArray &&other) - { - eastl::swap(ptr, other.ptr); - eastl::swap(count, other.count); - return *this; - } - - void adopt(T *new_ptr, size_t new_sz) - { - eastl::unique_ptr newPtr{new_ptr}; - eastl::swap(ptr, newPtr); - count = new_sz; - } - - T *release() - { - count = 0; - return ptr.release(); - } - - bool resize(size_t new_size) - { - if (count == new_size) - { - return false; - } - - if (0 == new_size) - { - ptr.reset(); - count = 0; - return true; - } - - auto newBlock = eastl::make_unique(new_size); - if (!newBlock) - { - return false; - } - - for (uint32_t i = 0; i < count; ++i) - { - newBlock[i] = eastl::move(ptr[i]); - } - eastl::swap(ptr, newBlock); - count = new_size; - return true; - } - - T &operator[](size_t i) { return ptr[i]; } - const T &operator[](size_t i) const { return ptr[i]; } - size_t size() const { return count; } - bool empty() const { return !ptr || 0 == count; } - T *data() { return ptr.get(); } - const T *data() const { return ptr.get(); } - eastl::span asSpan() { return {data(), size()}; } - eastl::span asSpan() const { return {data(), size()}; } - eastl::span releaseAsSpan() - { - auto retValue = asSpan(); - release(); - return retValue; - } - static DynamicArray fromSpan(eastl::span span) { return {span.data(), span.size()}; } - T *begin() { return ptr.get(); } - const T *begin() const { return ptr.get(); } - const T *cbegin() const { return begin(); } - T *end() { return begin() + size(); } - const T *end() const { return begin() + size(); } - const T *cend() const { return end(); } -}; - -template -class DerivedSpan -{ - using BytePointerType = typename eastl::conditional::value, const uint8_t *, uint8_t *>::type; - BytePointerType uBase = nullptr; - size_t uSize = 0; - size_t uCount = 0; - BytePointerType atIndex(size_t i) const { return &uBase[i * uSize]; } - -public: - DerivedSpan() = default; - 
DerivedSpan(const DerivedSpan &) = default; - template - DerivedSpan(U *u_base, size_t u_count) : uBase{reinterpret_cast(u_base)}, uSize{sizeof(U)}, uCount{u_count} - { - static_assert(eastl::is_base_of::value, "U is invalid type"); - } - template - DerivedSpan(const eastl::vector &u_base) : DerivedSpan{u_base.data(), u_base.size()} - {} - class Iterator - { - BytePointerType uBase = nullptr; - size_t uSize = 0; - - public: - Iterator() = default; - Iterator(const Iterator &) = default; - Iterator(BytePointerType u_base, size_t u_size) : uBase{u_base}, uSize{u_size} {} - - friend bool operator==(const Iterator &l, const Iterator &r) { return l.uBase == r.uBase; } - friend bool operator!=(const Iterator &l, const Iterator &r) { return !(l == r); } - - Iterator &operator++() - { - uBase += uSize; - return *this; - } - Iterator operator++(int) const - { - auto other = *this; - ++other; - return other; - } - - Iterator &operator--() - { - uBase -= uSize; - return *this; - } - Iterator operator--(int) const - { - auto other = *this; - --other; - return other; - } - T &operator*() const { return *reinterpret_cast(uBase); } - }; - - Iterator begin() const { return {uBase, uSize}; } - Iterator cbegin() const { return begin(); } - Iterator end() const { return {atIndex(uCount), uSize}; } - Iterator cend() const { return end(); } - size_t size() const { return uCount; } - T *data() const { return reinterpret_cast(uBase); } -}; - -struct DeviceCapsAndShaderModel -{ - // deliberately using base to allow to load all cap values - DeviceDriverCapabilitiesBase caps; - d3d::shadermodel::Version shaderModel; - - bool isCompatibleTo(d3d::shadermodel::Version shader_model) const { return shader_model <= shaderModel; } - bool isCompatibleTo(const DeviceDriverCapabilities &other) const - { - // This is a very simple approach, when a feature of other is requested but not indicated by caps, we are not compatible. 
-#define DX12_D3D_CAP(name) \ - if (other.name && !caps.name) \ - return false; - DX12_D3D_CAP_SET -#undef DX12_D3D_CAP - return true; - } - bool isCompatibleTo(const Driver3dDesc &desc) const { return isCompatibleTo(desc.shaderModel) && isCompatibleTo(desc.caps); } - static DeviceCapsAndShaderModel fromDriverDesc(const Driver3dDesc &desc) - { - DeviceCapsAndShaderModel result; - result.shaderModel = desc.shaderModel; - // need to do a copy this way to properly copy constants into variables. -#define DX12_D3D_CAP(name) result.caps.name = desc.caps.name; - DX12_D3D_CAP_SET -#undef DX12_D3D_CAP - return result; - } -}; diff --git a/prog/engine/drv/drv3d_DX12/driver_mutex.h b/prog/engine/drv/drv3d_DX12/driver_mutex.h new file mode 100644 index 000000000..08fcd78d5 --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/driver_mutex.h @@ -0,0 +1,113 @@ +#pragma once + +#include +#include + + +namespace drv3d_dx12 +{ + +#if DX12_ENABLE_MT_VALIDATION +class DriverMutex +{ + WinCritSec mutex; + bool enabled = false; + uint32_t recursionCount = 0; + +public: + void lock() + { +#if DX12_ENABLE_PEDANTIC_MT_VALIDATION + // NOTE: During startup this will fire at least once as the work cycle just takes the lock to check + // the d3d context device state. 
+ G_ASSERTF(true == enabled, "DX12: Trying to lock the driver context without enabling " + "multithreading first"); +#endif + mutex.lock(); + ++recursionCount; + } + + void unlock() + { +#if DX12_ENABLE_PEDANTIC_MT_VALIDATION + G_ASSERTF(true == enabled, "DX12: Trying to unlock the driver context without enabling " + "multithreading first"); +#endif + --recursionCount; + mutex.unlock(); + } + + bool validateOwnership() + { + if (mutex.tryLock()) + { +#if DX12_ENABLE_PEDANTIC_MT_VALIDATION + bool owns = false; + if (enabled) + { + // If we end up here the two thing can either be true: + // 1) this thread has taken the lock before this, then recustionCount is greater than 0 + // 2) no thread has taken the lock before this, then recursionCount is equal to 0 + owns = recursionCount > 0; + } + else + { + // if MT is not enabled, only the main thread can access it without taking a lock. + owns = owns || (0 == recursionCount) && is_main_thread(); + } +#else + // DX11 behavior replicating expression, it has a flaw, which allows a race between the main + // thread and a different thread as long as MT is disabled. + bool owns = (!enabled && (0 == recursionCount) && is_main_thread()) || (recursionCount > 0); +#endif + mutex.unlock(); + return owns; + } + // Failed to take the mutex, some other thread has it, so we don't own it. + return false; + } + + void enableMT() + { +#if DX12_ENABLE_PEDANTIC_MT_VALIDATION + // NOTE: This locking will show ordering issues during startup, esp with splash screen. 
+ mutex.lock(); + G_ASSERT(false == enabled); +#endif + enabled = true; +#if DX12_ENABLE_PEDANTIC_MT_VALIDATION + mutex.unlock(); +#endif + } + + void disableMT() + { +#if DX12_ENABLE_PEDANTIC_MT_VALIDATION + mutex.lock(); + G_ASSERT(true == enabled); +#endif + enabled = false; +#if DX12_ENABLE_PEDANTIC_MT_VALIDATION + mutex.unlock(); +#endif + } +}; +#else +class DriverMutex +{ + WinCritSec mutex; + +public: + void lock() { mutex.lock(); } + + void unlock() { mutex.unlock(); } + + bool validateOwnership() { return true; } + + void enableMT() {} + + void disableMT() {} +}; +#endif + +} // namespace drv3d_dx12 diff --git a/prog/engine/drv/drv3d_DX12/driver_win.h b/prog/engine/drv/drv3d_DX12/driver_win.h new file mode 100644 index 000000000..0c0595e4b --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/driver_win.h @@ -0,0 +1,49 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + + +typedef IDXGISwapChain3 DXGISwapChain; +typedef IDXGIFactory4 DXGIFactory; +typedef IDXGIAdapter4 DXGIAdapter; +typedef ID3D12Device3 D3DDevice; +typedef ID3D12GraphicsCommandList2 D3DGraphicsCommandList; +using D3DCopyCommandList = ID3D12GraphicsCommandList; + +// on PC we only lock down the execution mode on release builds +#define FIXED_EXECUTION_MODE DAGOR_DBGLEVEL == 0 +#define DX12_ALLOW_SPLIT_BARRIERS 1 +#define DX12_WHATCH_IN_FLIGHT_BARRIERS DAGOR_DBGLEVEL > 0 +#define DX12_VALIDATE_INPUT_LAYOUT_USES DAGOR_DBGLEVEL > 0 +#define DX12_INDIVIDUAL_BARRIER_CHECK 0 +#define DX12_REPORT_TRANSITION_INFO 0 +#define DX12_TRACK_ACTIVE_DRAW_EVENTS DAGOR_DBGLEVEL > 0 +#define DX12_VALIDATE_USER_BARRIERS DAGOR_DBGLEVEL > 0 +#define DX12_AUTOMATIC_BARRIERS 1 +#define DX12_PROCESS_USER_BARRIERS 1 +#define DX12_RECORD_TIMING_DATA 1 +#define DX12_CAPTURE_AFTER_LONG_FRAMES (DX12_RECORD_TIMING_DATA && (DAGOR_DBGLEVEL > 0)) +#define DX12_REPORT_PIPELINE_CREATE_TIMING 1 +// TODO no real gamma control on dx12... 
+#define DX12_HAS_GAMMA_CONTROL 1 + +// Possible to run with set to 0, but there is no benefit +#define DX12_USE_AUTO_PROMOTE_AND_DECAY 1 + +#define DX12_ENABLE_CONST_BUFFER_DESCRIPTORS 1 + +#define DX12_SELECTABLE_CALL_STACK_CAPTURE 1 + +#define DX12_VALIDATA_COPY_COMMAND_LIST 1 +#define DX12_VALIDATE_COMPUTE_COMMAND_LIST 1 +#define DX12_VALIDATE_RAYTRACE_COMMAND_LIST 1 +#define DX12_VALIDATE_GRAPHICS_COMMAND_LIST 1 + +#define DX12_PROCESS_USER_BARRIERS_DEFAULT 0 + +#define DX12_USE_ESRAM 0 diff --git a/prog/engine/drv/drv3d_DX12/dx12.cpp b/prog/engine/drv/drv3d_DX12/dx12.cpp index 91b214de6..7d890fdc3 100644 --- a/prog/engine/drv/drv3d_DX12/dx12.cpp +++ b/prog/engine/drv/drv3d_DX12/dx12.cpp @@ -3,6 +3,7 @@ #include "../drv3d_commonCode/stereoHelper.h" #include "../drv3d_commonCode/dxgi_utils.h" #include "../drv3d_commonCode/gpuConfig.h" +#include "../drv3d_commonCode/validate_sbuf_flags.h" #include <../hid_mouse/api_wrappers.h> @@ -37,6 +38,8 @@ #include #endif +#include "driver_mutex.h" + #if _TARGET_PC_WIN extern "C" { @@ -1238,6 +1241,19 @@ int on_driver_command_compile_pipeline_set(void *par1) sets->outputFormatSet, sets->graphicsPipelineSet, sets->meshPipelineSet, sets->computePipelineSet, defaultFormat); return 1; } + +int on_get_buffer_gpu_address(void *buffer, void *address) +{ + if (!buffer || !address) + { + return 0; + } + + auto bufferRef = get_any_buffer_ref(static_cast(buffer)); + *static_cast(address) = bufferRef.gpuPointer; + + return 1; +} } // namespace int d3d::driver_command(int command, void *par1, void *par2, void *par3) @@ -1245,6 +1261,7 @@ int d3d::driver_command(int command, void *par1, void *par2, void *par3) STORE_RETURN_ADDRESS(); switch (command) { + case DRV3D_COMMAND_GET_BUFFER_GPU_ADDRESS: return on_get_buffer_gpu_address(par1, par2); case DRV3D_COMMAND_COMPILE_PIPELINE_SET: return on_driver_command_compile_pipeline_set(par1); case DRV3D_COMMAND_REMOVE_DEBUG_BREAK_STRING_SEARCH: 
api_state.device.getContext().removeDebugBreakString({static_cast(par1)}); @@ -1979,8 +1996,15 @@ bool check_format_features(int cflg, D3D12_FEATURE_DATA_FORMAT_SUPPORT support, if (fmt.isDepth() && (0 == (mask & D3D12_FORMAT_SUPPORT1_DEPTH_STENCIL))) return false; - // no msaa right now - if (cflg & (TEXCF_MULTISAMPLED | TEXCF_MSAATARGET)) + bool isMultisampled = ((cflg & TEXCF_SAMPLECOUNT_MASK) != 0); + + if (isMultisampled && (0 == (mask & D3D12_FORMAT_SUPPORT1_MULTISAMPLE_RENDERTARGET))) + return false; + + if (isMultisampled && (0 == (mask & D3D12_FORMAT_SUPPORT1_MULTISAMPLE_RESOLVE))) + return false; + + if (isMultisampled && (0 == (mask & D3D12_FORMAT_SUPPORT1_MULTISAMPLE_LOAD))) return false; if ((cflg & TEXCF_TILED_RESOURCE) != 0 && (support.Support2 & D3D12_FORMAT_SUPPORT2_TILED) == 0) @@ -1992,10 +2016,22 @@ bool check_format_features(int cflg, D3D12_FEATURE_DATA_FORMAT_SUPPORT support, bool d3d::check_texformat(int cflg) { auto fmt = FormatStore::fromCreateFlags(cflg); + if (!api_state.device.isSamplesCountSupported(fmt.asDxGiFormat(), get_sample_count(cflg))) + return false; auto features = api_state.device.getFormatFeatures(fmt); return check_format_features(cflg, features, fmt, RES3D_TEX); } +int d3d::get_max_sample_count(int cflg) +{ + auto dxgiFormat = FormatStore::fromCreateFlags(cflg).asDxGiFormat(); + for (int32_t numSamples = get_sample_count(TEXCF_SAMPLECOUNT_MAX); numSamples > 1; numSamples /= 2) + if (api_state.device.isSamplesCountSupported(dxgiFormat, numSamples)) + return numSamples; + + return 1; +} + bool d3d::issame_texformat(int cflg1, int cflg2) { auto formatA = FormatStore::fromCreateFlags(cflg1); @@ -2189,10 +2225,10 @@ PROGRAM d3d::create_program(const uint32_t *vs, const uint32_t *ps, VDECL vdecl, return create_program(vprog, fshad, vdecl, strides, streams); } -PROGRAM d3d::create_program_cs(const uint32_t *cs_native) +PROGRAM d3d::create_program_cs(const uint32_t *cs_native, CSPreloaded preloaded) { STORE_RETURN_ADDRESS(); - 
return api_state.shaderProgramDatabase.newComputeProgram(api_state.device.getContext(), cs_native).exportValue(); + return api_state.shaderProgramDatabase.newComputeProgram(api_state.device.getContext(), cs_native, preloaded).exportValue(); } bool d3d::set_program(PROGRAM prog_id) @@ -2429,7 +2465,7 @@ bool d3d::set_render_target() CHECK_MAIN_THREAD(); ScopedCommitLock ctxLock{api_state.device.getContext()}; api_state.state.resetColorTargetsToBackBuffer(); - api_state.state.resetDepthStencilToBackBuffer(api_state.device.getContext()); + api_state.state.removeDepthStencilTarget(api_state.device.getContext()); api_state.state.setUpdateViewportFromRenderTarget(); return true; } @@ -3258,16 +3294,19 @@ void d3d::get_video_modes_list(Tab &list) { api_state.device.enumerateDi Vbuffer *d3d::create_vb(int size, int flg, const char *name) { + validate_sbuffer_flags(flg | SBCF_BIND_VERTEX, name); return api_state.device.newBufferObject(0, size, flg | SBCF_BIND_VERTEX, 0, name); } Ibuffer *d3d::create_ib(int size, int flg, const char *stat_name) { + validate_sbuffer_flags(flg | SBCF_BIND_INDEX, stat_name); return api_state.device.newBufferObject(0, size, flg | SBCF_BIND_INDEX, 0, stat_name); } Vbuffer *d3d::create_sbuffer(int struct_size, int elements, unsigned flags, unsigned format, const char *name) { + validate_sbuffer_flags(flags, name); return api_state.device.newBufferObject(struct_size, elements, flags, format, name); } diff --git a/prog/engine/drv/drv3d_DX12/dynamic_array.h b/prog/engine/drv/drv3d_DX12/dynamic_array.h new file mode 100644 index 000000000..e3e6cc2ca --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/dynamic_array.h @@ -0,0 +1,92 @@ +#pragma once + +#include +#include + + +template +class DynamicArray +{ + eastl::unique_ptr ptr; + size_t count = 0; + +public: + DynamicArray() = default; + DynamicArray(const DynamicArray &) = delete; + DynamicArray(DynamicArray &&) = default; + DynamicArray(T *p, size_t sz) : ptr{p}, count{sz} {} + 
DynamicArray(eastl::unique_ptr &&p, size_t sz) : ptr{eastl::forward>(p)}, count{sz} {} + explicit DynamicArray(size_t sz) : DynamicArray{eastl::make_unique(sz), sz} {} + DynamicArray &operator=(const DynamicArray &) = delete; + DynamicArray &operator=(DynamicArray &&other) + { + eastl::swap(ptr, other.ptr); + eastl::swap(count, other.count); + return *this; + } + + void adopt(T *new_ptr, size_t new_sz) + { + eastl::unique_ptr newPtr{new_ptr}; + eastl::swap(ptr, newPtr); + count = new_sz; + } + + T *release() + { + count = 0; + return ptr.release(); + } + + bool resize(size_t new_size) + { + if (count == new_size) + { + return false; + } + + if (0 == new_size) + { + ptr.reset(); + count = 0; + return true; + } + + auto newBlock = eastl::make_unique(new_size); + if (!newBlock) + { + return false; + } + + auto moveCount = min(count, new_size); + for (uint32_t i = 0; i < moveCount; ++i) + { + newBlock[i] = eastl::move(ptr[i]); + } + eastl::swap(ptr, newBlock); + count = new_size; + return true; + } + + T &operator[](size_t i) { return ptr[i]; } + const T &operator[](size_t i) const { return ptr[i]; } + size_t size() const { return count; } + bool empty() const { return !ptr || 0 == count; } + T *data() { return ptr.get(); } + const T *data() const { return ptr.get(); } + eastl::span asSpan() { return {data(), size()}; } + eastl::span asSpan() const { return {data(), size()}; } + eastl::span releaseAsSpan() + { + auto retValue = asSpan(); + release(); + return retValue; + } + static DynamicArray fromSpan(eastl::span span) { return {span.data(), span.size()}; } + T *begin() { return ptr.get(); } + const T *begin() const { return ptr.get(); } + const T *cbegin() const { return begin(); } + T *end() { return begin() + size(); } + const T *end() const { return begin() + size(); } + const T *cend() const { return end(); } +}; diff --git a/prog/engine/drv/drv3d_DX12/extents.h b/prog/engine/drv/drv3d_DX12/extents.h new file mode 100644 index 000000000..abfb27c63 --- /dev/null 
+++ b/prog/engine/drv/drv3d_DX12/extents.h @@ -0,0 +1,96 @@ +#pragma once + +#include +#include +#include + +#include "driver.h" +#include "util.h" + + +struct Extent2D +{ + uint32_t width; + uint32_t height; +}; + +inline bool operator==(Extent2D l, Extent2D r) { return l.width == r.width && l.height == r.height; } +inline bool operator!=(Extent2D l, Extent2D r) { return !(l == r); } + +struct Extent3D +{ + uint32_t width; + uint32_t height; + uint32_t depth; + + explicit operator Extent2D() const { return {width, height}; } +}; + +inline Extent3D operator*(Extent3D l, Extent3D r) { return {l.width * r.width, l.height * r.height, l.depth * r.depth}; } + +inline Extent3D operator/(Extent3D l, Extent3D r) { return {l.width / r.width, l.height / r.height, l.depth / r.depth}; } + +inline bool operator==(Extent3D l, Extent3D r) { return l.depth == r.depth && static_cast(l) == static_cast(r); } + +inline bool operator!=(Extent3D l, Extent3D r) { return !(l == r); } + +inline Extent3D operator>>(Extent3D value, uint32_t shift) +{ + return {value.width >> shift, value.height >> shift, value.depth >> shift}; +} + +inline Extent3D max(Extent3D a, Extent3D b) { return {max(a.width, b.width), max(a.height, b.height), max(a.depth, b.depth)}; } + +inline Extent3D min(Extent3D a, Extent3D b) { return {min(a.width, b.width), min(a.height, b.height), min(a.depth, b.depth)}; } + +inline Extent3D mip_extent(Extent3D value, uint32_t mip) { return max(value >> mip, {1, 1, 1}); } + +struct Offset2D +{ + int32_t x; + int32_t y; +}; + +inline bool operator==(Offset2D l, Offset2D r) { return l.x == r.x && l.y == r.y; } + +inline bool operator!=(Offset2D l, Offset2D r) { return !(l == r); } + +struct Offset3D +{ + int32_t x; + int32_t y; + int32_t z; + + explicit operator Offset2D() { return {x, y}; } +}; + +inline bool operator==(Offset3D l, Offset3D r) { return l.z == r.z && static_cast(l) == static_cast(r); } + +inline bool operator!=(Offset3D l, Offset3D r) { return !(l == r); } + 
+inline Extent3D operator+(Extent3D ext, Offset3D ofs) { return {ext.width + ofs.x, ext.height + ofs.y, ext.depth + ofs.z}; } + +inline D3D12_RECT clamp_rect(D3D12_RECT rect, Extent2D ext) +{ + rect.left = clamp(rect.left, 0, ext.width); + rect.right = clamp(rect.right, 0, ext.width); + rect.top = clamp(rect.top, 0, ext.height); + rect.bottom = clamp(rect.bottom, 0, ext.height); + return rect; +} + +inline const Offset3D &toOffset(const Extent3D &ext) +{ + // sanity checks + G_STATIC_ASSERT(offsetof(Extent3D, width) == offsetof(Offset3D, x)); + G_STATIC_ASSERT(offsetof(Extent3D, height) == offsetof(Offset3D, y)); + G_STATIC_ASSERT(offsetof(Extent3D, depth) == offsetof(Offset3D, z)); + return reinterpret_cast(ext); +} + +inline Extent3D align_value(const Extent3D &value, const Extent3D &alignment) +{ + return // + {align_value(value.width, alignment.width), align_value(value.height, alignment.height), + align_value(value.depth, alignment.depth)}; +} diff --git a/prog/engine/drv/drv3d_DX12/format_store.cpp b/prog/engine/drv/drv3d_DX12/format_store.cpp index d6aa0da73..9f2cf337d 100644 --- a/prog/engine/drv/drv3d_DX12/format_store.cpp +++ b/prog/engine/drv/drv3d_DX12/format_store.cpp @@ -910,7 +910,7 @@ bool FormatStore::isCopyConvertible(FormatStore other) const return false; } -const char *drv3d_dx12::dxgi_format_name(DXGI_FORMAT fmt) +const char *dxgi_format_name(DXGI_FORMAT fmt) { switch (fmt) { diff --git a/prog/engine/drv/drv3d_DX12/format_store.h b/prog/engine/drv/drv3d_DX12/format_store.h index 0e742d9d5..9be5bd0bf 100644 --- a/prog/engine/drv/drv3d_DX12/format_store.h +++ b/prog/engine/drv/drv3d_DX12/format_store.h @@ -3,6 +3,11 @@ #include <3d/dag_tex3d.h> #include +#include "driver.h" +#include "bitfield.h" +#include "tagged_types.h" + + namespace drv3d_dx12 { // stores formats and offers some utility members @@ -142,5 +147,4 @@ BEGIN_BITFIELD_TYPE(FormatStore, uint8_t) END_BITFIELD_TYPE() inline bool operator==(FormatStore l, FormatStore r) { return 
l.index == r.index; } inline bool operator!=(FormatStore l, FormatStore r) { return l.index != r.index; } -const char *dxgi_format_name(DXGI_FORMAT fmt); } // namespace drv3d_dx12 \ No newline at end of file diff --git a/prog/engine/drv/drv3d_DX12/format_traits.h b/prog/engine/drv/drv3d_DX12/format_traits.h index 255af7d5f..df51bfa81 100644 --- a/prog/engine/drv/drv3d_DX12/format_traits.h +++ b/prog/engine/drv/drv3d_DX12/format_traits.h @@ -1,5 +1,10 @@ #pragma once +#include <3d/dag_drv3dConsts.h> + +#include "driver.h" + + enum class FormatClass { UNKNOWN, diff --git a/prog/engine/drv/drv3d_DX12/frontend_state.h b/prog/engine/drv/drv3d_DX12/frontend_state.h index e1ada280a..a6f8e06b4 100644 --- a/prog/engine/drv/drv3d_DX12/frontend_state.h +++ b/prog/engine/drv/drv3d_DX12/frontend_state.h @@ -2,6 +2,13 @@ #include <3d/dag_drv3d.h> #include +#include +#include + +#include "texture.h" +#include "shader.h" +#include "resource_manager/raytrace_acceleration_structure.h" + namespace drv3d_dx12 { @@ -644,9 +651,9 @@ struct FrontendState void setStageBRegisterBuffer(uint32_t stage, uint32_t index, Sbuffer *buffer, uint32_t offset, uint32_t size) { - G_ASSERT(stage < array_size(stageResources)); + G_ASSERT(stage < countof(stageResources)); StageResourcesState &target = stageResources[stage]; - G_ASSERT(index < array_size(target.bRegisterBuffers)); + G_ASSERT(index < countof(target.bRegisterBuffers)); OSSpinlockScopedLock resourceBindingLock(resourceBindingGuard); target.markDirtyB(index, target.bRegisterBuffers[index] != buffer || target.bRegisterOffsets[index] != offset || target.bRegisterSizes[index] != size); @@ -657,9 +664,9 @@ struct FrontendState void setStageTRegisterBuffer(uint32_t stage, uint32_t index, Sbuffer *buffer) { - G_ASSERT(stage < array_size(stageResources)); + G_ASSERT(stage < countof(stageResources)); StageResourcesState &target = stageResources[stage]; - G_ASSERT(index < array_size(target.tRegisterBuffers)); + G_ASSERT(index < 
countof(target.tRegisterBuffers)); OSSpinlockScopedLock resourceBindingLock(resourceBindingGuard); if (target.tRegisterTextures[index]) { @@ -675,9 +682,9 @@ struct FrontendState void setStageURegisterBuffer(uint32_t stage, uint32_t index, Sbuffer *buffer) { - G_ASSERT(stage < array_size(stageResources)); + G_ASSERT(stage < countof(stageResources)); StageResourcesState &target = stageResources[stage]; - G_ASSERT(index < array_size(target.uRegisterBuffers)); + G_ASSERT(index < countof(target.uRegisterBuffers)); GenericBufferInterface *prevBuf = nullptr; { OSSpinlockScopedLock resourceBindingLock(resourceBindingGuard); @@ -704,9 +711,9 @@ struct FrontendState void setStageSRVTexture(uint32_t stage, uint32_t index, BaseTex *texture) { - G_ASSERT(stage < array_size(stageResources)); + G_ASSERT(stage < countof(stageResources)); StageResourcesState &target = stageResources[stage]; - G_ASSERT(index < array_size(target.tRegisterTextures)); + G_ASSERT(index < countof(target.tRegisterTextures)); OSSpinlockScopedLock resourceBindingLock(resourceBindingGuard); if (texture) { @@ -733,9 +740,9 @@ struct FrontendState void setStageSampler(uint32_t stage, uint32_t index, d3d::SamplerHandle handle) { - G_ASSERT(stage < array_size(stageResources)); + G_ASSERT(stage < countof(stageResources)); StageResourcesState &target = stageResources[stage]; - G_ASSERT(index < array_size(target.sRegisterSamplers)); + G_ASSERT(index < countof(target.sRegisterSamplers)); OSSpinlockScopedLock resourceBindingLock(resourceBindingGuard); target.markDirtyS(index, target.sRegisterSamplers[index] != handle); target.sRegisterSamplers[index] = handle; @@ -743,9 +750,9 @@ struct FrontendState void setStageUAVTexture(uint32_t stage, uint32_t index, BaseTex *texture, ImageViewState view) { - G_ASSERT(stage < array_size(stageResources)); + G_ASSERT(stage < countof(stageResources)); StageResourcesState &target = stageResources[stage]; - G_ASSERT(index < array_size(target.uRegisterTextures)); + G_ASSERT(index < 
countof(target.uRegisterTextures)); OSSpinlockScopedLock resourceBindingLock(resourceBindingGuard); if (texture) { @@ -904,7 +911,7 @@ struct FrontendState uint32_t setComputeConstRegisterCount(uint32_t cnt) { if (cnt) - cnt = clamp(nextPowerOfTwo(cnt), MIN_COMPUTE_CONST_REGISTERS, MAX_COMPUTE_CONST_REGISTERS); + cnt = clamp(get_bigger_pow2(cnt), MIN_COMPUTE_CONST_REGISTERS, MAX_COMPUTE_CONST_REGISTERS); else cnt = MIN_COMPUTE_CONST_REGISTERS; // TODO update things to allow 0 (eg shader can tell how many it needs) markDirty(DirtyState::COMPUTE_CONST_REGISTERS, registerSpaceSizes[STAGE_CS] < cnt); @@ -914,7 +921,7 @@ struct FrontendState uint32_t setVertexConstRegisterCount(uint32_t cnt) { if (cnt) - cnt = clamp(nextPowerOfTwo(cnt), VERTEX_SHADER_MIN_REGISTERS, VERTEX_SHADER_MAX_REGISTERS); + cnt = clamp(get_bigger_pow2(cnt), VERTEX_SHADER_MIN_REGISTERS, VERTEX_SHADER_MAX_REGISTERS); else cnt = VERTEX_SHADER_MIN_REGISTERS; // TODO update things to allow 0 (eg shader can tell how many it needs) markDirty(DirtyState::VERTEX_CONST_REGISTERS, registerSpaceSizes[STAGE_VS] < cnt); @@ -1310,9 +1317,9 @@ struct FrontendState void setStageTRegisterRaytraceAccelerationStructure(uint32_t stage, uint32_t index, RaytraceAccelerationStructure *as) { - G_ASSERT(stage < array_size(stageResources)); + G_ASSERT(stage < countof(stageResources)); StageResourcesState &target = stageResources[stage]; - G_ASSERT(index < array_size(target.tRegisterRaytraceAccelerataionStructures)); + G_ASSERT(index < countof(target.tRegisterRaytraceAccelerataionStructures)); OSSpinlockScopedLock resourceBindingLock(resourceBindingGuard); if (target.tRegisterTextures[index]) { diff --git a/prog/engine/drv/drv3d_DX12/fsr2_wrapper.h b/prog/engine/drv/drv3d_DX12/fsr2_wrapper.h index 49e4c2398..2299f3877 100644 --- a/prog/engine/drv/drv3d_DX12/fsr2_wrapper.h +++ b/prog/engine/drv/drv3d_DX12/fsr2_wrapper.h @@ -1,6 +1,9 @@ #pragma once #include +#include +#include <3d/dag_drv3dConsts.h> + struct 
FfxFsr2ContextDescription;
 struct FfxFsr2Context;
diff --git a/prog/engine/drv/drv3d_DX12/half_float.h b/prog/engine/drv/drv3d_DX12/half_float.h
new file mode 100644
index 000000000..e0fe42d88
--- /dev/null
+++ b/prog/engine/drv/drv3d_DX12/half_float.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#include
+#include
+
+
+// We have our own half float converter here, as we may want to tweak it a bit in the future
+namespace drv3d_dx12::half_float
+{
+
+// Bit layout of a 32 bit float: 1 sign bit, 8 exponent bits, 23 mantissa bits.
+inline constexpr uint32_t float_sign_mask = 0x80000000U;
+inline constexpr uint32_t float_exponent_mask = 0x7F800000U;
+inline constexpr uint32_t float_mantissa_mask = 0x7FFFFFU;
+inline constexpr uint32_t float_mantiassa_size = 23;
+// Used below as the exponent bias of the 32 bit float format.
+inline constexpr int32_t float_mantiassa_bias = 127;
+// Bit layout of a 16 bit half float: 1 sign bit, 5 exponent bits, 10 mantissa bits.
+inline constexpr int32_t bias = 15;
+inline constexpr int32_t exponent_size = 5;
+inline constexpr int32_t mantissa_size = 10;
+// All exponent bits set; this exponent encodes infinity and NaN.
+inline constexpr int32_t max_exponent = (1 << exponent_size) - 1;
+
+// Expands the 16 bit half float bit pattern v into a 32 bit float.
+// - Normal values and signed zero are converted exactly.
+// - Infinity stays infinity; any NaN becomes a NaN with all mantissa bits set.
+// - Subnormal halves are renormalised, every one of them is a normal 32 bit float.
+inline float convert_to_float(uint16_t v)
+{
+  const uint32_t signPart = uint32_t(v & 0x8000U) << 16;
+  const int32_t exponent = int32_t(((v & 0x7FFFU)) >> mantissa_size);
+  uint32_t mantissa = uint32_t(v) & ((1U << mantissa_size) - 1U);
+  uint32_t exponentPart = 0U;
+  uint32_t fractionPart = 0U;
+  if (exponent >= max_exponent)
+  {
+    // Only a non-zero mantissa is a NaN, a zero mantissa is infinity and has to stay one.
+    exponentPart = float_exponent_mask;
+    fractionPart = 0U != mantissa ? float_mantissa_mask : 0U;
+  }
+  else if (exponent > 0)
+  {
+    exponentPart = uint32_t(exponent - bias + float_mantiassa_bias) << float_mantiassa_size;
+    fractionPart = mantissa << (float_mantiassa_size - mantissa_size);
+  }
+  else if (0U != mantissa)
+  {
+    // Subnormal half: value is mantissa * 2^-24. Shift the mantissa up until its implicit
+    // leading one appears and lower the exponent by one for each step.
+    int32_t floatExponent = 1 - bias + float_mantiassa_bias;
+    while (0U == (mantissa & (1U << mantissa_size)))
+    {
+      mantissa <<= 1;
+      --floatExponent;
+    }
+    exponentPart = uint32_t(floatExponent) << float_mantiassa_size;
+    fractionPart = (mantissa << (float_mantiassa_size - mantissa_size)) & float_mantissa_mask;
+  }
+  const uint32_t floatBits = signPart | exponentPart | fractionPart;
+  float floatValue;
+  // memcpy instead of a pointer cast to reinterpret the bits
+  memcpy(&floatValue, &floatBits, sizeof(float));
+  return floatValue;
+}
+
+// Packs a 32 bit float into a 16 bit half float bit pattern.
+// The mantissa is truncated to its top 10 bits, there is no rounding.
+// NOTE(review): the exponent is clamped to [0, max_exponent] while the mantissa bits are kept
+// whenever the rebased exponent is >= 0, so magnitudes beyond the half range can end up as a
+// NaN pattern instead of infinity, and values with a rebased exponent of 0 are not shifted
+// into proper subnormals - confirm this is intended.
+inline uint16_t convert_from_float(float v)
+{
+  uint32_t floatBits;
+  memcpy(&floatBits, &v, sizeof(float));
+
+  int32_t exponent = ((floatBits & float_exponent_mask) >> float_mantiassa_size) - float_mantiassa_bias + bias;
+  uint32_t exponentPart = clamp(exponent, 0, max_exponent) << mantissa_size;
+  uint32_t signPart = ((floatBits & float_sign_mask) >> 16);
+  uint32_t fractionPart = exponent >= 0 ?
((floatBits & float_mantissa_mask) >> (float_mantiassa_size - mantissa_size)) : 0U;
+
+  return signPart | exponentPart | fractionPart;
+}
+
+} // namespace drv3d_dx12::half_float
diff --git a/prog/engine/drv/drv3d_DX12/host_device_shared_memory_region.h b/prog/engine/drv/drv3d_DX12/host_device_shared_memory_region.h
new file mode 100644
index 000000000..7ae0e3409
--- /dev/null
+++ b/prog/engine/drv/drv3d_DX12/host_device_shared_memory_region.h
@@ -0,0 +1,94 @@
+#pragma once
+
+#include
+
+#include "driver.h"
+#include "value_range.h"
+
+
+namespace drv3d_dx12
+{
+
+// Converts a ValueRange with an inclusive last element (front()..back()) into a D3D12_RANGE,
+// whose End is one past the last element.
+template
+inline D3D12_RANGE asDx12Range(ValueRange range)
+{
+  return {static_cast(range.front()), static_cast(range.back() + 1)};
+}
+
+// Describes a slice of a buffer that is addressable by both the CPU (pointer) and the GPU
+// (gpuPointer), together with helpers that pass read / written ranges to Map / Unmap.
+struct HostDeviceSharedMemoryRegion
+{
+  enum class Source
+  {
+    TEMPORARY, // TODO: may split into temporary upload and readback (bidirectional makes no sense)
+    PERSISTENT_UPLOAD,
+    PERSISTENT_READ_BACK,
+    PERSISTENT_BIDIRECTIONAL,
+    PUSH_RING, // illegal to free
+  };
+  // buffer object that supplies the memory
+  ID3D12Resource *buffer = nullptr;
+#if _TARGET_XBOX
+  // on xbox gpu and cpu pointer are the same
+  union
+  {
+    D3D12_GPU_VIRTUAL_ADDRESS gpuPointer = 0;
+    uint8_t *pointer;
+  };
+#else
+  // offset into gpu virtual memory, including offset!
+  D3D12_GPU_VIRTUAL_ADDRESS gpuPointer = 0;
+  // pointer into cpu visible memory, offset is already applied (pointer - offset yields address
+  // base of the buffer)
+  uint8_t *pointer = nullptr;
+#endif
+  // offset range of this allocation
+  // gpuPointer and pointer already have the start of the range added to it
+  ValueRange range;
+  Source source = Source::TEMPORARY;
+
+  // True when the region refers to a buffer.
+  explicit constexpr operator bool() const { return nullptr != buffer; }
+  constexpr bool isTemporary() const { return Source::TEMPORARY == source; }
+  // Reports sub_range (relative to the start of this region) as written by the CPU:
+  // maps with an empty read range and unmaps with the written range.
+  void flushRegion(ValueRange sub_range) const
+  {
+    D3D12_RANGE r = {};
+    uint8_t *ptr = nullptr;
+    buffer->Map(0, &r, reinterpret_cast(&ptr));
+    G_ASSERT(ptr + range.front() == pointer);
+    r = asDx12Range(sub_range.shiftBy(range.front()));
+    buffer->Unmap(0, &r);
+  }
+  // Reports sub_range (relative to the start of this region) as about to be read by the CPU:
+  // maps with the read range and unmaps with an empty written range.
+  void invalidateRegion(ValueRange sub_range) const
+  {
+    D3D12_RANGE r = asDx12Range(sub_range.shiftBy(range.front()));
+    uint8_t *ptr = nullptr;
+    buffer->Map(0, &r, reinterpret_cast(&ptr));
+    // Map yields the base of the buffer independent of the read range, so this is the same
+    // check the other three methods do; sub_range must not take part in it.
+    G_ASSERT(ptr + range.front() == pointer);
+    r.Begin = r.End = 0;
+    buffer->Unmap(0, &r);
+  }
+  // Same as flushRegion, but for the whole region.
+  void flush() const
+  {
+    D3D12_RANGE r = {};
+    uint8_t *ptr = nullptr;
+    buffer->Map(0, &r, reinterpret_cast(&ptr));
+    G_ASSERT(ptr + range.front() == pointer);
+    r = asDx12Range(range);
+    buffer->Unmap(0, &r);
+  }
+  // Same as invalidateRegion, but for the whole region.
+  void invalidate() const
+  {
+    D3D12_RANGE r = asDx12Range(range);
+    uint8_t *ptr = nullptr;
+    buffer->Map(0, &r, reinterpret_cast(&ptr));
+    G_ASSERT(ptr + range.front() == pointer);
+    r.Begin = r.End = 0;
+    buffer->Unmap(0, &r);
+  }
+  // Reinterprets the CPU pointer of the region as a pointer to T.
+  template
+  T *as() const
+  {
+    return reinterpret_cast(pointer);
+  }
+};
+
+} // namespace drv3d_dx12
diff --git a/prog/engine/drv/drv3d_DX12/image_global_subresource_id.h b/prog/engine/drv/drv3d_DX12/image_global_subresource_id.h
new file mode 100644
index 000000000..e5ad7f740
--- /dev/null
+++ b/prog/engine/drv/drv3d_DX12/image_global_subresource_id.h
@@ -0,0 +1,200 @@
+#pragma once
+
+#include
+
+#include "tagged_types.h"
+#include "value_range.h"
+
+
+namespace drv3d_dx12
+{
+
+// Identifies an image subresource by a plain 32 bit index.
+// A default constructed id holds invalid_id and reports isValid() == false.
+class ImageGlobalSubresouceId
+{
+protected:
+  // Marker for "no subresource"; make() also uses it as the mask of bits an index may occupy.
+  static constexpr uint32_t invalid_id = 0x00FFFFFF;
+
+  uint32_t value = invalid_id;
+
+  constexpr ImageGlobalSubresouceId(uint32_t v) : value{v} {}
+
+  friend class ExtendedImageGlobalSubresouceId;
+
+public:
+  constexpr ImageGlobalSubresouceId() = default;
+  ~ImageGlobalSubresouceId() = default;
+  constexpr ImageGlobalSubresouceId(const ImageGlobalSubresouceId &) = default;
+  ImageGlobalSubresouceId &operator=(const ImageGlobalSubresouceId &) = default;
+
+  constexpr bool isValid() const { return invalid_id != value; }
+  constexpr uint32_t index() const { return value; }
+
+  // Creates an id from a raw index, asserting that it has no bits outside of invalid_id.
+  static ImageGlobalSubresouceId make(uint32_t value)
+  {
+    G_ASSERT(value == (value & invalid_id));
+    return {value};
+  }
+
+  // constexpr variant of make, without the range assert.
+  static constexpr ImageGlobalSubresouceId makec(uint32_t value) { return {value}; }
+
+  static constexpr ImageGlobalSubresouceId make_invalid() { return {}; }
+
+  // The arithmetic operators below assert that the id is valid before they touch it; they
+  // do not check whether the result is still in range.
+  ImageGlobalSubresouceId &operator+=(uint32_t r)
+  {
+    G_ASSERT(isValid());
+    value += r;
+    return *this;
+  }
+
+  ImageGlobalSubresouceId &operator-=(uint32_t r)
+  {
+    G_ASSERT(isValid());
+    value -= r;
+    return *this;
+  }
+
+  ImageGlobalSubresouceId &operator++()
+  {
+    G_ASSERT(isValid());
+    ++value;
+    return *this;
+  }
+
+  // NOTE(review): unlike a conventional post-increment this is const, leaves *this untouched
+  // and returns the incremented copy (same result as *this + 1) - confirm callers expect this.
+  ImageGlobalSubresouceId operator++(int) const
+  {
+    G_ASSERT(isValid());
+    auto copy = *this;
+    return ++copy;
+  }
+
+  ImageGlobalSubresouceId &operator--()
+  {
+    G_ASSERT(isValid());
+    --value;
+    return *this;
+  }
+
+  // NOTE(review): same as the post-increment above, leaves *this untouched and returns the
+  // decremented copy.
+  ImageGlobalSubresouceId operator--(int) const
+  {
+    G_ASSERT(isValid());
+    auto copy = *this;
+    return --copy;
+  }
+
+  operator DagorSafeArg() const { return {index()}; }
+
+  // Distance of this id to base, expressed as a SubresourceIndex.
+  constexpr SubresourceIndex toSubresouceIndex(ImageGlobalSubresouceId base) const
+  {
+    return SubresourceIndex::make(index() - base.index());
+  }
+};
+
+inline constexpr ImageGlobalSubresouceId swapchain_color_texture_global_id = ImageGlobalSubresouceId::makec(0);
+inline constexpr ImageGlobalSubresouceId
swapchain_secondary_color_texture_global_id = ImageGlobalSubresouceId::makec(1); +inline constexpr ImageGlobalSubresouceId first_dynamic_texture_global_id = ImageGlobalSubresouceId::makec(2); + +inline constexpr ImageGlobalSubresouceId operator+(const ImageGlobalSubresouceId &l, uint32_t r) +{ + return ImageGlobalSubresouceId::makec(l.index() + r); +} + +inline constexpr ImageGlobalSubresouceId operator+(const ImageGlobalSubresouceId &l, SubresourceIndex r) +{ + return ImageGlobalSubresouceId::makec(l.index() + r.index()); +} + +inline constexpr ImageGlobalSubresouceId operator-(const ImageGlobalSubresouceId &l, uint32_t r) +{ + return ImageGlobalSubresouceId::makec(l.index() - r); +} + +inline constexpr ImageGlobalSubresouceId operator-(const ImageGlobalSubresouceId &l, SubresourceIndex r) +{ + return ImageGlobalSubresouceId::makec(l.index() - r.index()); +} + +inline constexpr size_t operator-(const ImageGlobalSubresouceId &l, const ImageGlobalSubresouceId &r) { return l.index() - r.index(); } + +inline constexpr bool operator==(const ImageGlobalSubresouceId &l, const ImageGlobalSubresouceId &r) { return l.index() == r.index(); } + +inline constexpr bool operator!=(const ImageGlobalSubresouceId &l, const ImageGlobalSubresouceId &r) { return l.index() != r.index(); } + +inline constexpr bool operator<(const ImageGlobalSubresouceId &l, const ImageGlobalSubresouceId &r) { return l.index() < r.index(); } + +inline constexpr bool operator<=(const ImageGlobalSubresouceId &l, const ImageGlobalSubresouceId &r) { return l.index() <= r.index(); } + +inline constexpr bool operator>(const ImageGlobalSubresouceId &l, const ImageGlobalSubresouceId &r) { return l.index() > r.index(); } + +inline constexpr bool operator>=(const ImageGlobalSubresouceId &l, const ImageGlobalSubresouceId &r) { return l.index() >= r.index(); } + +using BareBoneImageGlobalSubresouceIdRange = ValueRange; + +class ExtendedImageGlobalSubresouceId +{ + using BareBoneType = ImageGlobalSubresouceId; + 
static constexpr uint32_t invalid_id = BareBoneType::invalid_id;
+  // The two topmost bits of value carry status flags.
+  static constexpr uint32_t static_texture_bit = 1u << 31;
+  static constexpr uint32_t report_transitions_bit = 1u << 30;
+  // Bits of value that hold the index / everything above them.
+  static constexpr uint32_t index_mask = invalid_id;
+  static constexpr uint32_t status_mask = ~index_mask;
+
+  uint32_t value = invalid_id;
+
+  constexpr ExtendedImageGlobalSubresouceId(uint32_t v) : value{v} {}
+
+public:
+  constexpr ExtendedImageGlobalSubresouceId() = default;
+  ~ExtendedImageGlobalSubresouceId() = default;
+  constexpr ExtendedImageGlobalSubresouceId(const ExtendedImageGlobalSubresouceId &) = default;
+  ExtendedImageGlobalSubresouceId &operator=(const ExtendedImageGlobalSubresouceId &) = default;
+
+  // Drops the status bits and yields the plain id.
+  constexpr ImageGlobalSubresouceId asBareBone() const { return {index()}; }
+
+  constexpr operator ImageGlobalSubresouceId() const { return asBareBone(); }
+
+  // Wraps a plain id, no status bit is set.
+  static constexpr ExtendedImageGlobalSubresouceId make(ImageGlobalSubresouceId v) { return {v.index()}; }
+
+  // Wraps a raw index, no status bit is set.
+  static ExtendedImageGlobalSubresouceId make(uint32_t v)
+  {
+    // v has to be a pure index; anything in the status bits would be read back as a flag.
+    G_ASSERT(0 == (v & status_mask));
+    return {v};
+  }
+
+  // Wraps a raw index and marks it as static texture.
+  static ExtendedImageGlobalSubresouceId make_static(uint32_t v)
+  {
+    // v has to be a pure index; anything in the status bits would be read back as a flag.
+    G_ASSERT(0 == (v & status_mask));
+    return {v | static_texture_bit};
+  }
+
+  void setStatic() { value |= static_texture_bit; }
+  void setNonStatic() { value &= ~static_texture_bit; }
+
+  void enableTransitionReporting() { value |= report_transitions_bit; }
+  void disableTransitionReporting() { value &= ~report_transitions_bit; }
+
+  // Validity and index only look at the index bits, the status bits are ignored.
+  constexpr bool isValid() const { return invalid_id != (value & index_mask); }
+  constexpr uint32_t index() const { return value & index_mask; }
+  constexpr bool isStatic() const { return 0 != (value & static_texture_bit); }
+  constexpr bool shouldReportTransitions() const { return 0 != (value & report_transitions_bit); }
+
+  // Offsets the id and keeps the status bits.
+  // NOTE(review): the offset is added to the whole value, an index that grows past index_mask
+  // would carry into the status bits.
+  constexpr ExtendedImageGlobalSubresouceId add(uint32_t v) const { return {value + v}; }
+
+  constexpr ExtendedImageGlobalSubresouceId add(SubresourceCount v) const {
return {value + v.count()}; } + + constexpr ExtendedImageGlobalSubresouceId sub(uint32_t v) const { return {value - v}; } + + operator DagorSafeArg() const { return {index()}; } + + constexpr SubresourceIndex toSubresouceIndex(ImageGlobalSubresouceId base) const + { + return SubresourceIndex::make(index() - base.index()); + } +}; + +inline constexpr ExtendedImageGlobalSubresouceId operator+(const ExtendedImageGlobalSubresouceId &l, uint32_t r) { return l.add(r); } + +inline constexpr ExtendedImageGlobalSubresouceId operator-(const ExtendedImageGlobalSubresouceId &l, uint32_t r) { return l.sub(r); } + +using ExtendedImageGlobalSubresouceIdRange = ValueRange; + +} // namespace drv3d_dx12 diff --git a/prog/engine/drv/drv3d_DX12/image_view_state.h b/prog/engine/drv/drv3d_DX12/image_view_state.h new file mode 100644 index 000000000..a0beb2161 --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/image_view_state.h @@ -0,0 +1,494 @@ +#pragma once + +#include "bitfield.h" +#include "format_store.h" + + +namespace drv3d_dx12 +{ + +BEGIN_BITFIELD_TYPE(ImageViewState, uint64_t) + enum Type + { + INVALID, // 0! 
+ SRV, + UAV, + RTV, + DSV_RW, + DSV_CONST + }; + enum + { + WORD_SIZE = sizeof(uint64_t) * 8, + SAMPLE_STENCIL_BITS = 1, + SAMPLE_STENCIL_SHIFT = 0, + TYPE_BITS = 3, + TYPE_SHIFT = SAMPLE_STENCIL_BITS + SAMPLE_STENCIL_SHIFT, + IS_CUBEMAP_BITS = 1, + IS_CUBEMAP_SHIFT = TYPE_BITS + TYPE_SHIFT, + IS_ARRAY_BITS = 1, + IS_ARRAY_SHIFT = IS_CUBEMAP_BITS + IS_CUBEMAP_SHIFT, + FORMAT_BITS = FormatStore::BITS + 1, + FORMAT_SHIFT = IS_ARRAY_SHIFT + IS_ARRAY_BITS, + MIPMAP_OFFSET_BITS = BitsNeeded<15>::VALUE, + MIPMAP_OFFSET_SHIFT = FORMAT_SHIFT + FORMAT_BITS, + MIPMAP_RANGE_OFFSET = 1, + MIPMAP_RANGE_BITS = BitsNeeded<16 - MIPMAP_RANGE_OFFSET>::VALUE, + MIPMAP_RANGE_SHIFT = MIPMAP_OFFSET_SHIFT + MIPMAP_OFFSET_BITS, + // automatic assign left over space to array range def + ARRAY_DATA_SIZE = WORD_SIZE - MIPMAP_RANGE_SHIFT - MIPMAP_RANGE_BITS, + ARRAY_OFFSET_BITS = (ARRAY_DATA_SIZE / 2) + (ARRAY_DATA_SIZE % 2), + ARRAY_OFFSET_SHIFT = MIPMAP_RANGE_SHIFT + MIPMAP_RANGE_BITS, + ARRAY_RANGE_OFFSET = 1, + ARRAY_RANGE_BITS = ARRAY_DATA_SIZE / 2, + ARRAY_RANGE_SHIFT = (ARRAY_OFFSET_SHIFT + ARRAY_OFFSET_BITS) + }; + ADD_BITFIELD_MEMBER(sampleStencil, SAMPLE_STENCIL_SHIFT, SAMPLE_STENCIL_BITS) + ADD_BITFIELD_MEMBER(type, TYPE_SHIFT, TYPE_BITS) + ADD_BITFIELD_MEMBER(isCubemap, IS_CUBEMAP_SHIFT, IS_CUBEMAP_BITS) + ADD_BITFIELD_MEMBER(isArray, IS_ARRAY_SHIFT, IS_ARRAY_BITS) + ADD_BITFIELD_MEMBER(format, FORMAT_SHIFT, FORMAT_BITS) + ADD_BITFIELD_MEMBER(mipmapOffset, MIPMAP_OFFSET_SHIFT, MIPMAP_OFFSET_BITS); + ADD_BITFIELD_MEMBER(mipmapRange, MIPMAP_RANGE_SHIFT, MIPMAP_RANGE_BITS) + ADD_BITFIELD_MEMBER(arrayOffset, ARRAY_OFFSET_SHIFT, ARRAY_OFFSET_BITS) + ADD_BITFIELD_MEMBER(arrayRange, ARRAY_RANGE_SHIFT, ARRAY_RANGE_BITS) + bool isValid() const + { + return uint64_t(*this) != 0; + } // since type can't be 0/UNKNOWN, all bits can't be 0. 
comparing whole machine word is faster than extract type + explicit operator bool() const { return isValid(); } + void setType(Type tp) { type = tp; } + Type getType() const { return static_cast(static_cast(type)); } + void setRTV() { setType(RTV); } + void setDSV(bool as_const) { setType(as_const ? DSV_CONST : DSV_RW); } + void setSRV() { setType(SRV); } + void setUAV() { setType(UAV); } + bool isRTV() const { return static_cast(type) == RTV; } + bool isDSV() const { return static_cast(type) == DSV_RW || static_cast(type) == DSV_CONST; } + bool isSRV() const { return static_cast(type) == SRV; } + bool isUAV() const { return static_cast(type) == UAV; } + + // TODO check cube/array handling + + // TODO: is d24/d32 always planar? + D3D12_SHADER_RESOURCE_VIEW_DESC asSRVDesc(D3D12_RESOURCE_DIMENSION dim, bool is_multisampled) const + { + D3D12_SHADER_RESOURCE_VIEW_DESC result; + const auto fmt = getFormat(); + result.Format = fmt.asDxGiFormat(); + uint32_t planeSlice = 0; + if (DXGI_FORMAT_D24_UNORM_S8_UINT == result.Format) + { + if (0 == sampleStencil) + { + result.Format = DXGI_FORMAT_R24_UNORM_X8_TYPELESS; + } + else + { + result.Format = DXGI_FORMAT_X24_TYPELESS_G8_UINT; + planeSlice = fmt.getPlanes().count() > 1 ? 1 : 0; + } + } + else if (DXGI_FORMAT_D32_FLOAT_S8X24_UINT == result.Format) + { + if (0 == sampleStencil) + { + result.Format = DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS; + } + else + { + result.Format = DXGI_FORMAT_X32_TYPELESS_G8X24_UINT; + planeSlice = fmt.getPlanes().count() > 1 ? 
1 : 0; + } + } + else if (DXGI_FORMAT_D16_UNORM == result.Format) + { + result.Format = DXGI_FORMAT_R16_UNORM; + } + else if (DXGI_FORMAT_D32_FLOAT == result.Format) + { + result.Format = DXGI_FORMAT_R32_FLOAT; + } + result.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING; + switch (dim) + { + case D3D12_RESOURCE_DIMENSION_BUFFER: + case D3D12_RESOURCE_DIMENSION_UNKNOWN: fatal("Usage error!"); return {}; + case D3D12_RESOURCE_DIMENSION_TEXTURE1D: + if (isArray) + { + result.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE1DARRAY; + auto &target = result.Texture1DArray; + target.MostDetailedMip = getMipBase().index(); + target.MipLevels = getMipCount(); + target.FirstArraySlice = getArrayBase().index(); + target.ArraySize = getArrayCount(); + target.ResourceMinLODClamp = 0.f; + } + else + { + result.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE1D; + auto &target = result.Texture1D; + target.MostDetailedMip = getMipBase().index(); + target.MipLevels = getMipCount(); + target.ResourceMinLODClamp = 0.f; + } + break; + case D3D12_RESOURCE_DIMENSION_TEXTURE2D: + if (isCubemap) + { + if (isArray) + { + result.ViewDimension = D3D12_SRV_DIMENSION_TEXTURECUBEARRAY; + auto &target = result.TextureCubeArray; + target.MostDetailedMip = getMipBase().index(); + target.MipLevels = getMipCount(); + target.First2DArrayFace = getArrayBase().index(); + target.NumCubes = getArrayCount() / 6; + target.ResourceMinLODClamp = 0.f; + } + else + { + result.ViewDimension = D3D12_SRV_DIMENSION_TEXTURECUBE; + auto &target = result.TextureCube; + target.MostDetailedMip = getMipBase().index(); + target.MipLevels = getMipCount(); + target.ResourceMinLODClamp = 0.f; + } + } + else + { + if (isArray) + { + result.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2DARRAY; + if (!is_multisampled) + { + auto &target = result.Texture2DArray; + target.MostDetailedMip = getMipBase().index(); + target.MipLevels = getMipCount(); + target.FirstArraySlice = getArrayBase().index(); + target.ArraySize = 
getArrayCount(); + target.PlaneSlice = planeSlice; + target.ResourceMinLODClamp = 0.f; + } + else + { + auto &target = result.Texture2DMSArray; + target.FirstArraySlice = getArrayBase().index(); + target.ArraySize = getArrayCount(); + } + } + else + { + result.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D; + if (!is_multisampled) + { + auto &target = result.Texture2D; + target.MostDetailedMip = getMipBase().index(); + target.MipLevels = getMipCount(); + target.PlaneSlice = planeSlice; + target.ResourceMinLODClamp = 0.f; + } + else + { + auto &target = result.Texture2DMS; + G_UNUSED(target); + } + } + } + break; + case D3D12_RESOURCE_DIMENSION_TEXTURE3D: + result.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE3D; + auto &target = result.Texture3D; + target.MostDetailedMip = getMipBase().index(); + target.MipLevels = getMipCount(); + target.ResourceMinLODClamp = 0.f; + break; + } + return result; + } + + D3D12_UNORDERED_ACCESS_VIEW_DESC asUAVDesc(D3D12_RESOURCE_DIMENSION dim) const + { + D3D12_UNORDERED_ACCESS_VIEW_DESC result; + result.Format = getFormat().asDxGiFormat(); + switch (dim) + { + case D3D12_RESOURCE_DIMENSION_BUFFER: + case D3D12_RESOURCE_DIMENSION_UNKNOWN: fatal("Usage error!"); return {}; + case D3D12_RESOURCE_DIMENSION_TEXTURE1D: + if (isArray) + { + result.ViewDimension = D3D12_UAV_DIMENSION_TEXTURE1DARRAY; + auto &target = result.Texture1DArray; + target.MipSlice = getMipBase().index(); + target.FirstArraySlice = getArrayBase().index(); + target.ArraySize = getArrayCount(); + } + else + { + result.ViewDimension = D3D12_UAV_DIMENSION_TEXTURE1D; + auto &target = result.Texture1D; + target.MipSlice = getMipBase().index(); + } + break; + case D3D12_RESOURCE_DIMENSION_TEXTURE2D: + // Array and cube are the same for UAV + if (isArray || isCubemap) + { + result.ViewDimension = D3D12_UAV_DIMENSION_TEXTURE2DARRAY; + auto &target = result.Texture2DArray; + target.MipSlice = getMipBase().index(); + target.FirstArraySlice = getArrayBase().index(); + 
target.ArraySize = getArrayCount(); + target.PlaneSlice = 0; + } + else + { + result.ViewDimension = D3D12_UAV_DIMENSION_TEXTURE2D; + auto &target = result.Texture2D; + target.MipSlice = getMipBase().index(); + target.PlaneSlice = 0; + } + break; + case D3D12_RESOURCE_DIMENSION_TEXTURE3D: + result.ViewDimension = D3D12_UAV_DIMENSION_TEXTURE3D; + auto &target = result.Texture3D; + target.MipSlice = getMipBase().index(); + target.FirstWSlice = getArrayBase().index(); + target.WSize = getArrayCount(); + break; + } + return result; + } + + D3D12_RENDER_TARGET_VIEW_DESC asRTVDesc(D3D12_RESOURCE_DIMENSION dim, bool is_multisampled) const + { + D3D12_RENDER_TARGET_VIEW_DESC result; + result.Format = getFormat().asDxGiFormat(); + switch (dim) + { + case D3D12_RESOURCE_DIMENSION_BUFFER: + case D3D12_RESOURCE_DIMENSION_UNKNOWN: fatal("Usage error!"); return {}; + case D3D12_RESOURCE_DIMENSION_TEXTURE1D: + if (isArray) + { + result.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE1DARRAY; + auto &target = result.Texture1DArray; + target.MipSlice = getMipBase().index(); + target.FirstArraySlice = getArrayBase().index(); + target.ArraySize = getArrayCount(); + } + else + { + result.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE1D; + auto &target = result.Texture1D; + target.MipSlice = getMipBase().index(); + } + break; + case D3D12_RESOURCE_DIMENSION_TEXTURE2D: + if (isArray || isCubemap) + { + result.ViewDimension = is_multisampled ? D3D12_RTV_DIMENSION_TEXTURE2DMSARRAY : D3D12_RTV_DIMENSION_TEXTURE2DARRAY; + if (!is_multisampled) + { + auto &target = result.Texture2DArray; + target.MipSlice = getMipBase().index(); + target.FirstArraySlice = getArrayBase().index(); + target.ArraySize = getArrayCount(); + target.PlaneSlice = 0; + } + else + { + auto &target = result.Texture2DMSArray; + target.ArraySize = getArrayCount(); + target.FirstArraySlice = getArrayBase().index(); + } + } + else + { + result.ViewDimension = is_multisampled ? 
D3D12_RTV_DIMENSION_TEXTURE2DMS : D3D12_RTV_DIMENSION_TEXTURE2D; + if (!is_multisampled) + { + auto &target = result.Texture2D; + target.MipSlice = getMipBase().index(); + target.PlaneSlice = 0; + } + else + { + auto &target = result.Texture2DMS; + G_UNUSED(target); + } + } + break; + case D3D12_RESOURCE_DIMENSION_TEXTURE3D: + result.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE3D; + auto &target = result.Texture3D; + target.MipSlice = getMipBase().index(); + target.FirstWSlice = getArrayBase().index(); + target.WSize = getArrayCount(); + break; + } + + return result; + } + + D3D12_DEPTH_STENCIL_VIEW_DESC asDSVDesc(D3D12_RESOURCE_DIMENSION dim, bool is_multisampled) const + { + D3D12_DEPTH_STENCIL_VIEW_DESC result; + auto fmt = getFormat(); + result.Format = fmt.asDxGiFormat(); + result.Flags = D3D12_DSV_FLAG_NONE; + if (getType() == DSV_CONST) + { + if (fmt.isDepth()) + result.Flags |= D3D12_DSV_FLAG_READ_ONLY_DEPTH; + if (fmt.isStencil()) + result.Flags |= D3D12_DSV_FLAG_READ_ONLY_STENCIL; + } + switch (dim) + { + case D3D12_RESOURCE_DIMENSION_BUFFER: + case D3D12_RESOURCE_DIMENSION_UNKNOWN: fatal("Usage error!"); return {}; + case D3D12_RESOURCE_DIMENSION_TEXTURE1D: + if (isArray) + { + result.ViewDimension = D3D12_DSV_DIMENSION_TEXTURE1DARRAY; + auto &target = result.Texture1DArray; + target.MipSlice = getMipBase().index(); + target.FirstArraySlice = getArrayBase().index(); + target.ArraySize = getArrayCount(); + } + else + { + result.ViewDimension = D3D12_DSV_DIMENSION_TEXTURE1D; + auto &target = result.Texture1D; + target.MipSlice = getMipBase().index(); + } + break; + case D3D12_RESOURCE_DIMENSION_TEXTURE2D: + if (isArray || isCubemap) + { + result.ViewDimension = is_multisampled ? 
D3D12_DSV_DIMENSION_TEXTURE2DMSARRAY : D3D12_DSV_DIMENSION_TEXTURE2DARRAY; + if (!is_multisampled) + { + auto &target = result.Texture2DArray; + target.MipSlice = getMipBase().index(); + target.FirstArraySlice = getArrayBase().index(); + target.ArraySize = getArrayCount(); + } + else + { + auto &target = result.Texture2DMSArray; + target.FirstArraySlice = getArrayBase().index(); + target.ArraySize = getArrayCount(); + } + } + else + { + result.ViewDimension = is_multisampled ? D3D12_DSV_DIMENSION_TEXTURE2DMS : D3D12_DSV_DIMENSION_TEXTURE2D; + if (!is_multisampled) + { + auto &target = result.Texture2D; + target.MipSlice = getMipBase().index(); + } + else + { + auto &target = result.Texture2DMS; + G_UNUSED(target); + } + } + break; + case D3D12_RESOURCE_DIMENSION_TEXTURE3D: fatal("DX12: Volume depth stencil view not supported"); break; + } + return result; + } + + void setFormat(FormatStore fmt) { format = fmt.index; } + FormatStore getFormat() const { return FormatStore(format); } + void setMipBase(MipMapIndex u) { mipmapOffset = u.index(); } + MipMapIndex getMipBase() const { return MipMapIndex::make(mipmapOffset); } + void setMipCount(uint8_t u) { mipmapRange = u - MIPMAP_RANGE_OFFSET; } + uint8_t getMipCount() const { return MIPMAP_RANGE_OFFSET + mipmapRange; } + void setSingleMipMapRange(MipMapIndex index) + { + setMipBase(index); + G_STATIC_ASSERT(1 == MIPMAP_RANGE_OFFSET); + mipmapRange = 0; + } + void setMipMapRange(MipMapIndex index, uint32_t count) + { + setMipBase(index); + setMipCount(count); + } + void setMipMapRange(MipMapRange range) + { + setMipBase(range.front()); + setMipCount(range.size()); + } + MipMapRange getMipRange() const { return MipMapRange::make(getMipBase(), getMipCount()); } + void setArrayBase(ArrayLayerIndex u) { arrayOffset = u.index(); } + ArrayLayerIndex getArrayBase() const { return ArrayLayerIndex::make(arrayOffset); } + void setArrayCount(uint16_t u) { arrayRange = u - ARRAY_RANGE_OFFSET; } + uint16_t getArrayCount() const { 
return ARRAY_RANGE_OFFSET + (uint32_t)arrayRange; } + void setArrayRange(ArrayLayerRange range) + { + setArrayBase(range.front()); + setArrayCount(range.size()); + } + void setSingleArrayRange(ArrayLayerIndex index) + { + setArrayBase(index); + G_STATIC_ASSERT(1 == ARRAY_RANGE_OFFSET); + arrayRange = 0; + } + ArrayLayerRange getArrayRange() const { return ArrayLayerRange::make(getArrayBase(), getArrayCount()); } + void setSingleDepthLayer(uint16_t base) + { + setArrayBase(ArrayLayerIndex::make(base)); + setArrayCount(1); + } + void setDepthLayerRange(uint16_t base, uint16_t count) + { + setArrayBase(ArrayLayerIndex::make(base)); + setArrayCount(count); + } + FormatPlaneIndex getPlaneIndex() const + { + return FormatPlaneIndex::make((sampleStencil && getFormat().getPlanes().count() > 1) ? 1 : 0); + } + template + // NOTE: does not take plane index into account + void iterateSubresources(D3D12_RESOURCE_DIMENSION dim, MipMapCount mip_per_array, T clb) + { + if (D3D12_RESOURCE_DIMENSION_TEXTURE3D == dim) + { + for (auto m : getMipRange()) + { + clb(SubresourceIndex::make(m.index())); + } + } + else + { + for (auto a : getArrayRange()) + { + for (auto m : getMipRange()) + { + clb(calculate_subresource_index(m, a, mip_per_array)); + } + } + } + } +END_BITFIELD_TYPE() + +inline bool operator==(ImageViewState l, ImageViewState r) { return l.wrapper.value == r.wrapper.value; } +inline bool operator!=(ImageViewState l, ImageViewState r) { return l.wrapper.value != r.wrapper.value; } +inline bool operator<(ImageViewState l, ImageViewState r) { return l.wrapper.value < r.wrapper.value; } + +struct ImageViewInfo +{ + D3D12_CPU_DESCRIPTOR_HANDLE handle; + ImageViewState state; +}; + +} // namespace drv3d_dx12 diff --git a/prog/engine/drv/drv3d_DX12/jamfile b/prog/engine/drv/drv3d_DX12/jamfile index 61cc38b07..b7fcade1f 100644 --- a/prog/engine/drv/drv3d_DX12/jamfile +++ b/prog/engine/drv/drv3d_DX12/jamfile @@ -24,7 +24,7 @@ Sources = pipeline_cache.cpp bindless.cpp 
resource_memory_heap.cpp - resource_memory_heap_heap_components.cpp + resource_manager/heap_components.cpp resource_update_buffer.cpp resource_usage_debugger.cpp pipeline/blk_cache.cpp @@ -147,4 +147,8 @@ UseProgLibs += gameLibs/dxil ; +if $(PlatformSpec) = clang && ! $(CLangVer) in 15.0.7 14.0.6 12.0.1 11.0.0 10.0.0 { # >= 16 implied + CPPopt += -Wno-enum-constexpr-conversion ; +} + include $(Root)/prog/_jBuild/build.jam ; diff --git a/prog/engine/drv/drv3d_DX12/pipeline.cpp b/prog/engine/drv/drv3d_DX12/pipeline.cpp index cc1d09936..2db63e058 100644 --- a/prog/engine/drv/drv3d_DX12/pipeline.cpp +++ b/prog/engine/drv/drv3d_DX12/pipeline.cpp @@ -1,5 +1,8 @@ #include "device.h" #include "EASTL/functional.h" +#include + +#include "render_target_mask_util.h" using namespace drv3d_dx12; @@ -663,6 +666,100 @@ bool PipelineVariant::loadMesh(ID3D12Device2 *device, backend::ShaderModuleManag #endif } +void PipelineVariant::errorPrintBlkString(const BasePipeline &base, const InputLayout &input_layout, bool is_wire_frame, + const RenderStateSystem::StaticState &static_state, const FramebufferLayout &fb_layout, D3D12_PRIMITIVE_TOPOLOGY_TYPE top) +{ + // This is probably not the fastest and compactest way to generate the BLK data for production, but if we end up here, we + // are already in the "slow" path. As we are going to pay (probably a lot more) for compiling the pipeline just after this + // reporting. 
+ DataBlock cacheOutBlock; + if (auto inputLayoutOutBlock = cacheOutBlock.addNewBlock("input_layouts")) + { + pipeline::DataBlockEncodeVisitor visitor{*inputLayoutOutBlock}; + visitor.encode(input_layout); + } + + if (auto renderStateOutBlock = cacheOutBlock.addNewBlock("render_states")) + { + pipeline::DataBlockEncodeVisitor visitor{*renderStateOutBlock}; + visitor.encode(static_state); + } + + if (auto framebufferLayoutOutBlock = cacheOutBlock.addNewBlock("framebuffer_layouts")) + { + pipeline::DataBlockEncodeVisitor visitor{*framebufferLayoutOutBlock}; + visitor.encode(fb_layout); + } + + if (auto graphicsPipelinesOutBlock = cacheOutBlock.addNewBlock("graphics_pipelines")) + { + auto baseInfo = base.getIdentifier(); + GraphicsPipelineVariantState variantInfo; + variantInfo.framebufferLayoutIndex = 0; + variantInfo.staticRenderStateIndex = 0; + variantInfo.isWireFrame = is_wire_frame; + variantInfo.topology = top; + variantInfo.inputLayoutIndex = 0; + pipeline::DataBlockEncodeVisitor visitor{*graphicsPipelinesOutBlock, nullptr, 0}; + visitor.encode(baseInfo, eastl::span{&variantInfo, 1}); + } + + if (auto feeaturesOutBlock = cacheOutBlock.addNewBlock("features")) + { + pipeline::DataBlockEncodeVisitor visitor{*feeaturesOutBlock}; + visitor.encode(pipeline::DeviceCapsAndShaderModelEncoder::EncodingMode::pipelines, + DeviceCapsAndShaderModel::fromDriverDesc(d3d::get_driver_desc())); + } + + char buffer[3 * 1024]{}; + ConstrainedMemSaveCB stringGen{buffer, countof(buffer)}; + cacheOutBlock.saveToTextStreamCompact(stringGen); + logerr("%s", buffer); +} + +void PipelineVariant::errorPrintMeshBlkString(const BasePipeline &base, bool is_wire_frame, + const RenderStateSystem::StaticState &static_state, const FramebufferLayout &fb_layout) +{ + // This is probably not the fastest and compactest way to generate the BLK data for production, but if we end up here, we + // are already in the "slow" path. 
As we are going to pay (probably a lot more) for compiling the pipeline just after this + // reporting. + DataBlock cacheOutBlock; + if (auto renderStateOutBlock = cacheOutBlock.addNewBlock("render_states")) + { + pipeline::DataBlockEncodeVisitor visitor{*renderStateOutBlock}; + visitor.encode(static_state); + } + + if (auto framebufferLayoutOutBlock = cacheOutBlock.addNewBlock("framebuffer_layouts")) + { + pipeline::DataBlockEncodeVisitor visitor{*framebufferLayoutOutBlock}; + visitor.encode(fb_layout); + } + + if (auto meshPipelinesOutBlock = cacheOutBlock.addNewBlock("mesh_pipelines")) + { + auto baseInfo = base.getIdentifier(); + MeshPipelineVariantState variantInfo; + variantInfo.framebufferLayoutIndex = 0; + variantInfo.staticRenderStateIndex = 0; + variantInfo.isWireFrame = is_wire_frame; + pipeline::DataBlockEncodeVisitor visitor{*meshPipelinesOutBlock, nullptr, 0}; + visitor.encode(baseInfo, eastl::span{&variantInfo, 1}); + } + + if (auto feeaturesOutBlock = cacheOutBlock.addNewBlock("features")) + { + pipeline::DataBlockEncodeVisitor visitor{*feeaturesOutBlock}; + visitor.encode(pipeline::DeviceCapsAndShaderModelEncoder::EncodingMode::pipelines, + DeviceCapsAndShaderModel::fromDriverDesc(d3d::get_driver_desc())); + } + + char buffer[3 * 1024]{}; + ConstrainedMemSaveCB stringGen{buffer, countof(buffer)}; + cacheOutBlock.saveToTextStreamCompact(stringGen); + logerr("%s", buffer); +} + void PipelineManager::init(const SetupParameters ¶ms) { D3D12SerializeRootSignature = params.serializeRootSignature; @@ -930,7 +1027,7 @@ struct BasicGraphicsRootSignatureGenerator void setVisibilityPixelShader() { currentVisibility = D3D12_SHADER_VISIBILITY_PIXEL; } void addRootParameterConstantExplicit(uint32_t space, uint32_t index, uint32_t dwords, D3D12_SHADER_VISIBILITY vis) { - G_ASSERT(desc.NumParameters < array_size(params)); + G_ASSERT(desc.NumParameters < countof(params)); auto &target = params[desc.NumParameters++]; target.ParameterType = 
D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS; @@ -963,7 +1060,7 @@ struct BasicGraphicsRootSignatureGenerator { if (!shouldUseConstantBufferRootDescriptors()) { - G_ASSERT(desc.NumParameters < array_size(params)); + G_ASSERT(desc.NumParameters < countof(params)); G_ASSERT(rangeSize > 0); auto &target = params[desc.NumParameters++]; @@ -984,7 +1081,7 @@ struct BasicGraphicsRootSignatureGenerator if (shouldUseConstantBufferRootDescriptors()) { G_UNUSED(linear_index); - G_ASSERT(desc.NumParameters < array_size(params)); + G_ASSERT(desc.NumParameters < countof(params)); auto &target = params[desc.NumParameters++]; target.ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV; @@ -1013,7 +1110,7 @@ struct BasicGraphicsRootSignatureGenerator } void endSamplers() { - G_ASSERT(desc.NumParameters < array_size(params)); + G_ASSERT(desc.NumParameters < countof(params)); G_ASSERT(rangeSize > 0); auto &target = params[desc.NumParameters++]; @@ -1041,7 +1138,7 @@ struct BasicGraphicsRootSignatureGenerator { if (unboundedSamplersRootParam == nullptr) { - G_ASSERT(desc.NumParameters < array_size(params)); + G_ASSERT(desc.NumParameters < countof(params)); signature->def.layout.bindlessSamplersParamIndex = desc.NumParameters++; unboundedSamplersRootParam = ¶ms[signature->def.layout.bindlessSamplersParamIndex]; unboundedSamplersRootParam->ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; @@ -1090,7 +1187,7 @@ struct BasicGraphicsRootSignatureGenerator } void endShaderResourceViews() { - G_ASSERT(desc.NumParameters < array_size(params)); + G_ASSERT(desc.NumParameters < countof(params)); G_ASSERT(rangeSize > 0); auto &target = params[desc.NumParameters++]; @@ -1118,7 +1215,7 @@ struct BasicGraphicsRootSignatureGenerator { if (bindlessSRVRootParam == nullptr) { - G_ASSERT(desc.NumParameters < array_size(params)); + G_ASSERT(desc.NumParameters < countof(params)); signature->def.layout.bindlessShaderResourceViewParamIndex = desc.NumParameters++; bindlessSRVRootParam = 
¶ms[signature->def.layout.bindlessShaderResourceViewParamIndex]; bindlessSRVRootParam->ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; @@ -1165,7 +1262,7 @@ struct BasicGraphicsRootSignatureGenerator } void endUnorderedAccessViews() { - G_ASSERT(desc.NumParameters < array_size(params)); + G_ASSERT(desc.NumParameters < countof(params)); G_ASSERT(rangeSize > 0); auto &target = params[desc.NumParameters++]; @@ -1473,7 +1570,7 @@ ComputePipelineSignature *PipelineManager::getComputePipelineSignature(ID3D12Dev } void rootConstantBuffer(uint32_t space, uint32_t index, uint32_t dwords) { - G_ASSERT(desc.NumParameters < array_size(params)); + G_ASSERT(desc.NumParameters < countof(params)); signature->def.csLayout.rootConstantsParamIndex = desc.NumParameters; @@ -1494,7 +1591,7 @@ ComputePipelineSignature *PipelineManager::getComputePipelineSignature(ID3D12Dev { if (!shouldUseConstantBufferRootDescriptors()) { - G_ASSERT(desc.NumParameters < array_size(params)); + G_ASSERT(desc.NumParameters < countof(params)); G_ASSERT(rangeSize > 0); auto &target = params[desc.NumParameters++]; @@ -1515,7 +1612,7 @@ ComputePipelineSignature *PipelineManager::getComputePipelineSignature(ID3D12Dev if (shouldUseConstantBufferRootDescriptors()) { G_UNUSED(linear_index); - G_ASSERT(desc.NumParameters < array_size(params)); + G_ASSERT(desc.NumParameters < countof(params)); auto &target = params[desc.NumParameters++]; target.ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV; @@ -1539,7 +1636,7 @@ ComputePipelineSignature *PipelineManager::getComputePipelineSignature(ID3D12Dev void beginSamplers() { signature->def.csLayout.samplersParamIndex = desc.NumParameters; } void endSamplers() { - G_ASSERT(desc.NumParameters < array_size(params)); + G_ASSERT(desc.NumParameters < countof(params)); G_ASSERT(rangeSize > 0); auto &target = params[desc.NumParameters++]; @@ -1569,7 +1666,7 @@ ComputePipelineSignature *PipelineManager::getComputePipelineSignature(ID3D12Dev { // compute has only one stage, 
all unbounded samplers should be added within a single begin-end block G_ASSERT(unboundedSamplersRootParam == nullptr); - G_ASSERT(desc.NumParameters < array_size(params)); + G_ASSERT(desc.NumParameters < countof(params)); signature->def.layout.bindlessSamplersParamIndex = desc.NumParameters++; unboundedSamplersRootParam = ¶ms[signature->def.layout.bindlessSamplersParamIndex]; unboundedSamplersRootParam->ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; @@ -1597,7 +1694,7 @@ ComputePipelineSignature *PipelineManager::getComputePipelineSignature(ID3D12Dev void beginShaderResourceViews() { signature->def.csLayout.shaderResourceViewParamIndex = desc.NumParameters; } void endShaderResourceViews() { - G_ASSERT(desc.NumParameters < array_size(params)); + G_ASSERT(desc.NumParameters < countof(params)); G_ASSERT(rangeSize > 0); auto &target = params[desc.NumParameters++]; @@ -1624,7 +1721,7 @@ ComputePipelineSignature *PipelineManager::getComputePipelineSignature(ID3D12Dev } void beginBindlessShaderResourceViews() { - G_ASSERT(desc.NumParameters < array_size(params)); + G_ASSERT(desc.NumParameters < countof(params)); signature->def.layout.bindlessShaderResourceViewParamIndex = desc.NumParameters++; bindlessSRVRootParam = ¶ms[signature->def.layout.bindlessShaderResourceViewParamIndex]; bindlessSRVRootParam->ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; @@ -1652,7 +1749,7 @@ ComputePipelineSignature *PipelineManager::getComputePipelineSignature(ID3D12Dev void beginUnorderedAccessViews() { signature->def.csLayout.unorderedAccessViewParamIndex = desc.NumParameters; } void endUnorderedAccessViews() { - G_ASSERT(desc.NumParameters < array_size(params)); + G_ASSERT(desc.NumParameters < countof(params)); G_ASSERT(rangeSize > 0); auto &target = params[desc.NumParameters++]; @@ -1734,7 +1831,7 @@ BasePipeline *PipelineManager::getGraphics(GraphicsProgramID program) } void PipelineManager::addCompute(ID3D12Device2 *device, PipelineCache &cache, ProgramID id, 
ComputeShaderModule shader, - RecoverablePipelineCompileBehavior on_error, bool give_name) + RecoverablePipelineCompileBehavior on_error, bool give_name, CSPreloaded preloaded) { G_ASSERTF(id.isCompute(), "addCompute called for a non compute program!"); uint32_t index = id.getIndex(); @@ -1751,6 +1848,33 @@ void PipelineManager::addCompute(ID3D12Device2 *device, PipelineCache &cache, Pr #endif if (auto signature = getComputePipelineSignature(device, cache, hasAccelerationStructure, shader.header.resourceUsageTable)) { + if (validateInGameSpikes && preloaded == CSPreloaded::Yes) + { + logerr("Pipeline creation during game! Patch for cache will be updated. Share game/cache/dx12_cache.blk file with graphics " + "programmers."); + needToUpdateCache = true; + + DataBlock cacheOutBlock; + if (auto computePipelinesOutBlock = cacheOutBlock.addNewBlock("compute_pipelines")) + { + pipeline::DataBlockEncodeVisitor visitor{*computePipelinesOutBlock, nullptr, 0}; + ComputePipelineIdentifier pipeline; + pipeline.hash = shader.ident.shaderHash; + visitor.encode(pipeline); + } + + if (auto feeaturesOutBlock = cacheOutBlock.addNewBlock("features")) + { + pipeline::DataBlockEncodeVisitor visitor{*feeaturesOutBlock}; + visitor.encode(pipeline::DeviceCapsAndShaderModelEncoder::EncodingMode::pipelines, + DeviceCapsAndShaderModel::fromDriverDesc(d3d::get_driver_desc())); + } + + char buffer[2 * 1024]{}; + ConstrainedMemSaveCB stringGen{buffer, countof(buffer)}; + cacheOutBlock.saveToTextStreamCompact(stringGen); + logerr("%s", buffer); + } pipelineGroup[index] = eastl::make_unique(*signature, eastl::move(shader), device, cache, on_error, give_name); } } @@ -1785,6 +1909,12 @@ void PipelineManager::addGraphics(ID3D12Device2 *device, PipelineCache &cache, F auto pixelShader = getPixelShader(ps); target = createGraphics(device, cache, fbs, vertexShader, pixelShader, on_error, give_name); + if (validateInGameSpikes && target) + { + logerr("Pipeline creation during game! 
Patch for cache will be updated. Share game/cache/dx12_cache.blk file with graphics " + "programmers."); + needToUpdateCache = true; + } } if (target) { @@ -2278,7 +2408,7 @@ template uint32_t maching_object_mask(const T &obj, U &container, V extractor) { uint32_t mask = 0; - for (uint32_t i = 0; i < array_size(container); ++i) + for (uint32_t i = 0; i < countof(container); ++i) { mask |= ((extractor(container[i]) == obj) ? 1u : 0u) << i; } diff --git a/prog/engine/drv/drv3d_DX12/pipeline.h b/prog/engine/drv/drv3d_DX12/pipeline.h index ee23c7f62..e24fb32f2 100644 --- a/prog/engine/drv/drv3d_DX12/pipeline.h +++ b/prog/engine/drv/drv3d_DX12/pipeline.h @@ -1,8 +1,24 @@ #pragma once #include +#include +#include #include #include +#include +#include + +#include "d3d12_error_handling.h" +#include "d3d12_debug_names.h" +#include "host_device_shared_memory_region.h" +#include "shader_program_id.h" +#include "pipeline_cache.h" +#include "d3d12_utils.h" +#include "render_state.h" +#include "descriptor_heap.h" +#include "image_view_state.h" +#include "tagged_handles.h" + #if !_TARGET_XBOXONE using CD3DX12_PIPELINE_STATE_STREAM_AS = @@ -16,7 +32,8 @@ namespace drv3d_dx12 namespace backend { class BindlessSetManager; -} +class ShaderModuleManager; +} // namespace backend // Wraps the global id of a buffer object, this makes it easier to access // the index and the buffer property bits its carrying. 
class BufferGlobalId @@ -73,11 +90,11 @@ class BufferGlobalId struct BufferState { ID3D12Resource *buffer = nullptr; - uint32_t size = 0; + uint64_t size = 0; uint32_t discardCount = 0; uint32_t currentDiscardIndex = 0; uint32_t fistDiscardFrame = 0; - uint32_t offset = 0; + uint64_t offset = 0; BufferGlobalId resourceId; eastl::unique_ptr srvs; eastl::unique_ptr uavs; @@ -93,11 +110,11 @@ struct BufferState uint8_t *cpuPointer; }; #endif - uint32_t totalSize() const { return discardCount * size; } + uint64_t totalSize() const { return discardCount * size; } - ValueRange usageRange() const { return make_value_range(offset, totalSize()); } + ValueRange usageRange() const { return make_value_range(offset, totalSize()); } - uint32_t currentOffset() const { return offset + currentDiscardIndex * size; } + uint64_t currentOffset() const { return offset + currentDiscardIndex * size; } D3D12_GPU_VIRTUAL_ADDRESS currentGPUPointer() const { return gpuPointer + currentDiscardIndex * size; } @@ -140,9 +157,9 @@ struct BufferState struct BufferReference { ID3D12Resource *buffer = nullptr; - uint32_t offset = 0; // only use this in conjunction with the buffer object, all pointers are + uint64_t offset = 0; // only use this in conjunction with the buffer object, all pointers are // already adjusted! 
- uint32_t size = 0; + uint64_t size = 0; BufferGlobalId resourceId; D3D12_CPU_DESCRIPTOR_HANDLE srv = {0}; D3D12_CPU_DESCRIPTOR_HANDLE uav = {0}; @@ -218,21 +235,21 @@ struct BufferResourceReference struct BufferResourceReferenceAndOffset : BufferResourceReference { - uint32_t offset = 0; + uint64_t offset = 0; BufferResourceReferenceAndOffset() = default; BufferResourceReferenceAndOffset(const BufferReference &ref) : BufferResourceReference{ref}, offset{ref.offset} {} BufferResourceReferenceAndOffset(const BufferState &buf) : BufferResourceReference{buf}, offset{BufferReference{buf}.offset} {} - BufferResourceReferenceAndOffset(const BufferReference &ref, uint32_t ofs) : BufferResourceReference{ref}, offset{ref.offset + ofs} + BufferResourceReferenceAndOffset(const BufferReference &ref, uint64_t ofs) : BufferResourceReference{ref}, offset{ref.offset + ofs} {} - BufferResourceReferenceAndOffset(const BufferState &buf, uint32_t ofs) : + BufferResourceReferenceAndOffset(const BufferState &buf, uint64_t ofs) : BufferResourceReference{buf}, offset{BufferReference{buf}.offset + ofs} {} BufferResourceReferenceAndOffset(const HostDeviceSharedMemoryRegion &buf) : - BufferResourceReference{buf}, offset{uint32_t(buf.range.front())} + BufferResourceReference{buf}, offset{uint64_t(buf.range.front())} {} - BufferResourceReferenceAndOffset(const HostDeviceSharedMemoryRegion &buf, uint32_t ofs) : - BufferResourceReference{buf}, offset{uint32_t(buf.range.front() + ofs)} + BufferResourceReferenceAndOffset(const HostDeviceSharedMemoryRegion &buf, uint64_t ofs) : + BufferResourceReference{buf}, offset{uint64_t(buf.range.front() + ofs)} {} bool operator==(const BufferResourceReferenceAndOffset &other) const @@ -245,30 +262,30 @@ struct BufferResourceReferenceAndOffset : BufferResourceReference struct BufferResourceReferenceAndRange : BufferResourceReferenceAndOffset { - uint32_t size = 0; + uint64_t size = 0; BufferResourceReferenceAndRange() = default; 
BufferResourceReferenceAndRange(const BufferReference &ref) : BufferResourceReferenceAndOffset{ref}, size{ref.size} {} BufferResourceReferenceAndRange(const BufferState &buf) : BufferResourceReferenceAndOffset{buf}, size{buf.size} {} - BufferResourceReferenceAndRange(const BufferReference &ref, uint32_t offset) : + BufferResourceReferenceAndRange(const BufferReference &ref, uint64_t offset) : BufferResourceReferenceAndOffset(ref, offset), size{ref.size - offset} {} - BufferResourceReferenceAndRange(const BufferState &buf, uint32_t offset) : + BufferResourceReferenceAndRange(const BufferState &buf, uint64_t offset) : BufferResourceReferenceAndOffset{buf, offset}, size{buf.size - offset} {} - BufferResourceReferenceAndRange(const BufferReference &ref, uint32_t offset, uint32_t sz) : + BufferResourceReferenceAndRange(const BufferReference &ref, uint64_t offset, uint64_t sz) : BufferResourceReferenceAndOffset{ref, offset}, size{sz} {} - BufferResourceReferenceAndRange(const BufferState &buf, uint32_t offset, uint32_t sz) : + BufferResourceReferenceAndRange(const BufferState &buf, uint64_t offset, uint64_t sz) : BufferResourceReferenceAndOffset{buf, offset}, size{sz} {} BufferResourceReferenceAndRange(const HostDeviceSharedMemoryRegion &buf) : - BufferResourceReferenceAndOffset{buf}, size{uint32_t(buf.range.size())} + BufferResourceReferenceAndOffset{buf}, size{buf.range.size()} {} - BufferResourceReferenceAndRange(const HostDeviceSharedMemoryRegion &buf, uint32_t ofs) : - BufferResourceReferenceAndOffset{buf, ofs}, size{uint32_t(buf.range.size() - ofs)} + BufferResourceReferenceAndRange(const HostDeviceSharedMemoryRegion &buf, uint64_t ofs) : + BufferResourceReferenceAndOffset{buf, ofs}, size{buf.range.size() - ofs} {} - BufferResourceReferenceAndRange(const HostDeviceSharedMemoryRegion &buf, uint32_t ofs, uint32_t sz) : + BufferResourceReferenceAndRange(const HostDeviceSharedMemoryRegion &buf, uint64_t ofs, uint64_t sz) : BufferResourceReferenceAndOffset{buf, ofs}, 
size{sz} {} @@ -288,15 +305,15 @@ struct BufferResourceReferenceAndAddress : BufferResourceReference BufferResourceReferenceAndAddress(const BufferReference &ref) : BufferResourceReference{ref}, gpuPointer{ref.gpuPointer} {} BufferResourceReferenceAndAddress(const BufferState &buf) : BufferResourceReference{buf}, gpuPointer{BufferReference{buf}.gpuPointer} {} - BufferResourceReferenceAndAddress(const BufferReference &ref, uint32_t offset) : + BufferResourceReferenceAndAddress(const BufferReference &ref, uint64_t offset) : BufferResourceReference{ref}, gpuPointer{ref.gpuPointer + offset} {} - BufferResourceReferenceAndAddress(const BufferState &buf, uint32_t offset) : + BufferResourceReferenceAndAddress(const BufferState &buf, uint64_t offset) : BufferResourceReference{buf}, gpuPointer{BufferReference{buf}.gpuPointer + offset} {} BufferResourceReferenceAndAddress(const HostDeviceSharedMemoryRegion &buf) : BufferResourceReference{buf}, gpuPointer{buf.gpuPointer} {} - BufferResourceReferenceAndAddress(const HostDeviceSharedMemoryRegion &buf, uint32_t offset) : + BufferResourceReferenceAndAddress(const HostDeviceSharedMemoryRegion &buf, uint64_t offset) : BufferResourceReference{buf}, gpuPointer{buf.gpuPointer + offset} {} @@ -315,31 +332,31 @@ struct BufferResourceReferenceAndAddress : BufferResourceReference struct BufferResourceReferenceAndAddressRange : BufferResourceReferenceAndAddress { - uint32_t size = 0; + uint64_t size = 0; BufferResourceReferenceAndAddressRange() = default; BufferResourceReferenceAndAddressRange(const BufferReference &ref) : BufferResourceReferenceAndAddress{ref}, size{ref.size} {} BufferResourceReferenceAndAddressRange(const BufferState &buf) : BufferResourceReferenceAndAddress{buf}, size{buf.size} {} - BufferResourceReferenceAndAddressRange(const BufferReference &ref, uint32_t offset) : + BufferResourceReferenceAndAddressRange(const BufferReference &ref, uint64_t offset) : BufferResourceReferenceAndAddress(ref, offset), size{ref.size - 
offset} {} - BufferResourceReferenceAndAddressRange(const BufferState &buf, uint32_t offset) : + BufferResourceReferenceAndAddressRange(const BufferState &buf, uint64_t offset) : BufferResourceReferenceAndAddress{buf, offset}, size{buf.size - offset} {} - BufferResourceReferenceAndAddressRange(const BufferReference &ref, uint32_t offset, uint32_t sz) : - BufferResourceReferenceAndAddress{ref, offset}, size{sz ? sz : uint32_t(ref.size - offset)} + BufferResourceReferenceAndAddressRange(const BufferReference &ref, uint64_t offset, uint64_t sz) : + BufferResourceReferenceAndAddress{ref, offset}, size{sz ? sz : uint64_t(ref.size - offset)} {} - BufferResourceReferenceAndAddressRange(const BufferState &buf, uint32_t offset, uint32_t sz) : - BufferResourceReferenceAndAddress{buf, offset}, size{sz ? sz : uint32_t(buf.size - offset)} + BufferResourceReferenceAndAddressRange(const BufferState &buf, uint64_t offset, uint64_t sz) : + BufferResourceReferenceAndAddress{buf, offset}, size{sz ? sz : uint64_t(buf.size - offset)} {} BufferResourceReferenceAndAddressRange(const HostDeviceSharedMemoryRegion &buf) : - BufferResourceReferenceAndAddress{buf}, size{uint32_t(buf.range.size())} + BufferResourceReferenceAndAddress{buf}, size{uint64_t(buf.range.size())} {} - BufferResourceReferenceAndAddressRange(const HostDeviceSharedMemoryRegion &buf, uint32_t offset) : - BufferResourceReferenceAndAddress{buf, offset}, size{uint32_t(buf.range.size() - offset)} + BufferResourceReferenceAndAddressRange(const HostDeviceSharedMemoryRegion &buf, uint64_t offset) : + BufferResourceReferenceAndAddress{buf, offset}, size{uint64_t(buf.range.size() - offset)} {} - BufferResourceReferenceAndAddressRange(const HostDeviceSharedMemoryRegion &buf, uint32_t offset, uint32_t sz) : - BufferResourceReferenceAndAddress{buf, offset}, size{sz ? 
sz : uint32_t(buf.range.size() - offset)} + BufferResourceReferenceAndAddressRange(const HostDeviceSharedMemoryRegion &buf, uint64_t offset, uint64_t sz) : + BufferResourceReferenceAndAddress{buf, offset}, size{sz ? sz : uint64_t(buf.range.size() - offset)} {} bool operator==(const BufferResourceReferenceAndAddressRange &other) const @@ -699,6 +716,11 @@ class PipelineVariant bool is_wire_frame, const RenderStateSystem::StaticState &static_state, const FramebufferLayout &fb_layout, RecoverablePipelineCompileBehavior on_error, bool give_name); + void errorPrintBlkString(const BasePipeline &base, const InputLayout &input_layout, bool is_wire_frame, + const RenderStateSystem::StaticState &static_state, const FramebufferLayout &fb_layout, D3D12_PRIMITIVE_TOPOLOGY_TYPE top); + void errorPrintMeshBlkString(const BasePipeline &base, bool is_wire_frame, const RenderStateSystem::StaticState &static_state, + const FramebufferLayout &fb_layout); + bool isReady() const { return nullptr != pipeline.Get(); } ID3D12PipelineState *get() const { return pipeline.Get(); } @@ -1044,6 +1066,14 @@ class BasePipeline variant.swapStaticRenderStateID(a, b); } } + + BasePipelineIdentifier getIdentifier() const + { + BasePipelineIdentifier ident; + ident.vs = vsModule.header.hash; + ident.ps = psModule.header.hash; + return ident; + } }; class ComputePipeline @@ -1092,9 +1122,11 @@ class ComputePipeline // pipeline yet. 
if (from_cache_only && !cacheTarget.CachedBlobSizeInBytes) { +#if DX12_REPORT_PIPELINE_CREATE_TIMING // turns off reporting of the profile, we don't want to know how long it took to _not_ // load a pipeline funcProfiler.fmt = nullptr; +#endif return true; } #else @@ -1120,9 +1152,11 @@ class ComputePipeline // with cache only, we stop here if (from_cache_only) { +#if DX12_REPORT_PIPELINE_CREATE_TIMING // turns off reporting of the profile, we don't want to know how long it took to _not_ // load a pipeline funcProfiler.fmt = nullptr; +#endif return true; } @@ -1304,7 +1338,7 @@ class PipelineManager : public backend::ShaderModuleManager, ComputePipeline *getCompute(ProgramID program); BasePipeline *getGraphics(GraphicsProgramID program); void addCompute(ID3D12Device2 *device, PipelineCache &cache, ProgramID id, ComputeShaderModule shader, - RecoverablePipelineCompileBehavior on_error, bool give_name); + RecoverablePipelineCompileBehavior on_error, bool give_name, CSPreloaded preloaded); void addGraphics(ID3D12Device2 *device, PipelineCache &cache, FramebufferLayoutManager &fbs, GraphicsProgramID program, ShaderID vs, ShaderID ps, RecoverablePipelineCompileBehavior on_error, bool give_name); @@ -1578,7 +1612,7 @@ class PipelineManager : public backend::ShaderModuleManager, pixelShaderComputeProgramIDMap[group].clear(); } void loadComputeShaderFromDump(ID3D12Device2 *device, PipelineCache &cache, ProgramID program, - RecoverablePipelineCompileBehavior on_error, bool give_name) + RecoverablePipelineCompileBehavior on_error, bool give_name, CSPreloaded preloaded) { auto shaderCompressionIndex = computeProgramIndexToDumpShaderIndex[program.getGroup()][program.getIndex()]; auto byteCode = @@ -1596,7 +1630,7 @@ class PipelineManager : public backend::ShaderModuleManager, } } - addCompute(device, cache, program, eastl::move(basicModule), on_error, give_name); + addCompute(device, cache, program, eastl::move(basicModule), on_error, give_name, preloaded); } BasePipeline 
*findLoadedPipeline(const backend::VertexShaderModuleRefStore &vs, const backend::PixelShaderModuleRefStore &ps) @@ -1627,7 +1661,7 @@ class PipelineManager : public backend::ShaderModuleManager, debug("DX12: precomiling graphics pipeline..."); char hashString[1 + 2 * sizeof(dxil::HashValue)]; - pipeline.base.vs.convertToString(hashString, array_size(hashString)); + pipeline.base.vs.convertToString(hashString, countof(hashString)); debug("DX12: Looking for VS %s...", hashString); auto vsID = findVertexShader(pipeline.base.vs); if (ShaderID::Null() == vsID) @@ -1635,7 +1669,7 @@ class PipelineManager : public backend::ShaderModuleManager, debug("DX12: ...shader not found"); continue; } - pipeline.base.ps.convertToString(hashString, array_size(hashString)); + pipeline.base.ps.convertToString(hashString, countof(hashString)); debug("DX12: Looking for PS %s...", hashString); auto psID = findPixelShader(pipeline.base.ps); if (ShaderID::Null() == psID) @@ -1716,7 +1750,7 @@ class PipelineManager : public backend::ShaderModuleManager, debug("DX12: precomiling mesh pipeline..."); char hashString[1 + 2 * sizeof(dxil::HashValue)]; - pipeline.base.vs.convertToString(hashString, array_size(hashString)); + pipeline.base.vs.convertToString(hashString, countof(hashString)); debug("DX12: Looking for VS %s...", hashString); auto vsID = findVertexShader(pipeline.base.vs); if (ShaderID::Null() == vsID) @@ -1724,7 +1758,7 @@ class PipelineManager : public backend::ShaderModuleManager, debug("DX12: ...shader not found"); continue; } - pipeline.base.ps.convertToString(hashString, array_size(hashString)); + pipeline.base.ps.convertToString(hashString, countof(hashString)); debug("DX12: Looking for PS %s...", hashString); auto psID = findPixelShader(pipeline.base.ps); if (ShaderID::Null() == psID) @@ -1801,7 +1835,7 @@ class PipelineManager : public backend::ShaderModuleManager, { debug("DX12: precomiling compute pipeline..."); char hashString[1 + 2 * sizeof(dxil::HashValue)]; - 
pipeline.base.hash.convertToString(hashString, array_size(hashString)); + pipeline.base.hash.convertToString(hashString, countof(hashString)); debug("DX12: Looking for CS %s...", hashString); bool found = false; enumerateShaderFromHash(pipeline.base.hash, [device, &pipeline_cache, &found, this](auto gi, auto si, auto vs_count) { @@ -1833,7 +1867,9 @@ class PipelineManager : public backend::ShaderModuleManager, return false; } debug("DX12: ...loading..."); - loadComputeShaderFromDump(device, pipeline_cache, progId, RecoverablePipelineCompileBehavior::REPORT_ERROR, true); + // CSPreloaded::No as we doing the preload right now + loadComputeShaderFromDump(device, pipeline_cache, progId, RecoverablePipelineCompileBehavior::REPORT_ERROR, true, + CSPreloaded::No); found = true; return false; }); @@ -1857,7 +1893,10 @@ class PipelineManager : public backend::ShaderModuleManager, graphics_pipelines); compileMeshPipelineSet(device, pipeline_cache, fbs, static_render_states, framebuffer_layouts, mesh_pipelines); compileComputePipelineSet(device, pipeline_cache, compute_pipelines); + validateInGameSpikes = dgs_get_settings()->getBlockByNameEx("dx12")->getBool("validateInGameSpikes", false); } + bool validateInGameSpikes = false; + bool needToUpdateCache = false; }; diff --git a/prog/engine/drv/drv3d_DX12/pipeline/blk_cache.cpp b/prog/engine/drv/drv3d_DX12/pipeline/blk_cache.cpp index bef58e5ca..5b00ab532 100644 --- a/prog/engine/drv/drv3d_DX12/pipeline/blk_cache.cpp +++ b/prog/engine/drv/drv3d_DX12/pipeline/blk_cache.cpp @@ -380,52 +380,102 @@ bool drv3d_dx12::pipeline::FramebufferLayoutDeEncoder::encode(DataBlock &blk, co return true; } -bool drv3d_dx12::pipeline::DeviceCapsAndShaderModelDeEncoder::decode(const DataBlock &blk, DeviceCapsAndShaderModel &target) const +bool drv3d_dx12::pipeline::DeviceCapsAndShaderModelDeEncoder::decode(const DataBlock &blk, EncodingMode mode, + DeviceCapsAndShaderModel &target) const { target.shaderModel.major = blk.getInt("shaderModelMajor", 
target.shaderModel.major); target.shaderModel.major = blk.getInt("shaderModelMinor", target.shaderModel.minor); + #define DX12_D3D_CAP(name) target.caps.name = blk.getBool(#name, target.caps.name); - DX12_D3D_CAP_SET + if (EncodingMode::full == mode) + { + DX12_D3D_CAP_SET + } + else if (EncodingMode::pipelines == mode) + { + DX12_D3D_CAP_SET_RELEVANT_FOR_PIPELINES + } #undef DX12_D3D_CAP return true; } -bool drv3d_dx12::pipeline::DeviceCapsAndShaderModelDeEncoder::encode(DataBlock &blk, const DeviceCapsAndShaderModel &source) const +bool drv3d_dx12::pipeline::DeviceCapsAndShaderModelDeEncoder::encode(DataBlock &blk, EncodingMode mode, + const DeviceCapsAndShaderModel &source) const { blk.setInt("shaderModelMajor", source.shaderModel.major); blk.setInt("shaderModelMinor", source.shaderModel.minor); #define DX12_D3D_CAP(name) blk.setBool(#name, source.caps.name); - DX12_D3D_CAP_SET + if (EncodingMode::full == mode) + { + DX12_D3D_CAP_SET + } + else if (EncodingMode::pipelines == mode) + { + DX12_D3D_CAP_SET_RELEVANT_FOR_PIPELINES + } #undef DX12_D3D_CAP return true; } -bool drv3d_dx12::pipeline::FeatureSupportResolver::decode(const DataBlock &blk, bool &target) const +bool drv3d_dx12::pipeline::FeatureSupportResolver::decode(const DataBlock &blk, CompatibilityMode mode, bool &target) const { DeviceCapsAndShaderModel compare{}; - if (!this->DeviceCapsAndShaderModelDeEncoder::decode(blk, compare)) + if (!this->DeviceCapsAndShaderModelDeEncoder::decode(blk, mode, compare)) { return false; } - target = compare.isCompatibleTo(features); + if (CompatibilityMode::full == mode) + { + target = compare.isCompatibleTo(features); + } + else if (CompatibilityMode::pipelines == mode) + { + target = compare.isPipelineCompatibleTo(features); + } + else + { + target = false; + } return true; } -bool drv3d_dx12::pipeline::GraphicsPipelineVariantDeEncoder::decode(const DataBlock &blk, GraphicsPipelineVariantState &target) const +bool 
drv3d_dx12::pipeline::FeatureSetChecker::checkFeatureSets(const DataBlock &blk) const { - if (blk.paramExists("featureSet") && featureSet) + if (!featureSet) { - auto fi = blk.getInt("featureSet", 0); - // when index is out of range, we assume unsupported - if (fi > featureSetCount) - { - return false; - } - if (!featureSet[fi]) + return true; + } + + auto fiNameId = blk.getNameId("featureSet"); + if (-1 == fiNameId) + { + return true; + } + int lastParamIndex = blk.findParam(fiNameId); + if (-1 == lastParamIndex) + { + return true; + } + + do + { + auto fi = blk.getInt(lastParamIndex); + if ((fi < featureSetCount) && featureSet[fi]) { - return false; + return true; } + lastParamIndex = blk.findParam(fiNameId, lastParamIndex); + } while (-1 != lastParamIndex); + return false; +} + +bool drv3d_dx12::pipeline::GraphicsPipelineVariantDeEncoder::decode(const DataBlock &blk, GraphicsPipelineVariantState &target) const +{ + if (!checkFeatureSets(blk)) + { + return false; } + if (!blk.paramExists("renderState") || !blk.paramExists("outputFormat") || !blk.paramExists("inputLayout") || !blk.paramExists("primitiveTopology")) { @@ -451,19 +501,11 @@ bool drv3d_dx12::pipeline::GraphicsPipelineVariantDeEncoder::encode(DataBlock &b bool drv3d_dx12::pipeline::MeshPipelineVariantDeEncoder::decode(const DataBlock &blk, MeshPipelineVariantState &target) const { - if (blk.paramExists("featureSet") && featureSet) + if (!checkFeatureSets(blk)) { - auto fi = blk.getInt("featureSet", 0); - // when index is out of range, we assume unsupported - if (fi > featureSetCount) - { - return false; - } - if (!featureSet[fi]) - { - return false; - } + return false; } + if (!blk.paramExists("renderState") || !blk.paramExists("outputFormat")) { return false; @@ -484,19 +526,11 @@ bool drv3d_dx12::pipeline::MeshPipelineVariantDeEncoder::encode(DataBlock &blk, bool drv3d_dx12::pipeline::ComputePipelineDeEncoder::decode(const DataBlock &blk, ComputePipelineIdentifier &target) const { - if 
(blk.paramExists("featureSet") && featureSet) + if (!checkFeatureSets(blk)) { - auto fi = blk.getInt("featureSet", 0); - // when index is out of range, we assume unsupported - if (fi > featureSetCount) - { - return false; - } - if (!featureSet[fi]) - { - return false; - } + return false; } + auto hash = blk.getStr("hash", nullptr); if (!hash) { diff --git a/prog/engine/drv/drv3d_DX12/pipeline/blk_cache.h b/prog/engine/drv/drv3d_DX12/pipeline/blk_cache.h index 5d7610d53..e2b91a0a8 100644 --- a/prog/engine/drv/drv3d_DX12/pipeline/blk_cache.h +++ b/prog/engine/drv/drv3d_DX12/pipeline/blk_cache.h @@ -1,5 +1,11 @@ #pragma once +#include + +#include "derived_span.h" +#include "pipeline_cache.h" + + namespace drv3d_dx12 { struct GraphicsPipelineVariantSet @@ -201,8 +207,14 @@ class DeviceCapsAndShaderModelDeEncoder : public DefaltInvokeDecoder +class FeatureSetChecker { bool *featureSet = nullptr; uint32_t featureSetCount = 0; +public: + FeatureSetChecker() = default; + FeatureSetChecker(bool *feature_set, uint32_t feature_set_count) : featureSet{feature_set}, featureSetCount{feature_set_count} {} + + // will report true when either no feature set entry is present or a feature set is present and supported + bool checkFeatureSets(const DataBlock &blk) const; +}; + +class GraphicsPipelineVariantDeEncoder : public EncoderBlockNameStore, protected FeatureSetChecker +{ public: static inline const char *blockFormat = "v_%u"; - GraphicsPipelineVariantDeEncoder(bool *feature_set, uint32_t feature_set_count) : - featureSet{feature_set}, featureSetCount{feature_set_count} + GraphicsPipelineVariantDeEncoder(bool *feature_set, uint32_t feature_set_count) : FeatureSetChecker{feature_set, feature_set_count} {} template bool invoke(const DataBlock &blk, T target) const @@ -317,18 +340,14 @@ using GraphicsPipelineDecoder = GraphicsPipelineDeEncoder; using GraphicsPipelineEncoder = GraphicsPipelineDeEncoder; class MeshPipelineVariantDeEncoder : public DefaltInvokeDecoder, - public 
EncoderBlockNameStore + public EncoderBlockNameStore, + protected FeatureSetChecker { - bool *featureSet = nullptr; - uint32_t featureSetCount = 0; - public: static inline const char *blockFormat = "v_%u"; MeshPipelineVariantDeEncoder() = default; - MeshPipelineVariantDeEncoder(bool *feature_set, uint32_t feature_set_count) : - featureSet{feature_set}, featureSetCount{feature_set_count} - {} + MeshPipelineVariantDeEncoder(bool *feature_set, uint32_t feature_set_count) : FeatureSetChecker{feature_set, feature_set_count} {} bool decode(const DataBlock &blk, MeshPipelineVariantState &target) const; bool encode(DataBlock &blk, const MeshPipelineVariantState &source) const; }; @@ -401,15 +420,15 @@ using MeshPipelineDecoder = MeshPipelineDeEncoder; using MeshPipelineEncoder = MeshPipelineDeEncoder; class ComputePipelineDeEncoder : public DefaltInvokeDecoder, - public EncoderBlockNameStore + public EncoderBlockNameStore, + protected FeatureSetChecker { bool *featureSet = nullptr; uint32_t featureSetCount = 0; public: static inline const char *blockFormat = "cp_%u"; - ComputePipelineDeEncoder(bool *feature_set, uint32_t feature_set_count) : featureSet{feature_set}, featureSetCount{feature_set_count} - {} + ComputePipelineDeEncoder(bool *feature_set, uint32_t feature_set_count) : FeatureSetChecker{feature_set, feature_set_count} {} bool decode(const DataBlock &blk, ComputePipelineIdentifier &target) const; bool encode(DataBlock &blk, const ComputePipelineIdentifier &target) const; }; diff --git a/prog/engine/drv/drv3d_DX12/pipeline_cache.cpp b/prog/engine/drv/drv3d_DX12/pipeline_cache.cpp index d7c80b827..66e34a34f 100644 --- a/prog/engine/drv/drv3d_DX12/pipeline_cache.cpp +++ b/prog/engine/drv/drv3d_DX12/pipeline_cache.cpp @@ -6,6 +6,9 @@ #include #include +#include "const_register_type.h" + + using namespace drv3d_dx12; #define DX12_ENABLE_CACHE_COMPRESSION 1 @@ -205,7 +208,7 @@ void PipelineCache::shutdown(const ShutdownParameters ¶ms) if (auto feeaturesOutBlock = 
cacheOutBlock.addNewBlock("features")) { pipeline::DataBlockEncodeVisitor visitor{*feeaturesOutBlock}; - visitor.encode(params.features); + visitor.encode(pipeline::DeviceCapsAndShaderModelEncoder::EncodingMode::pipelines, params.features); } cacheOutBlock.saveToTextFile("cache/dx12_cache.blk"); diff --git a/prog/engine/drv/drv3d_DX12/pipeline_cache.h b/prog/engine/drv/drv3d_DX12/pipeline_cache.h index ae4a27c0e..0ca0467d7 100644 --- a/prog/engine/drv/drv3d_DX12/pipeline_cache.h +++ b/prog/engine/drv/drv3d_DX12/pipeline_cache.h @@ -1,8 +1,17 @@ #pragma once +#include +#include +#include + #include "render_state.h" +#include "device_caps_and_shader_model.h" +#include "format_store.h" +#include "constants.h" +#include "shader.h" + -static const char CACHE_FILE_NAME[] = "cache/dx12.cache"; +inline const char CACHE_FILE_NAME[] = "cache/dx12.cache"; namespace drv3d_dx12 { diff --git a/prog/engine/drv/drv3d_DX12/platform.h b/prog/engine/drv/drv3d_DX12/platform.h index a75edfe61..dd5fa10cd 100644 --- a/prog/engine/drv/drv3d_DX12/platform.h +++ b/prog/engine/drv/drv3d_DX12/platform.h @@ -1,5 +1,12 @@ #pragma once +#include +#include + +#include "driver.h" +#include "winapi_helpers.h" + + namespace drv3d_dx12 { bool is_hdr_available(const ComPtr &output = {}); diff --git a/prog/engine/drv/drv3d_DX12/query_manager.h b/prog/engine/drv/drv3d_DX12/query_manager.h index f68147d66..66fc4af07 100644 --- a/prog/engine/drv/drv3d_DX12/query_manager.h +++ b/prog/engine/drv/drv3d_DX12/query_manager.h @@ -1,8 +1,14 @@ #pragma once +#include +#include + +#include "driver.h" +#include "pipeline.h" + namespace drv3d_dx12 { -static constexpr uint32_t heap_size = D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT / sizeof(uint64_t); +inline constexpr uint32_t heap_size = D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT / sizeof(uint64_t); class Device; class Query { diff --git a/prog/engine/drv/drv3d_DX12/render_state.h b/prog/engine/drv/drv3d_DX12/render_state.h index cf50b7cfa..33adccfe0 100644 --- 
a/prog/engine/drv/drv3d_DX12/render_state.h +++ b/prog/engine/drv/drv3d_DX12/render_state.h @@ -1,6 +1,18 @@ #pragma once +#include #include +#include +#include +#include +#include <3d/dag_renderStates.h> +#include <3d/dag_drv3d.h> + +#include "driver.h" +#include "bitfield.h" +#include "tagged_handles.h" +#include "dynamic_array.h" + #define MINIMUM_REPRESENTABLE_D32 3e-10 #define MINIMUM_REPRESENTABLE_D24 33e-8 @@ -8,6 +20,9 @@ namespace drv3d_dx12 { + +class DeviceContext; + BEGIN_BITFIELD_TYPE(PipelineOptionalDynamicStateMask, uint8_t) ADD_BITFIELD_MEMBER(hasDepthBoundsTest, 0, 1) ADD_BITFIELD_MEMBER(hasStencilTest, 1, 1) @@ -570,4 +585,4 @@ class RenderStateSystem eastl::vector staticStateTable; eastl::vector> publicStateTable; }; -} // namespace drv3d_dx12 \ No newline at end of file +} // namespace drv3d_dx12 diff --git a/prog/engine/drv/drv3d_DX12/render_target_mask_util.h b/prog/engine/drv/drv3d_DX12/render_target_mask_util.h new file mode 100644 index 000000000..895b30417 --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/render_target_mask_util.h @@ -0,0 +1,63 @@ +#pragma once + + +// Input is 4x8 bits color channel mask and output will be a 8bit mask of render targets +inline uint32_t color_channel_mask_to_render_target_mask(uint32_t mask) +{ + // For each color chanel generate the used bit + const uint32_t channel0 = mask >> 0; + const uint32_t channel1 = mask >> 1; + const uint32_t channel2 = mask >> 2; + const uint32_t channel3 = mask >> 3; + // At this point the lower bit of each 4 bit block is now indicating if a target is used or not + const uint32_t channelsToSpacedTargetMask = channel0 | channel1 | channel2 | channel3; + // This erases the top 3 bits of each 4 bit block to compress from 4x8 bits to 8 bits. 
+ const uint32_t t0 = (channelsToSpacedTargetMask >> 0) & 0x00000001; + const uint32_t t1 = (channelsToSpacedTargetMask >> 3) & 0x00000002; + const uint32_t t2 = (channelsToSpacedTargetMask >> 6) & 0x00000004; + const uint32_t t3 = (channelsToSpacedTargetMask >> 9) & 0x00000008; + const uint32_t t4 = (channelsToSpacedTargetMask >> 12) & 0x00000010; + const uint32_t t5 = (channelsToSpacedTargetMask >> 15) & 0x00000020; + const uint32_t t6 = (channelsToSpacedTargetMask >> 18) & 0x00000040; + const uint32_t t7 = (channelsToSpacedTargetMask >> 21) & 0x00000080; + const uint32_t combinedTargetMask = t0 | t1 | t2 | t3 | t4 | t5 | t6 | t7; + return combinedTargetMask; +} + +// Inputs a 8 bit mask of render targets and outputs a 4x8 channel mask, where if a target bit is +// set all corresponding channel bits will be set +inline uint32_t render_target_mask_to_color_channel_mask(uint32_t mask) +{ + // Spread out the individual target bits into the lowest bit of each corresponding 4 bit block, + // which is the indicator bit for the first channel (r) + const uint32_t t0 = (mask & 0x00000001) << 0; + const uint32_t t1 = (mask & 0x00000002) << 3; + const uint32_t t2 = (mask & 0x00000004) << 6; + const uint32_t t3 = (mask & 0x00000008) << 9; + const uint32_t t4 = (mask & 0x00000010) << 12; + const uint32_t t5 = (mask & 0x00000020) << 15; + const uint32_t t6 = (mask & 0x00000040) << 18; + const uint32_t t7 = (mask & 0x00000080) << 21; + const uint32_t r = t0 | t1 | t2 | t3 | t4 | t5 | t6 | t7; + // Replicate indicator bits from first channel (r) to all others (g, b and a) + const uint32_t g = r << 1; + const uint32_t b = r << 2; + const uint32_t a = r << 3; + return r | g | b | a; +} + +// Takes a 4x8 bit render target output channel mask and turns it into a 4x8 render target ouput mask +// where if any channel of a target is enabled all channels of the result are enabled. +// Simply speaking it turns all non 0 hex digits in the mask into F and all 0 are keept as 0. 
+inline uint32_t spread_color_chanel_mask_to_render_target_color_channel_mask(uint32_t mask) +{ + const uint32_t r = mask & 0x11111111; + const uint32_t g = mask & 0x22222222; + const uint32_t b = mask & 0x44444444; + const uint32_t a = mask & 0x88888888; + const uint32_t r1 = r | (r << 1) | (r << 2) | (r << 3); + const uint32_t g1 = g | (g << 1) | (g << 2) | (g >> 1); + const uint32_t b1 = b | (b << 1) | (b >> 1) | (b >> 2); + const uint32_t a1 = a | (a >> 1) | (a >> 2) | (a >> 3); + return r1 | g1 | b1 | a1; +} diff --git a/prog/engine/drv/drv3d_DX12/resource_manager/basic_buffer.h b/prog/engine/drv/drv3d_DX12/resource_manager/basic_buffer.h new file mode 100644 index 000000000..3ec68aa03 --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/resource_manager/basic_buffer.h @@ -0,0 +1,81 @@ +#pragma once + +#include + +#include "driver.h" +#include "d3d12_error_handling.h" +#include "resource_memory.h" + +#include "resource_manager/heap_components.h" + + +namespace drv3d_dx12::resource_manager +{ + +// Basic buffer with some common stuff, like handling differences in memory model between PC and consoles. 
+struct BasicBuffer +{ + ComPtr buffer; + ResourceMemory bufferMemory; + // PC needs extra data, on consoles bufferMemory has everything we need +#if !_TARGET_XBOX + D3D12_GPU_VIRTUAL_ADDRESS gpuPointer = 0; + uint8_t *pointer = nullptr; +#endif + +#if _TARGET_XBOX + D3D12_GPU_VIRTUAL_ADDRESS getGPUPointer() const { return bufferMemory.getAddress(); } + uint8_t *getCPUPointer() const { return bufferMemory.asPointer(); } +#else + D3D12_GPU_VIRTUAL_ADDRESS getGPUPointer() const { return gpuPointer; } + uint8_t *getCPUPointer() const { return pointer; } +#endif + + HRESULT create(ID3D12Device *device, const D3D12_RESOURCE_DESC &desc, ResourceMemory mem, D3D12_RESOURCE_STATES initial_state, + bool map) + { + HRESULT errorCode = S_OK; +#if _TARGET_XBOX + G_UNUSED(map); + errorCode = + DX12_CHECK_RESULT_NO_OOM_CHECK(xbox_create_placed_resource(device, mem.getAddress(), desc, initial_state, nullptr, buffer)); + if (DX12_CHECK_FAIL(errorCode)) + { + return errorCode; + } +#else + errorCode = DX12_CHECK_RESULT_NO_OOM_CHECK( + device->CreatePlacedResource(mem.getHeap(), mem.getOffset(), &desc, initial_state, nullptr, COM_ARGS(&buffer))); + if (DX12_CHECK_FAIL(errorCode)) + { + return errorCode; + } + gpuPointer = buffer->GetGPUVirtualAddress(); + if (map) + { + D3D12_RANGE emptyRange{}; + buffer->Map(0, &emptyRange, reinterpret_cast(&pointer)); + } + else + { + pointer = nullptr; + } +#endif + bufferMemory = mem; + return errorCode; + } + + void reset(ResourceMemoryHeapProvider *heap) + { + if (bufferMemory && 0 == bufferMemory.getHeapID().isAlias) + { + heap->free(bufferMemory); + bufferMemory = {}; + } + buffer.Reset(); + } + + explicit operator bool() const { return static_cast(buffer); } +}; + +} // namespace drv3d_dx12::resource_manager \ No newline at end of file diff --git a/prog/engine/drv/drv3d_DX12/resource_memory_heap_basic_components.h b/prog/engine/drv/drv3d_DX12/resource_manager/basic_components.h similarity index 89% rename from 
prog/engine/drv/drv3d_DX12/resource_memory_heap_basic_components.h rename to prog/engine/drv/drv3d_DX12/resource_manager/basic_components.h index 65d688c3b..3ce67adbe 100644 --- a/prog/engine/drv/drv3d_DX12/resource_memory_heap_basic_components.h +++ b/prog/engine/drv/drv3d_DX12/resource_manager/basic_components.h @@ -1,200 +1,32 @@ #pragma once -#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include // psapi.h is not self-contained, it needs windows.h + +#include "driver.h" +#include "d3d12_utils.h" +#include "typed_bit_set.h" +#include "tagged_types.h" +#include "extents.h" +#include "format_store.h" +#include "container_mutex_wrapper.h" +#include "free_list_utils.h" +#include "image_global_subresource_id.h" + namespace drv3d_dx12 { // We try to work without this #define DX12_USE_ACTIVITY_LOCKING 0 -class ImageGlobalSubresouceId -{ -protected: - static constexpr uint32_t invalid_id = 0x00FFFFFF; - - uint32_t value = invalid_id; - - constexpr ImageGlobalSubresouceId(uint32_t v) : value{v} {} - - friend class ExtendedImageGlobalSubresouceId; - -public: - constexpr ImageGlobalSubresouceId() = default; - ~ImageGlobalSubresouceId() = default; - constexpr ImageGlobalSubresouceId(const ImageGlobalSubresouceId &) = default; - ImageGlobalSubresouceId &operator=(const ImageGlobalSubresouceId &) = default; - - constexpr bool isValid() const { return invalid_id != value; } - constexpr uint32_t index() const { return value; } - - static ImageGlobalSubresouceId make(uint32_t value) - { - G_ASSERT(value == (value & invalid_id)); - return {value}; - } - - static constexpr ImageGlobalSubresouceId makec(uint32_t value) { return {value}; } - - static constexpr ImageGlobalSubresouceId make_invalid() { return {}; } - - ImageGlobalSubresouceId &operator+=(uint32_t r) - { - G_ASSERT(isValid()); - value += r; - return *this; - } - - ImageGlobalSubresouceId &operator-=(uint32_t r) - { - G_ASSERT(isValid()); - value -= r; - return *this; - } - 
- ImageGlobalSubresouceId &operator++() - { - G_ASSERT(isValid()); - ++value; - return *this; - } - - ImageGlobalSubresouceId operator++(int) const - { - G_ASSERT(isValid()); - auto copy = *this; - return ++copy; - } - - ImageGlobalSubresouceId &operator--() - { - G_ASSERT(isValid()); - --value; - return *this; - } - - ImageGlobalSubresouceId operator--(int) const - { - G_ASSERT(isValid()); - auto copy = *this; - return --copy; - } - - operator DagorSafeArg() const { return {index()}; } - - constexpr SubresourceIndex toSubresouceIndex(ImageGlobalSubresouceId base) const - { - return SubresourceIndex::make(index() - base.index()); - } -}; - -static constexpr ImageGlobalSubresouceId swapchain_color_texture_global_id = ImageGlobalSubresouceId::makec(0); -static constexpr ImageGlobalSubresouceId swapchain_secondary_color_texture_global_id = ImageGlobalSubresouceId::makec(1); -static constexpr ImageGlobalSubresouceId first_dynamic_texture_global_id = ImageGlobalSubresouceId::makec(2); - -inline constexpr ImageGlobalSubresouceId operator+(const ImageGlobalSubresouceId &l, uint32_t r) -{ - return ImageGlobalSubresouceId::makec(l.index() + r); -} - -inline constexpr ImageGlobalSubresouceId operator+(const ImageGlobalSubresouceId &l, SubresourceIndex r) -{ - return ImageGlobalSubresouceId::makec(l.index() + r.index()); -} - -inline constexpr ImageGlobalSubresouceId operator-(const ImageGlobalSubresouceId &l, uint32_t r) -{ - return ImageGlobalSubresouceId::makec(l.index() - r); -} - -inline constexpr ImageGlobalSubresouceId operator-(const ImageGlobalSubresouceId &l, SubresourceIndex r) -{ - return ImageGlobalSubresouceId::makec(l.index() - r.index()); -} - -inline constexpr size_t operator-(const ImageGlobalSubresouceId &l, const ImageGlobalSubresouceId &r) { return l.index() - r.index(); } - -inline constexpr bool operator==(const ImageGlobalSubresouceId &l, const ImageGlobalSubresouceId &r) { return l.index() == r.index(); } - -inline constexpr bool operator!=(const 
ImageGlobalSubresouceId &l, const ImageGlobalSubresouceId &r) { return l.index() != r.index(); } - -inline constexpr bool operator<(const ImageGlobalSubresouceId &l, const ImageGlobalSubresouceId &r) { return l.index() < r.index(); } - -inline constexpr bool operator<=(const ImageGlobalSubresouceId &l, const ImageGlobalSubresouceId &r) { return l.index() <= r.index(); } - -inline constexpr bool operator>(const ImageGlobalSubresouceId &l, const ImageGlobalSubresouceId &r) { return l.index() > r.index(); } - -inline constexpr bool operator>=(const ImageGlobalSubresouceId &l, const ImageGlobalSubresouceId &r) { return l.index() >= r.index(); } - -using BareBoneImageGlobalSubresouceIdRange = ValueRange; - -class ExtendedImageGlobalSubresouceId -{ - using BareBoneType = ImageGlobalSubresouceId; - static constexpr uint32_t invalid_id = BareBoneType::invalid_id; - static constexpr uint32_t static_texture_bit = 1u << 31; - static constexpr uint32_t report_transitions_bit = 1u << 30; - static constexpr uint32_t index_mask = invalid_id; - static constexpr uint32_t status_mask = ~index_mask; - - uint32_t value = invalid_id; - - constexpr ExtendedImageGlobalSubresouceId(uint32_t v) : value{v} {} - -public: - constexpr ExtendedImageGlobalSubresouceId() = default; - ~ExtendedImageGlobalSubresouceId() = default; - constexpr ExtendedImageGlobalSubresouceId(const ExtendedImageGlobalSubresouceId &) = default; - ExtendedImageGlobalSubresouceId &operator=(const ExtendedImageGlobalSubresouceId &) = default; - - constexpr ImageGlobalSubresouceId asBareBone() const { return {index()}; } - - constexpr operator ImageGlobalSubresouceId() const { return asBareBone(); } - - static constexpr ExtendedImageGlobalSubresouceId make(ImageGlobalSubresouceId v) { return {v.index()}; } - - static ExtendedImageGlobalSubresouceId make(uint32_t v) - { - G_ASSERT(0 == (v & index_mask)); - return {v}; - } - - static ExtendedImageGlobalSubresouceId make_static(uint32_t v) - { - G_ASSERT(0 == (v & 
index_mask)); - return {v | static_texture_bit}; - } - - void setStatic() { value |= static_texture_bit; } - void setNonStatic() { value &= ~static_texture_bit; } - - void enableTransitionReporting() { value |= report_transitions_bit; } - void disableTransitionReporting() { value &= ~report_transitions_bit; } - - constexpr bool isValid() const { return invalid_id != (value & index_mask); } - constexpr uint32_t index() const { return value & index_mask; } - constexpr bool isStatic() const { return 0 != (value & static_texture_bit); } - constexpr bool shouldReportTransitions() const { return 0 != (value & report_transitions_bit); } - - constexpr ExtendedImageGlobalSubresouceId add(uint32_t v) const { return {value + v}; } - - constexpr ExtendedImageGlobalSubresouceId add(SubresourceCount v) const { return {value + v.count()}; } - - constexpr ExtendedImageGlobalSubresouceId sub(uint32_t v) const { return {value - v}; } - - operator DagorSafeArg() const { return {index()}; } - - constexpr SubresourceIndex toSubresouceIndex(ImageGlobalSubresouceId base) const - { - return SubresourceIndex::make(index() - base.index()); - } -}; - -inline constexpr ExtendedImageGlobalSubresouceId operator+(const ExtendedImageGlobalSubresouceId &l, uint32_t r) { return l.add(r); } - -inline constexpr ExtendedImageGlobalSubresouceId operator-(const ExtendedImageGlobalSubresouceId &l, uint32_t r) { return l.sub(r); } - -using ExtendedImageGlobalSubresouceIdRange = ValueRange; - namespace resource_manager { class ConcurrentAccessControler diff --git a/prog/engine/drv/drv3d_DX12/resource_memory_heap_buffer_components.h b/prog/engine/drv/drv3d_DX12/resource_manager/buffer_components.h similarity index 94% rename from prog/engine/drv/drv3d_DX12/resource_memory_heap_buffer_components.h rename to prog/engine/drv/drv3d_DX12/resource_manager/buffer_components.h index a4f4a3fc1..1e2967ab2 100644 --- a/prog/engine/drv/drv3d_DX12/resource_memory_heap_buffer_components.h +++ 
b/prog/engine/drv/drv3d_DX12/resource_manager/buffer_components.h @@ -1,5 +1,16 @@ #pragma once +#include +#include + +#include "device_memory_class.h" +#include "pipeline.h" +#include "container_mutex_wrapper.h" +#include "bindless.h" + +#include "resource_manager/host_shared_components.h" + + namespace drv3d_dx12 { namespace resource_manager @@ -43,9 +54,9 @@ class BufferHeap : public PersistentBidirectionalMemoryProvider // TODO can be derived from this pointer and properties BufferGlobalId resId; D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE; - eastl::vector> freeRanges; + eastl::vector> freeRanges; - bool free(ValueRange range) + bool free(ValueRange range) { free_list_insert_and_coalesce(freeRanges, range); return freeRanges.front().size() == bufferMemory.size(); @@ -56,7 +67,7 @@ class BufferHeap : public PersistentBidirectionalMemoryProvider // This allocate is a bit more simple than a general purpose allocation as it has the goal to // keep similarly aligned things in the same buffer heaps together So allocation from this will // fail if no free range is found that has a matching offset / address alignment. 
- ValueRange allocate(uint32_t size, uint32_t alignment) + ValueRange allocate(uint64_t size, uint64_t alignment) { if (freeRanges.empty()) { @@ -78,14 +89,14 @@ class BufferHeap : public PersistentBidirectionalMemoryProvider { ref->start += size; } - return make_value_range(s, size); + return make_value_range(s, size); } }; struct StandbyInfo { uint64_t progress; uint32_t index; - ValueRange range; + ValueRange range; }; struct BufferHeapState { @@ -94,11 +105,11 @@ class BufferHeap : public PersistentBidirectionalMemoryProvider // list of buffers can be big, up to 2k, so saving the free slots should be more efficient than searching on each allocate eastl::vector freeBufferSlots; - eastl::pair> tryAllocateFromReadyList(ResourceHeapProperties properties, uint32_t size, - D3D12_RESOURCE_FLAGS flags, uint32_t offset_alignment, bool allow_offset) + eastl::pair> tryAllocateFromReadyList(ResourceHeapProperties properties, uint64_t size, + D3D12_RESOURCE_FLAGS flags, uint64_t offset_alignment, bool allow_offset) { Heap *selectedHeap = nullptr; - ValueRange allocationRange; + ValueRange allocationRange; // Do backward search so we always use the most recently added and we have a chance to free // older stuff. Also erase is a bit more efficient. 
auto ref = eastl::find_if(rbegin(bufferHeapDiscardStandbyList), rend(bufferHeapDiscardStandbyList), @@ -138,11 +149,11 @@ class BufferHeap : public PersistentBidirectionalMemoryProvider return {selectedHeap, allocationRange}; } - eastl::pair> trySuballocateFromExistingHeaps(ResourceHeapProperties properties, uint32_t size, + eastl::pair> trySuballocateFromExistingHeaps(ResourceHeapProperties properties, uint64_t size, D3D12_RESOURCE_FLAGS flags, uint32_t offset_alignment) { Heap *selectedHeap = nullptr; - ValueRange allocationRange; + ValueRange allocationRange; for (auto &heap : bufferHeaps) { if (!heap) @@ -199,7 +210,7 @@ class BufferHeap : public PersistentBidirectionalMemoryProvider return result; } - size_t freeBufferHeap(BufferHeap *manager, uint32_t index, ValueRange range, const char *name) + size_t freeBufferHeap(BufferHeap *manager, uint32_t index, ValueRange range, const char *name) { auto &heap = bufferHeaps[index]; auto memoryHeapID = heap.bufferMemory.getHeapID(); @@ -216,7 +227,7 @@ class BufferHeap : public PersistentBidirectionalMemoryProvider return result; } - BufferGlobalId createBufferHeap(BufferHeap *manager, DXGIAdapter *adapter, ID3D12Device *device, uint32_t allocation_size, + BufferGlobalId createBufferHeap(BufferHeap *manager, DXGIAdapter *adapter, ID3D12Device *device, uint64_t allocation_size, ResourceHeapProperties properties, D3D12_RESOURCE_FLAGS flags, DeviceMemoryClass memory_class, const char *name) { BufferGlobalId result; @@ -259,7 +270,7 @@ class BufferHeap : public PersistentBidirectionalMemoryProvider return result; } - newHeap.freeRanges.push_back(make_value_range(0, allocation_size)); + newHeap.freeRanges.push_back(make_value_range(0, allocation_size)); newHeap.flags = flags; result = adoptBufferHeap(eastl::move(newHeap)); @@ -559,7 +570,7 @@ class BufferHeap : public PersistentBidirectionalMemoryProvider return result; } - BufferState allocateBuffer(DXGIAdapter *adapter, ID3D12Device *device, uint32_t size, uint32_t 
structure_size, + BufferState allocateBuffer(DXGIAdapter *adapter, ID3D12Device *device, uint64_t size, uint32_t structure_size, uint32_t discard_count, DeviceMemoryClass memory_class, D3D12_RESOURCE_FLAGS flags, uint32_t cflags, const char *name, bool disable_sub_alloc, bool name_objects) { @@ -568,15 +579,15 @@ class BufferHeap : public PersistentBidirectionalMemoryProvider auto heapProperties = getProperties(flags, memory_class, D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT); Heap *selectedHeap = nullptr; - ValueRange allocationRange; + ValueRange allocationRange; size_t memoryAllocatedSize = 0; G_ASSERTF(discard_count > 0, "discard count has to be at least one"); - uint32_t payloadSize = size; + uint64_t payloadSize = size; auto offsetAlignment = calculateOffsetAlignment(cflags, max(1, structure_size)); BufferState result; if (canUseSubAlloc) { - payloadSize = align_value(payloadSize, offsetAlignment); + payloadSize = align_value(payloadSize, offsetAlignment); auto bufferHeapStateAccess = bufferHeapState.access(); // First try to allocate from ready list @@ -594,7 +605,7 @@ class BufferHeap : public PersistentBidirectionalMemoryProvider if (!selectedHeap) { auto resIndex = bufferHeapStateAccess->createBufferHeap(this, adapter, device, - align_value(payloadSize * discard_count, D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT), heapProperties, flags, + align_value(payloadSize * discard_count, D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT), heapProperties, flags, memory_class, nullptr); auto heapIndex = resIndex.index(); @@ -603,7 +614,7 @@ class BufferHeap : public PersistentBidirectionalMemoryProvider { selectedHeap = &bufferHeapStateAccess->bufferHeaps[heapIndex]; memoryAllocatedSize = selectedHeap->bufferMemory.size(); - allocationRange = make_value_range(0, payloadSize); + allocationRange = make_value_range(0, payloadSize); if (selectedHeap->freeRanges.front().stop == payloadSize) { selectedHeap->freeRanges.pop_back(); @@ -648,7 +659,7 @@ class BufferHeap : public 
PersistentBidirectionalMemoryProvider { if (discard_count > 1) { - payloadSize = align_value(payloadSize, offsetAlignment); + payloadSize = align_value(payloadSize, offsetAlignment); #if DX12_REPORT_BUFFER_PADDING auto padd = payloadSize - size; if (padd) @@ -659,7 +670,7 @@ class BufferHeap : public PersistentBidirectionalMemoryProvider // if we have to pay possibly lots of overhead, try to cram as many discards into it as // possible - discard_count = align_value(payloadSize * discard_count, D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT) / payloadSize; + discard_count = align_value(payloadSize * discard_count, D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT) / payloadSize; } else { @@ -671,7 +682,7 @@ class BufferHeap : public PersistentBidirectionalMemoryProvider } #endif } - uint32_t totalSize = payloadSize * discard_count; + uint64_t totalSize = payloadSize * discard_count; { auto bufferHeapStateAccess = bufferHeapState.access(); @@ -682,7 +693,7 @@ class BufferHeap : public PersistentBidirectionalMemoryProvider if (!selectedHeap) { auto resIndex = bufferHeapStateAccess->createBufferHeap(this, adapter, device, - align_value(totalSize, D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT), heapProperties, flags, memory_class, name); + align_value(totalSize, D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT), heapProperties, flags, memory_class, name); auto heapIndex = resIndex.index(); @@ -690,7 +701,7 @@ class BufferHeap : public PersistentBidirectionalMemoryProvider { selectedHeap = &bufferHeapStateAccess->bufferHeaps[heapIndex]; memoryAllocatedSize = selectedHeap->bufferMemory.size(); - allocationRange = make_value_range(0, totalSize); + allocationRange = make_value_range(0, totalSize); if (selectedHeap->freeRanges.front().stop == totalSize) { selectedHeap->freeRanges.pop_back(); diff --git a/prog/engine/drv/drv3d_DX12/resource_memory_heap_descriptor_components.h b/prog/engine/drv/drv3d_DX12/resource_manager/descriptor_components.h similarity index 98% rename from 
prog/engine/drv/drv3d_DX12/resource_memory_heap_descriptor_components.h rename to prog/engine/drv/drv3d_DX12/resource_manager/descriptor_components.h index 8891ab2a5..df5af2ac6 100644 --- a/prog/engine/drv/drv3d_DX12/resource_memory_heap_descriptor_components.h +++ b/prog/engine/drv/drv3d_DX12/resource_manager/descriptor_components.h @@ -1,5 +1,12 @@ #pragma once +#include "descriptor_heap.h" +#include "pipeline.h" +#include "format_store.h" + +#include "resource_manager/object_components.h" + + namespace drv3d_dx12 { namespace resource_manager diff --git a/prog/engine/drv/drv3d_DX12/resource_manager/esram_components.h b/prog/engine/drv/drv3d_DX12/resource_manager/esram_components.h new file mode 100644 index 000000000..59fa0d8fc --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/resource_manager/esram_components.h @@ -0,0 +1,15 @@ +#pragma once + +#include "driver.h" + +#include "resource_manager/heap_components.h" + + +#if DX12_USE_ESRAM +#include "resource_manager/esram_components_xbox.h" +#else +namespace drv3d_dx12::resource_manager +{ +using ESRamPageMappingProvider = ResourceMemoryHeapProvider; +} +#endif diff --git a/prog/engine/drv/drv3d_DX12/resource_memory_heap_heap_components.cpp b/prog/engine/drv/drv3d_DX12/resource_manager/heap_components.cpp similarity index 99% rename from prog/engine/drv/drv3d_DX12/resource_memory_heap_heap_components.cpp rename to prog/engine/drv/drv3d_DX12/resource_manager/heap_components.cpp index 81c16362a..0f69fcdff 100644 --- a/prog/engine/drv/drv3d_DX12/resource_memory_heap_heap_components.cpp +++ b/prog/engine/drv/drv3d_DX12/resource_manager/heap_components.cpp @@ -530,7 +530,7 @@ uint64_t MemoryBudgetObserver::getHeapSizeFromAllocationSize(uint64_t size, Reso #endif #if _TARGET_XBOX -#include "resource_memory_heap_heap_components_xbox.inl.cpp" +#include "resource_manager/heap_components_xbox.inl.cpp" #else D3D12_RESOURCE_STATES ResourceMemoryHeapProvider::propertiesToInitialState(D3D12_RESOURCE_DIMENSION dim, 
D3D12_RESOURCE_FLAGS flags, DeviceMemoryClass) diff --git a/prog/engine/drv/drv3d_DX12/resource_memory_heap_heap_components.h b/prog/engine/drv/drv3d_DX12/resource_manager/heap_components.h similarity index 95% rename from prog/engine/drv/drv3d_DX12/resource_memory_heap_heap_components.h rename to prog/engine/drv/drv3d_DX12/resource_manager/heap_components.h index 2bebc8586..946042936 100644 --- a/prog/engine/drv/drv3d_DX12/resource_memory_heap_heap_components.h +++ b/prog/engine/drv/drv3d_DX12/resource_manager/heap_components.h @@ -1,8 +1,27 @@ #pragma once +#include +#include +#include +#include +#include + +#include "driver.h" +#include "typed_bit_set.h" +#include "pipeline.h" +#include "resource_memory.h" +#include "device_memory_class.h" +#include "value_range.h" +#include "free_list_utils.h" + +#include "resource_manager/descriptor_components.h" + + namespace drv3d_dx12 { struct RaytraceAccelerationStructure; +class Image; + namespace resource_manager { @@ -462,7 +481,7 @@ class MemoryBudgetObserver : public ResourceHeapFeatureController uint64_t getHeapSizeFromAllocationSize(uint64_t size, ResourceHeapProperties properties, AllocationFlags flags); }; #else -#include "resource_memory_heap_heap_components_xbox.inl.h" +#include "resource_manager/heap_components_xbox.inl.h" #endif class ResourceMemoryHeapBase : public MemoryBudgetObserver @@ -520,21 +539,21 @@ class ResourceMemoryHeapBase : public MemoryBudgetObserver RaytraceBottomLevelAccelerationStructureRefnerence, RaytraceTopLevelAccelerationStructureRefnerence>; struct HeapResourceInfo { - ValueRange range; + ValueRange range; AnyResourceReference resource; HeapResourceInfo() = default; HeapResourceInfo(const HeapResourceInfo &) = default; - HeapResourceInfo(ValueRange r) : range{r} {} + HeapResourceInfo(ValueRange r) : range{r} {} }; struct BasicResourceHeap { - using FreeRangeSetType = eastl::vector>; + using FreeRangeSetType = eastl::vector>; using UsedRangeSetType = eastl::vector; FreeRangeSetType 
freeRanges; UsedRangeSetType usedRanges; - ValueRange lockedRange{}; - uint32_t totalSize = 0; + ValueRange lockedRange{}; + uint64_t totalSize = 0; static constexpr uint64_t fragmentation_range = 10000; uint32_t fragmentation = 0; @@ -546,14 +565,14 @@ class ResourceMemoryHeapBase : public MemoryBudgetObserver fragmentation = fragmentation_range - ((ctx.maxSize() * fragmentation_range) / ctx.totalSize()); } - uint32_t freeSize() const + uint64_t freeSize() const { - return eastl::accumulate(begin(freeRanges), end(freeRanges), 0, [](uint32_t v, auto range) { return v + range.size(); }); + return eastl::accumulate(begin(freeRanges), end(freeRanges), 0, [](uint64_t v, auto range) { return v + range.size(); }); } - uint32_t allocatedSize() const { return totalSize - freeSize(); } + uint64_t allocatedSize() const { return totalSize - freeSize(); } - void lock(ValueRange range) + void lock(ValueRange range) { G_ASSERT(!isLocked()); lockedRange = range; @@ -565,14 +584,14 @@ class ResourceMemoryHeapBase : public MemoryBudgetObserver bool isFree() const { return freeRanges.front().size() == totalSize; } - auto findUsedInfo(ValueRange range) + auto findUsedInfo(ValueRange range) { return eastl::lower_bound(begin(usedRanges), end(usedRanges), range, [](auto &info, auto range) // { return info.range.front() < range.front(); }); } - void freeRange(ValueRange range) + void freeRange(ValueRange range) { auto rangeRef = findUsedInfo(range); G_ASSERT(rangeRef != end(usedRanges) && rangeRef->range == range); @@ -585,7 +604,7 @@ class ResourceMemoryHeapBase : public MemoryBudgetObserver } template - void updateMemoryRangeUse(ValueRange range, T &&ref) + void updateMemoryRangeUse(ValueRange range, T &&ref) { auto rangeRef = findUsedInfo(range); G_ASSERT(rangeRef != end(usedRanges) && rangeRef->range == range); @@ -607,7 +626,7 @@ class ResourceMemoryHeapBase : public MemoryBudgetObserver { continue; } - auto alignedStart = align_value(at->front(), alloc_info.Alignment); + auto 
alignedStart = align_value(at->front(), alloc_info.Alignment); auto possibleAllocRange = make_value_range(alignedStart, alloc_info.SizeInBytes); if (possibleAllocRange.isSubRangeOf(*at)) { @@ -636,7 +655,7 @@ class ResourceMemoryHeapBase : public MemoryBudgetObserver { continue; } - auto alignedStart = align_value(at->front(), alloc_info.Alignment); + auto alignedStart = align_value(at->front(), alloc_info.Alignment); auto possibleAllocRange = make_value_range(alignedStart, alloc_info.SizeInBytes); if (!possibleAllocRange.isSubRangeOf(*at)) { @@ -656,9 +675,9 @@ class ResourceMemoryHeapBase : public MemoryBudgetObserver bool isValidRange(FreeRangeSetType::iterator selected) { return end(freeRanges) != selected; } - ValueRange allocateFromRange(const D3D12_RESOURCE_ALLOCATION_INFO &alloc_info, FreeRangeSetType::iterator selected) + ValueRange allocateFromRange(const D3D12_RESOURCE_ALLOCATION_INFO &alloc_info, FreeRangeSetType::iterator selected) { - auto alignedStart = align_value(selected->front(), alloc_info.Alignment); + auto alignedStart = align_value(selected->front(), alloc_info.Alignment); ; auto possibleAllocRange = make_value_range(alignedStart, alloc_info.SizeInBytes); @@ -843,7 +862,7 @@ class ResourceMemoryHeapBase : public MemoryBudgetObserver explicit operator bool() const { return (end(parent.freeRanges) != freeRangePos) || (end(parent.usedRanges) != usedRangePos); } // Undefined behavior if static_cast(*this) == false - const ValueRange &getRange() const + const ValueRange &getRange() const { G_ASSERT(static_cast(*this)); if (isFreeRange()) @@ -854,10 +873,10 @@ class ResourceMemoryHeapBase : public MemoryBudgetObserver } // Undefined behavior if static_cast(*this) == false - const ValueRange &operator*() const { return getRange(); } + const ValueRange &operator*() const { return getRange(); } // Undefined behavior if static_cast(*this) == false - const ValueRange *operator->() const { return &getRange(); } + const ValueRange *operator->() const { 
return &getRange(); } // Undefined behavior if isUsedRange returns false for this auto getUsedResource() const -> eastl::add_lvalue_reference_tresource)> @@ -906,7 +925,7 @@ class ResourceMemoryHeapProvider : public ResourceMemoryHeapBase } // only valid if isPartOf(mem) is true - uint32_t calculateOffset(ResourceMemory mem) const { return mem.asPointer() - heap.get(); } + uint64_t calculateOffset(ResourceMemory mem) const { return mem.asPointer() - heap.get(); } uint8_t *heapPointer() const { return heap.get(); } @@ -1039,7 +1058,7 @@ class ResourceMemoryHeapProvider : public ResourceMemoryHeapBase { ResourceHeapProperties properties; OSSpinlockScopedLock lock{heapGroupMutex}; - for (properties.raw = 0; properties.raw < array_size(groups); ++properties.raw) + for (properties.raw = 0; properties.raw < countof(groups); ++properties.raw) { auto &group = groups[properties.raw]; clb.visitHeapGroup(properties.raw, group.size(), true, 0 != properties.isCPUCoherent, 0 != properties.isGPUExecutable); @@ -1159,7 +1178,7 @@ class ResourceMemoryHeapProvider : public ResourceMemoryHeapBase { ResourceHeapProperties props; OSSpinlockScopedLock lock{heapGroupMutex}; - for (props.raw = 0; props.raw < array_size(groups); ++props.raw) + for (props.raw = 0; props.raw < countof(groups); ++props.raw) { auto &group = groups[props.raw]; clb.visitHeapGroup(props.raw, group.size(), props.isCPUVisible(), diff --git a/prog/engine/drv/drv3d_DX12/resource_memory_heap_host_shared_components.h b/prog/engine/drv/drv3d_DX12/resource_manager/host_shared_components.h similarity index 95% rename from prog/engine/drv/drv3d_DX12/resource_memory_heap_host_shared_components.h rename to prog/engine/drv/drv3d_DX12/resource_manager/host_shared_components.h index 5df0191de..194a4fc78 100644 --- a/prog/engine/drv/drv3d_DX12/resource_memory_heap_host_shared_components.h +++ b/prog/engine/drv/drv3d_DX12/resource_manager/host_shared_components.h @@ -1,75 +1,23 @@ #pragma once -namespace drv3d_dx12 -{ -namespace 
resource_manager -{ -// Basic buffer with some common stuff, like handling differences in memory model between PC and consoles. -struct BasicBuffer -{ - ComPtr buffer; - ResourceMemory bufferMemory; - // PC needs extra data, on consoles bufferMemory has everything we need -#if !_TARGET_XBOX - D3D12_GPU_VIRTUAL_ADDRESS gpuPointer = 0; - uint8_t *pointer = nullptr; -#endif - -#if _TARGET_XBOX - D3D12_GPU_VIRTUAL_ADDRESS getGPUPointer() const { return bufferMemory.getAddress(); } - uint8_t *getCPUPointer() const { return bufferMemory.asPointer(); } -#else - D3D12_GPU_VIRTUAL_ADDRESS getGPUPointer() const { return gpuPointer; } - uint8_t *getCPUPointer() const { return pointer; } -#endif +#include +#include - HRESULT create(ID3D12Device *device, const D3D12_RESOURCE_DESC &desc, ResourceMemory mem, D3D12_RESOURCE_STATES initial_state, - bool map) - { - HRESULT errorCode = S_OK; -#if _TARGET_XBOX - G_UNUSED(map); - errorCode = - DX12_CHECK_RESULT_NO_OOM_CHECK(xbox_create_placed_resource(device, mem.getAddress(), desc, initial_state, nullptr, buffer)); - if (DX12_CHECK_FAIL(errorCode)) - { - return errorCode; - } -#else - errorCode = DX12_CHECK_RESULT_NO_OOM_CHECK( - device->CreatePlacedResource(mem.getHeap(), mem.getOffset(), &desc, initial_state, nullptr, COM_ARGS(&buffer))); - if (DX12_CHECK_FAIL(errorCode)) - { - return errorCode; - } - gpuPointer = buffer->GetGPUVirtualAddress(); - if (map) - { - D3D12_RANGE emptyRange{}; - buffer->Map(0, &emptyRange, reinterpret_cast(&pointer)); - } - else - { - pointer = nullptr; - } -#endif - bufferMemory = mem; - return errorCode; - } +#include "driver.h" +#include "constants.h" +#include "resource_memory.h" +#include "host_device_shared_memory_region.h" +#include "d3d12_error_handling.h" +#include "container_mutex_wrapper.h" - void reset(ResourceMemoryHeapProvider *heap) - { - if (bufferMemory && 0 == bufferMemory.getHeapID().isAlias) - { - heap->free(bufferMemory); - bufferMemory = {}; - } - buffer.Reset(); - } +#include 
"resource_manager/basic_buffer.h" +#include "resource_manager/esram_components.h" - explicit operator bool() const { return static_cast(buffer); } -}; +namespace drv3d_dx12 +{ +namespace resource_manager +{ class RingMemoryBase : public ESRamPageMappingProvider { using BaseType = ESRamPageMappingProvider; diff --git a/prog/engine/drv/drv3d_DX12/resource_memory_heap_object_components.h b/prog/engine/drv/drv3d_DX12/resource_manager/image.h similarity index 54% rename from prog/engine/drv/drv3d_DX12/resource_memory_heap_object_components.h rename to prog/engine/drv/drv3d_DX12/resource_manager/image.h index 69856dd17..f825980d0 100644 --- a/prog/engine/drv/drv3d_DX12/resource_memory_heap_object_components.h +++ b/prog/engine/drv/drv3d_DX12/resource_manager/image.h @@ -1,83 +1,26 @@ #pragma once #include -#if _TARGET_XBOX -#if _TARGET_SCARLETT -#include -#else -#include -#endif +#include +#include #include -inline void intrusive_ptr_add_ref(XGTextureAddressComputer *ptr) { ptr->AddRef(); } -inline void intrusive_ptr_release(XGTextureAddressComputer *ptr) { ptr->Release(); } -#endif +#include "driver.h" +#include "constants.h" +#include "extents.h" +#include "image_view_state.h" +#include "resource_memory.h" +#include "container_mutex_wrapper.h" +#include "image_global_subresource_id.h" +#include "texture_subresource_util.h" #if DX12_USE_ESRAM -#include -namespace drv3d_dx12 -{ -struct EsramResource -{ - XGMemoryLayoutMapping mapping{}; - Image *dramStorage = nullptr; - int mappingIndex = -1; - - explicit operator bool() const { return -1 != mappingIndex; } -}; -} // namespace drv3d_dx12 +#include "resource_manager/esram_resource_xbox.h" #endif + namespace drv3d_dx12 { -// size is not important here -struct ImageInfo -{ - D3D12_RESOURCE_DIMENSION type; - D3D12_RESOURCE_FLAGS usage; - Extent3D size; - ArrayLayerCount arrays; - MipMapCount mips; - FormatStore format; - D3D12_TEXTURE_LAYOUT memoryLayout; - DeviceMemoryClass memoryClass; - bool allocateSubresourceIDs; - - 
SubresourceCount getSubResourceCount() const { return SubresourcePerFormatPlaneCount::make(mips, arrays) * format.getPlanes(); } - - D3D12_RESOURCE_DESC asDesc() const - { - D3D12_RESOURCE_DESC desc; - desc.SampleDesc.Count = 1; - desc.SampleDesc.Quality = 0; - desc.Layout = memoryLayout; - desc.Flags = usage; - desc.Format = format.asDxGiTextureCreateFormat(); - desc.Dimension = type; - desc.Width = size.width; - desc.MipLevels = mips.count(); - desc.Alignment = calculate_texture_alignment(size.width, size.height, size.depth, 1, memoryLayout, usage, format); - switch (type) - { - default: - case D3D12_RESOURCE_DIMENSION_UNKNOWN: - case D3D12_RESOURCE_DIMENSION_BUFFER: fatal("DX12: Invalid texture dimension"); return desc; - case D3D12_RESOURCE_DIMENSION_TEXTURE1D: - desc.Height = 1; - desc.DepthOrArraySize = arrays.count(); - break; - case D3D12_RESOURCE_DIMENSION_TEXTURE2D: - desc.Height = size.height; - desc.DepthOrArraySize = arrays.count(); - break; - case D3D12_RESOURCE_DIMENSION_TEXTURE3D: - desc.Height = size.height; - desc.DepthOrArraySize = size.depth; - break; - } - return desc; - } -}; #define DX12_IMAGE_DEBUG_NAMES 1 class Image @@ -97,6 +40,7 @@ class Image EsramResource esramResource{}; #endif ContainerMutexWrapper debugName; + bool isMultisampledImage = false; D3D12_RESOURCE_DIMENSION imageType = D3D12_RESOURCE_DIMENSION_TEXTURE2D; ArrayLayerCount layerCount{}; @@ -204,6 +148,9 @@ class Image } } + bool isMultisampled() const { return isMultisampledImage; } + void setMultisampled(bool is_multisampled) { isMultisampledImage = is_multisampled; } + ExtendedImageGlobalSubresouceId getGlobalSubResourceIdBase() const { return globalSubResBase; } bool hasTrackedState() const { return globalSubResBase.isValid(); } @@ -256,17 +203,18 @@ class Image } Image(ResourceMemory mem, ComPtr img, D3D12_RESOURCE_DIMENSION type, D3D12_TEXTURE_LAYOUT layout, FormatStore fmt, - Extent3D ext, MipMapCount levels, ArrayLayerCount layers, ImageGlobalSubresouceId 
sub_res_base) : + Extent3D ext, MipMapCount levels, ArrayLayerCount layers, ImageGlobalSubresouceId sub_res_base, bool is_multisampled) : memory{mem}, - imageType(type), + isMultisampledImage{is_multisampled}, + imageType{type}, #ifdef _TARGET_XBOX - textureLayout(layout), + textureLayout{layout}, #endif - layerCount(layers), - image(eastl::move(img)), - format(fmt), - extent(ext), - mipLevels(levels), + layerCount{layers}, + image{eastl::move(img)}, + format{fmt}, + extent{ext}, + mipLevels{levels}, globalSubResBase{ExtendedImageGlobalSubresouceId::make(sub_res_base)} { G_UNUSED(layout); @@ -274,14 +222,16 @@ class Image #if DX12_USE_ESRAM Image(ResourceMemory mem, ComPtr img, D3D12_RESOURCE_DIMENSION type, FormatStore fmt, Extent3D ext, - MipMapCount levels, ArrayLayerCount layers, ImageGlobalSubresouceId sub_res_base, const EsramResource &esram_resource) : + MipMapCount levels, ArrayLayerCount layers, ImageGlobalSubresouceId sub_res_base, const EsramResource &esram_resource, + bool is_multisampled) : memory{mem}, - imageType(type), - layerCount(layers), - image(eastl::move(img)), - format(fmt), - extent(ext), - mipLevels(levels), + isMultisampledImage{is_multisampled}, + imageType{type}, + layerCount{layers}, + image{eastl::move(img)}, + format{fmt}, + extent{ext}, + mipLevels{levels}, globalSubResBase{ExtendedImageGlobalSubresouceId::make(sub_res_base)}, esramResource{esram_resource} {} @@ -332,204 +282,4 @@ class Image void updateFormat(FormatStore fmt) { format = fmt; } }; -struct ImageCreateResult -{ - Image *image; - D3D12_RESOURCE_STATES state; -}; -namespace resource_manager -{ -class BufferObjectProvider : public GlobalSubresourceIdProvider -{ - using BaseType = GlobalSubresourceIdProvider; - -protected: - OSSpinlock bufferPoolGuard; - ObjectPool bufferPool; - - void shutdown() - { - OSSpinlockScopedLock lock{bufferPoolGuard}; - auto sz = bufferPool.size(); - G_ASSERTF(0 == sz, "DX12: Shutdown without destroying all buffers, there are still %u buffers 
alive!", sz); -#if DAGOR_DBGLEVEL > 0 - bufferPool.iterateAllocated([](auto buffer) { G_ASSERTF(false, "DX12: Buffer <%s> still alive!", buffer->getBufName()); }); -#endif - G_UNUSED(sz); - bufferPool.freeAll(); - - BaseType::shutdown(); - } - -public: - template - GenericBufferInterface *newBufferObject(Args &&...args) - { - void *memory = nullptr; - { - OSSpinlockScopedLock lock{bufferPoolGuard}; - memory = bufferPool.acquire(); - } - // The constructor may kick off additional buffer allocations or frees so it can not be done - // under the locked bufferPoolGuard - auto buffer = ::new (memory) GenericBufferInterface(eastl::forward(args)...); - if (!buffer->getDeviceBuffer() && !buffer->isStreamBuffer()) - { - buffer->destroy(); - buffer = nullptr; - } - return buffer; - } - - void deleteBufferObject(GenericBufferInterface *buffer) - { - // Have to destruct here to prevent possible recursive locking. - buffer->~GenericBufferInterface(); - OSSpinlockScopedLock lock{bufferPoolGuard}; - bufferPool.release(buffer); - } - - template - void visitBufferObjects(T clb) - { - OSSpinlockScopedLock lock{bufferPoolGuard}; - bufferPool.iterateAllocated(clb); - } - - void reserveBufferObjects(size_t size) - { - OSSpinlockScopedLock lock{bufferPoolGuard}; - bufferPool.reserve(size); - } - - size_t getBufferObjectCapacity() - { - OSSpinlockScopedLock lock{bufferPoolGuard}; - return bufferPool.capacity(); - } - - size_t getActiveBufferObjectCount() - { - OSSpinlockScopedLock lock{bufferPoolGuard}; - return bufferPool.size(); - } -}; - -class TextureObjectProvider : public BufferObjectProvider -{ - using BaseType = BufferObjectProvider; - -protected: - struct PendingForCompletedFrameData : BaseType::PendingForCompletedFrameData - { - eastl::vector freedTextureObjects; - }; - - ContainerMutexWrapper, OSSpinlock> texturePool; - - void shutdown() - { - auto poolAccess = texturePool.access(); - auto sz = poolAccess->size(); - G_ASSERTF(0 == sz, "DX12: Shutdown without destroying all 
textures, there are still %u textures alive!", sz); -#if DAGOR_DBGLEVEL > 0 - poolAccess->iterateAllocated([](auto tex) { G_ASSERTF(false, "DX12: Texture <%s> still alive!", tex->getResName()); }); -#endif - G_UNUSED(sz); - poolAccess->freeAll(); - - BaseType::shutdown(); - } - - void completeFrameExecution(const CompletedFrameExecutionInfo &info, PendingForCompletedFrameData &data) - { - if (!data.freedTextureObjects.empty()) - { - eastl::sort(begin(data.freedTextureObjects), end(data.freedTextureObjects)); - texturePool.access()->free(begin(data.freedTextureObjects), end(data.freedTextureObjects)); - data.freedTextureObjects.clear(); - } - BaseType::completeFrameExecution(info, data); - } - -public: - template - TextureInterfaceBase *newTextureObject(Args &&...args) - { - return texturePool.access()->allocate(eastl::forward(args)...); - } - - template - void visitTextureObjects(T clb) - { - texturePool.access()->iterateAllocated(clb); - } - - void deleteTextureObjectOnFrameCompletion(TextureInterfaceBase *texture) - { - accessRecodingPendingFrameCompletion( - [=](auto &data) { data.freedTextureObjects.push_back(texture); }); - } - - void reserveTextureObjects(size_t count) { texturePool.access()->reserve(count); } - - size_t getTextureObjectCapacity() { return texturePool.access()->capacity(); } - - size_t getActiveTextureObjectCount() { return texturePool.access()->size(); } -}; - -class ImageObjectProvider : public TextureObjectProvider -{ - using BaseType = TextureObjectProvider; - -protected: - ContainerMutexWrapper, OSSpinlock> imageObjectPool; - - ImageObjectProvider() = default; - ~ImageObjectProvider() = default; - ImageObjectProvider(const ImageObjectProvider &) = delete; - ImageObjectProvider &operator=(const ImageObjectProvider &) = delete; - ImageObjectProvider(ImageObjectProvider &&) = delete; - ImageObjectProvider &operator=(ImageObjectProvider &&) = delete; - - void shutdown() - { - imageObjectPool.access()->freeAll(); - BaseType::shutdown(); - } - 
- void preRecovery() - { - imageObjectPool.access()->freeAll(); - BaseType::preRecovery(); - } - - template - Image *newImageObject(Args &&...args) - { - return imageObjectPool.access()->allocate(eastl::forward(args)...); - } - - void deleteImageObject(Image *image) { imageObjectPool.access()->free(image); } - - void deleteImageObjects(eastl::span images) - { - auto imageObjectPoolAccess = imageObjectPool.access(); - for (auto image : images) - { - imageObjectPoolAccess->free(image); - } - } - -public: - template - void visitImageObjects(T &&clb) - { - imageObjectPool.access()->iterateAllocated(eastl::forward(clb)); - } - - bool isImageAlive(Image *img) { return imageObjectPool.access()->isAllocated(img); } -}; - -} // namespace resource_manager -} // namespace drv3d_dx12 +} // namespace drv3d_dx12 \ No newline at end of file diff --git a/prog/engine/drv/drv3d_DX12/resource_manager/object_components.h b/prog/engine/drv/drv3d_DX12/resource_manager/object_components.h new file mode 100644 index 000000000..ad69d2a9d --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/resource_manager/object_components.h @@ -0,0 +1,348 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "texture.h" +#include "extents.h" +#include "format_store.h" +#include "device_memory_class.h" +#include "image_view_state.h" +#include "buffer.h" +#include "resource_memory.h" +#include "texture_subresource_util.h" + +#include "resource_manager/basic_components.h" +#include "resource_manager/image.h" + + +#if _TARGET_XBOX +#if _TARGET_SCARLETT +#include +#else +#include +#endif +#include +inline void intrusive_ptr_add_ref(XGTextureAddressComputer *ptr) { ptr->AddRef(); } + +inline void intrusive_ptr_release(XGTextureAddressComputer *ptr) { ptr->Release(); } +#endif + + +namespace drv3d_dx12 +{ + +inline uint64_t calculate_texture_alignment(uint64_t width, uint32_t height, uint32_t depth, uint32_t samples, + D3D12_TEXTURE_LAYOUT layout, D3D12_RESOURCE_FLAGS flags, 
drv3d_dx12::FormatStore format) +{ + if (D3D12_TEXTURE_LAYOUT_UNKNOWN != layout) + { + if (samples > 1) + { + return D3D12_DEFAULT_MSAA_RESOURCE_PLACEMENT_ALIGNMENT; + } + else + { + return D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT; + } + } + + if ((1 == samples) && ((D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET | D3D12_RESOURCE_FLAG_ALLOW_DEPTH_STENCIL) & flags)) + { + return D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT; + } + + uint32_t blockSizeX = 1, blockSizeY = 1; + auto bytesPerBlock = format.getBytesPerPixelBlock(&blockSizeX, &blockSizeY); + const uint32_t textureWidthInBlocks = (width + blockSizeX - 1) / blockSizeX; + const uint32_t textureHeightInBlocks = (height + blockSizeY - 1) / blockSizeY; + + const uint32_t TILE_MEM_SIZE = 4 * 1024; + const uint32_t blocksInTile = TILE_MEM_SIZE / bytesPerBlock; + // MSDN documentation says about "near-equilateral" size for the tile + const uint32_t blocksInTileX = get_bigger_pow2(sqrt(blocksInTile)); + const uint32_t blocksInTileY = get_bigger_pow2(blocksInTile / blocksInTileX); + const uint32_t MAX_TILES_COUNT_FOR_SMALL_RES = 16; + const uint32_t tilesCount = ((textureWidthInBlocks + blocksInTileX - 1) / blocksInTileX) * + ((textureHeightInBlocks + blocksInTileY - 1) / blocksInTileY) * depth; + // This check is neccessary according to debug layer and dx12 documentation: + // https://docs.microsoft.com/en-us/windows/win32/api/d3d12/ns-d3d12-d3d12_resource_desc#alignment + const bool smallAmountOfTiles = tilesCount <= MAX_TILES_COUNT_FOR_SMALL_RES; + + if (samples > 1) + { + if (smallAmountOfTiles) + { + return D3D12_SMALL_MSAA_RESOURCE_PLACEMENT_ALIGNMENT; + } + else + { + return D3D12_DEFAULT_MSAA_RESOURCE_PLACEMENT_ALIGNMENT; + } + } + else + { + if (smallAmountOfTiles) + { + return D3D12_SMALL_RESOURCE_PLACEMENT_ALIGNMENT; + } + else + { + return D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT; + } + } +} + +// size is not important here +struct ImageInfo +{ + D3D12_RESOURCE_DIMENSION type = 
D3D12_RESOURCE_DIMENSION_UNKNOWN; + D3D12_RESOURCE_FLAGS usage = D3D12_RESOURCE_FLAG_NONE; + Extent3D size = {}; + ArrayLayerCount arrays; + MipMapCount mips; + FormatStore format; + D3D12_TEXTURE_LAYOUT memoryLayout = D3D12_TEXTURE_LAYOUT_UNKNOWN; + DeviceMemoryClass memoryClass = DeviceMemoryClass::INVALID; + bool allocateSubresourceIDs = false; + DXGI_SAMPLE_DESC sampleDesc = {1, 0}; + + SubresourceCount getSubResourceCount() const { return SubresourcePerFormatPlaneCount::make(mips, arrays) * format.getPlanes(); } + + D3D12_RESOURCE_DESC asDesc() const + { + D3D12_RESOURCE_DESC desc; + desc.SampleDesc = sampleDesc; + desc.Layout = memoryLayout; + desc.Flags = usage; + desc.Format = format.asDxGiTextureCreateFormat(); + desc.Dimension = type; + desc.Width = size.width; + desc.MipLevels = mips.count(); + desc.Alignment = calculate_texture_alignment(size.width, size.height, size.depth, 1, memoryLayout, usage, format); + switch (type) + { + default: + case D3D12_RESOURCE_DIMENSION_UNKNOWN: + case D3D12_RESOURCE_DIMENSION_BUFFER: fatal("DX12: Invalid texture dimension"); return desc; + case D3D12_RESOURCE_DIMENSION_TEXTURE1D: + desc.Height = 1; + desc.DepthOrArraySize = arrays.count(); + break; + case D3D12_RESOURCE_DIMENSION_TEXTURE2D: + desc.Height = size.height; + desc.DepthOrArraySize = arrays.count(); + break; + case D3D12_RESOURCE_DIMENSION_TEXTURE3D: + desc.Height = size.height; + desc.DepthOrArraySize = size.depth; + break; + } + return desc; + } +}; + +struct ImageCreateResult +{ + Image *image; + D3D12_RESOURCE_STATES state; +}; +namespace resource_manager +{ +class BufferObjectProvider : public GlobalSubresourceIdProvider +{ + using BaseType = GlobalSubresourceIdProvider; + +protected: + OSSpinlock bufferPoolGuard; + ObjectPool bufferPool; + + void shutdown() + { + OSSpinlockScopedLock lock{bufferPoolGuard}; + auto sz = bufferPool.size(); + G_ASSERTF(0 == sz, "DX12: Shutdown without destroying all buffers, there are still %u buffers alive!", sz); +#if 
DAGOR_DBGLEVEL > 0 + bufferPool.iterateAllocated([](auto buffer) { G_ASSERTF(false, "DX12: Buffer <%s> still alive!", buffer->getBufName()); }); +#endif + G_UNUSED(sz); + bufferPool.freeAll(); + + BaseType::shutdown(); + } + +public: + template + GenericBufferInterface *newBufferObject(Args &&...args) + { + void *memory = nullptr; + { + OSSpinlockScopedLock lock{bufferPoolGuard}; + memory = bufferPool.acquire(); + } + // The constructor may kick off additional buffer allocations or frees so it can not be done + // under the locked bufferPoolGuard + auto buffer = ::new (memory) GenericBufferInterface(eastl::forward(args)...); + if (!buffer->getDeviceBuffer() && !buffer->isStreamBuffer()) + { + buffer->destroy(); + buffer = nullptr; + } + return buffer; + } + + void deleteBufferObject(GenericBufferInterface *buffer) + { + // Have to destruct here to prevent possible recursive locking. + buffer->~GenericBufferInterface(); + OSSpinlockScopedLock lock{bufferPoolGuard}; + bufferPool.release(buffer); + } + + template + void visitBufferObjects(T clb) + { + OSSpinlockScopedLock lock{bufferPoolGuard}; + bufferPool.iterateAllocated(clb); + } + + void reserveBufferObjects(size_t size) + { + OSSpinlockScopedLock lock{bufferPoolGuard}; + bufferPool.reserve(size); + } + + size_t getBufferObjectCapacity() + { + OSSpinlockScopedLock lock{bufferPoolGuard}; + return bufferPool.capacity(); + } + + size_t getActiveBufferObjectCount() + { + OSSpinlockScopedLock lock{bufferPoolGuard}; + return bufferPool.size(); + } +}; + +class TextureObjectProvider : public BufferObjectProvider +{ + using BaseType = BufferObjectProvider; + +protected: + struct PendingForCompletedFrameData : BaseType::PendingForCompletedFrameData + { + eastl::vector freedTextureObjects; + }; + + ContainerMutexWrapper, OSSpinlock> texturePool; + + void shutdown() + { + auto poolAccess = texturePool.access(); + auto sz = poolAccess->size(); + G_ASSERTF(0 == sz, "DX12: Shutdown without destroying all textures, there are 
still %u textures alive!", sz); +#if DAGOR_DBGLEVEL > 0 + poolAccess->iterateAllocated([](auto tex) { G_ASSERTF(false, "DX12: Texture <%s> still alive!", tex->getResName()); }); +#endif + G_UNUSED(sz); + poolAccess->freeAll(); + + BaseType::shutdown(); + } + + void completeFrameExecution(const CompletedFrameExecutionInfo &info, PendingForCompletedFrameData &data) + { + if (!data.freedTextureObjects.empty()) + { + eastl::sort(begin(data.freedTextureObjects), end(data.freedTextureObjects)); + texturePool.access()->free(begin(data.freedTextureObjects), end(data.freedTextureObjects)); + data.freedTextureObjects.clear(); + } + BaseType::completeFrameExecution(info, data); + } + +public: + template + TextureInterfaceBase *newTextureObject(Args &&...args) + { + return texturePool.access()->allocate(eastl::forward(args)...); + } + + template + void visitTextureObjects(T clb) + { + texturePool.access()->iterateAllocated(clb); + } + + void deleteTextureObjectOnFrameCompletion(TextureInterfaceBase *texture) + { + accessRecodingPendingFrameCompletion( + [=](auto &data) { data.freedTextureObjects.push_back(texture); }); + } + + void reserveTextureObjects(size_t count) { texturePool.access()->reserve(count); } + + size_t getTextureObjectCapacity() { return texturePool.access()->capacity(); } + + size_t getActiveTextureObjectCount() { return texturePool.access()->size(); } +}; + +class ImageObjectProvider : public TextureObjectProvider +{ + using BaseType = TextureObjectProvider; + +protected: + ContainerMutexWrapper, OSSpinlock> imageObjectPool; + + ImageObjectProvider() = default; + ~ImageObjectProvider() = default; + ImageObjectProvider(const ImageObjectProvider &) = delete; + ImageObjectProvider &operator=(const ImageObjectProvider &) = delete; + ImageObjectProvider(ImageObjectProvider &&) = delete; + ImageObjectProvider &operator=(ImageObjectProvider &&) = delete; + + void shutdown() + { + imageObjectPool.access()->freeAll(); + BaseType::shutdown(); + } + + void 
preRecovery() + { + imageObjectPool.access()->freeAll(); + BaseType::preRecovery(); + } + + template + Image *newImageObject(Args &&...args) + { + return imageObjectPool.access()->allocate(eastl::forward(args)...); + } + + void deleteImageObject(Image *image) { imageObjectPool.access()->free(image); } + + void deleteImageObjects(eastl::span images) + { + auto imageObjectPoolAccess = imageObjectPool.access(); + for (auto image : images) + { + imageObjectPoolAccess->free(image); + } + } + +public: + template + void visitImageObjects(T &&clb) + { + imageObjectPool.access()->iterateAllocated(eastl::forward(clb)); + } + + bool isImageAlive(Image *img) { return imageObjectPool.access()->isAllocated(img); } +}; + +} // namespace resource_manager +} // namespace drv3d_dx12 diff --git a/prog/engine/drv/drv3d_DX12/resource_manager/raytrace_acceleration_structure.h b/prog/engine/drv/drv3d_DX12/resource_manager/raytrace_acceleration_structure.h new file mode 100644 index 000000000..aa2d24e60 --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/resource_manager/raytrace_acceleration_structure.h @@ -0,0 +1,31 @@ +#pragma once + +#include <3d/rayTrace/dag_drvRayTrace.h> + +#include "driver.h" +#include "resource_memory.h" + +#include "resource_manager/basic_buffer.h" + + +namespace drv3d_dx12 +{ + +#if D3D_HAS_RAY_TRACING +struct RaytraceAccelerationStructure : protected resource_manager::BasicBuffer +{ + D3D12_CPU_DESCRIPTOR_HANDLE handle{}; + + using resource_manager::BasicBuffer::create; + using resource_manager::BasicBuffer::getGPUPointer; + using resource_manager::BasicBuffer::reset; + + ID3D12Resource *getResourceHandle() { return buffer.Get(); } + + size_t size() const { return bufferMemory.size(); } + + ResourceMemory getMemory() const { return bufferMemory; } +}; +#endif + +} // namespace drv3d_dx12 diff --git a/prog/engine/drv/drv3d_DX12/resource_memory_heap_rtx_components.h b/prog/engine/drv/drv3d_DX12/resource_manager/rtx_components.h similarity index 94% rename from 
prog/engine/drv/drv3d_DX12/resource_memory_heap_rtx_components.h rename to prog/engine/drv/drv3d_DX12/resource_manager/rtx_components.h index d53d01f13..943602e33 100644 --- a/prog/engine/drv/drv3d_DX12/resource_memory_heap_rtx_components.h +++ b/prog/engine/drv/drv3d_DX12/resource_manager/rtx_components.h @@ -1,25 +1,19 @@ #pragma once +#include +#include <3d/rayTrace/dag_drvRayTrace.h> +#include +#include -namespace drv3d_dx12 -{ - -#if D3D_HAS_RAY_TRACING -struct RaytraceAccelerationStructure : protected resource_manager::BasicBuffer -{ - D3D12_CPU_DESCRIPTOR_HANDLE handle{}; +#include "driver.h" +#include "container_mutex_wrapper.h" - using resource_manager::BasicBuffer::create; - using resource_manager::BasicBuffer::getGPUPointer; - using resource_manager::BasicBuffer::reset; +#include "resource_manager/raytrace_acceleration_structure.h" +#include "resource_manager/buffer_components.h" - ID3D12Resource *getResourceHandle() { return buffer.Get(); } - size_t size() const { return bufferMemory.size(); } - - ResourceMemory getMemory() const { return bufferMemory; } -}; -#endif +namespace drv3d_dx12 +{ namespace resource_manager { diff --git a/prog/engine/drv/drv3d_DX12/resource_memory.h b/prog/engine/drv/drv3d_DX12/resource_memory.h new file mode 100644 index 000000000..ccdba9782 --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/resource_memory.h @@ -0,0 +1,159 @@ +#pragma once + +#include + +#include "driver.h" +#include "value_range.h" + + +namespace drv3d_dx12 +{ + +union HeapID +{ + using ValueType = uint32_t; + static constexpr uint32_t alias_bits = 1; + static constexpr uint32_t group_bits = 4; + static constexpr uint32_t index_bits = (8 * sizeof(ValueType)) - group_bits - alias_bits; + + ValueType raw = 0; + struct + { + ValueType isAlias : alias_bits; + ValueType group : group_bits; + ValueType index : index_bits; + }; +}; + +#if _TARGET_XBOX +struct VirtualFreeCaller +{ + void operator()(void *pointer) { VirtualFree(pointer, 0, MEM_RELEASE); } +}; + +class 
ResourceMemory +{ + uint8_t *heap = nullptr; + uint64_t sz = 0; + HeapID heapID; + +public: + ResourceMemory() = default; + ~ResourceMemory() = default; + + ResourceMemory(const ResourceMemory &) = default; + ResourceMemory &operator=(const ResourceMemory &) = default; + + ResourceMemory(ResourceMemory &&) = default; + ResourceMemory &operator=(ResourceMemory &&) = default; + + ResourceMemory(uint8_t *h, uint64_t s, HeapID heap_id) : heap{h}, sz{s}, heapID{heap_id} {} + + explicit operator bool() const { return heap != nullptr; } + + uint64_t size() const { return sz; } + + uintptr_t getAddress() const { return reinterpret_cast(heap); } + + uint8_t *asPointer() const { return heap; } + + ResourceMemory subRange(uint64_t offset, uint64_t o_size) const + { + G_ASSERT(offset + o_size <= size()); + return {heap + offset, o_size, heapID}; + } + + ResourceMemory aliasSubRange(uint32_t new_index, uint64_t offset, uint64_t o_size) const + { + G_ASSERT(offset + o_size <= size()); + HeapID newHeapID = heapID; + newHeapID.isAlias = 1; + newHeapID.index = new_index; + return {heap + offset, o_size, newHeapID}; + } + + bool isSubRangeOf(const ResourceMemory &mem) const + { + // NOTE: this can not check heapID as aliasing may change the heap id (from a real heap to a aliasing heap). 
+ return make_value_range(heap, size()).isSubRangeOf(make_value_range(mem.heap, mem.size())); + } + + bool intersectsWith(const ResourceMemory &mem) const + { + return make_value_range(heap, size()).overlaps(make_value_range(mem.heap, mem.size())); + } + + uint64_t calculateOffset(const ResourceMemory &sub) const { return sub.heap - heap; } + + HeapID getHeapID() const { return heapID; } +}; +#else +class ResourceMemory +{ + ID3D12Heap *heap = nullptr; + ValueRange range; + HeapID heapID; + +public: + ResourceMemory() = default; + ~ResourceMemory() = default; + + ResourceMemory(const ResourceMemory &) = default; + ResourceMemory &operator=(const ResourceMemory &) = default; + + ResourceMemory(ResourceMemory &&) = default; + ResourceMemory &operator=(ResourceMemory &&) = default; + + ResourceMemory(ID3D12Heap *h, ValueRange r, HeapID heap_id) : heap{h}, range{r}, heapID{heap_id} {} + + ID3D12Heap *getHeap() const { return heap; } + + ValueRange getRange() const { return range; } + + explicit operator bool() const { return heap != nullptr; } + + uint64_t size() const { return range.size(); } + + uintptr_t getOffset() const { return range.front(); } + + ResourceMemory subRange(uint64_t offset, uint64_t o_size) const + { + G_ASSERT(offset + o_size <= range.size()); + ResourceMemory r; + r.heap = heap; + r.range = make_value_range(getOffset() + offset, o_size); + r.heapID = heapID; + return r; + } + + ResourceMemory aliasSubRange(uint32_t new_index, uint64_t offset, uint64_t o_size) const + { + G_ASSERT(offset + o_size <= range.size()); + ResourceMemory r; + r.heap = heap; + r.range = make_value_range(getOffset() + offset, o_size); + r.heapID = heapID; + r.heapID.isAlias = 1; + r.heapID.index = new_index; + return r; + } + + bool isSubRangeOf(const ResourceMemory &mem) const + { + // NOTE: this can not check heapID as aliasing may change the heap id (from a real heap to a aliasing heap). 
+ if (mem.heap != heap) + { + return false; + } + return range.isSubRangeOf(mem.range); + } + + bool intersectsWith(const ResourceMemory &mem) const { return (heap == mem.heap) && range.overlaps(mem.range); } + + uint64_t calculateOffset(const ResourceMemory &sub) const { return sub.range.start - range.start; } + + HeapID getHeapID() const { return heapID; } +}; +#endif + +} // namespace drv3d_dx12 diff --git a/prog/engine/drv/drv3d_DX12/resource_memory_heap.cpp b/prog/engine/drv/drv3d_DX12/resource_memory_heap.cpp index d0de8200f..146a0a8ea 100644 --- a/prog/engine/drv/drv3d_DX12/resource_memory_heap.cpp +++ b/prog/engine/drv/drv3d_DX12/resource_memory_heap.cpp @@ -67,7 +67,7 @@ ImageCreateResult TextureImageFactory::createTexture(DXGIAdapter *adapter, ID3D1 : ImageGlobalSubresouceId::make_invalid(); result.image = newImageObject(ResourceMemory{}, eastl::move(texture), ii.type, ii.memoryLayout, ii.format, ii.size, ii.mips, - ii.arrays, subResIdBase); + ii.arrays, subResIdBase, ii.sampleDesc.Count > 1); recordTextureAllocated(result.image->getMipLevelRange(), result.image->getArrayLayers(), result.image->getBaseExtent(), result.image->getMemory().size(), result.image->getFormat(), name); @@ -106,8 +106,8 @@ ImageCreateResult TextureImageFactory::createTexture(DXGIAdapter *adapter, ID3D1 ? 
allocateGlobalResourceIdRange(ii.getSubResourceCount()) : ImageGlobalSubresouceId::make_invalid(); - result.image = - newImageObject(memory, eastl::move(texture), ii.type, ii.memoryLayout, ii.format, ii.size, ii.mips, ii.arrays, subResIdBase); + result.image = newImageObject(memory, eastl::move(texture), ii.type, ii.memoryLayout, ii.format, ii.size, ii.mips, ii.arrays, + subResIdBase, ii.sampleDesc.Count > 1); updateMemoryRangeUse(memory, result.image); recordTextureAllocated(result.image->getMipLevelRange(), result.image->getArrayLayers(), result.image->getBaseExtent(), @@ -133,7 +133,7 @@ ImageCreateResult TextureImageFactory::createTexture(DXGIAdapter *adapter, ID3D1 (ii.allocateSubresourceIDs) ? allocateGlobalResourceIdRange(ii.getSubResourceCount()) : ImageGlobalSubresouceId::make_invalid(); result.image = newImageObject(ResourceMemory{}, eastl::move(texture), ii.type, ii.memoryLayout, ii.format, ii.size, ii.mips, - ii.arrays, subResIdBase); + ii.arrays, subResIdBase, ii.sampleDesc.Count > 1); } result.image->setGPUChangeable( @@ -158,7 +158,7 @@ Image *TextureImageFactory::adoptTexture(ID3D12Resource *texture, const char *na auto subResIdBase = allocateGlobalResourceIdRange(idCount); auto result = newImageObject(ResourceMemory{}, eastl::move(texture_ref), D3D12_RESOURCE_DIMENSION_TEXTURE2D, desc.Layout, dagorFormat, Extent3D{uint32_t(desc.Width), uint32_t(desc.Height), 1}, MipMapCount::make(desc.MipLevels), - ArrayLayerCount::make(desc.DepthOrArraySize), subResIdBase); + ArrayLayerCount::make(desc.DepthOrArraySize), subResIdBase, false); recordTextureAdopted(result->getMipLevelRange(), result->getArrayLayers(), result->getBaseExtent(), result->getFormat(), name); @@ -589,7 +589,7 @@ ImageCreateResult AliasHeapProvider::placeTextureInHeap(ID3D12Device *device, :: auto subResIdBase = allocateGlobalResourceIdRange(subResCount); result.image = newImageObject(memory, eastl::move(texture), dxDesc.Dimension, dxDesc.Layout, fmt, ext, mipMapCount, arrayLayerCount, - 
subResIdBase); + subResIdBase, dxDesc.SampleDesc.Count > 1); recordTexturePlacedInUserResourceHeap(result.image->getMipLevelRange(), result.image->getArrayLayers(), result.image->getBaseExtent(), result.image->getMemory().size(), result.image->getFormat(), name); @@ -607,8 +607,6 @@ ResourceHeapGroupProperties AliasHeapProvider::getResourceHeapGroupProperties(:: { ResourceHeapGroupProperties result; result.flags = 0; - // currently limited to 32 bits because of internal structures - constexpr uint64_t software_limit = 0x7FFF0000; if (isUMASystem()) { @@ -635,7 +633,7 @@ ResourceHeapGroupProperties AliasHeapProvider::getResourceHeapGroupProperties(:: // On PC there is currently no HW where we could access and control usage of on chip memory result.isOnChip = false; - result.maxResourceSize = result.maxHeapSize = min(result.maxHeapSize, software_limit); + result.maxResourceSize = result.maxHeapSize; return result; } #else @@ -645,8 +643,6 @@ ResourceHeapGroupProperties AliasHeapProvider::getResourceHeapGroupProperties(:: ResourceHeapGroupProperties result; result.flags = 0; - // currently limited to 32 bits because of internal structures - constexpr uint64_t software_limit = 0x7FFF0000; result.isCPUVisible = true; result.isGPULocal = true; @@ -664,7 +660,7 @@ ResourceHeapGroupProperties AliasHeapProvider::getResourceHeapGroupProperties(:: result.isOnChip = false; size_t gameLimit = 0, gameUsed = 0; xbox_get_memory_status(gameUsed, gameLimit); - result.maxResourceSize = result.maxHeapSize = min(gameLimit, software_limit); + result.maxResourceSize = result.maxHeapSize = gameLimit; } return result; @@ -926,8 +922,8 @@ ImageCreateResult AliasHeapProvider::aliasTexture(ID3D12Device *device, const Im } auto subResIdBase = allocateGlobalResourceIdRange(ii.getSubResourceCount()); - result.image = - newImageObject(memory, eastl::move(texture), ii.type, ii.memoryLayout, ii.format, ii.size, ii.mips, ii.arrays, subResIdBase); + result.image = newImageObject(memory, 
eastl::move(texture), ii.type, ii.memoryLayout, ii.format, ii.size, ii.mips, ii.arrays, + subResIdBase, ii.sampleDesc.Count > 1); heap.images.push_back(result.image); } @@ -1107,7 +1103,7 @@ void begin_selectable_row(const char *text) ImGui::Selectable(text, false, ImGuiSelectableFlags_SpanAllColumns | ImGuiSelectableFlags_AllowItemOverlap); } -void draw_segment(void *base, ValueRange range, uint32_t max, const char *text) +void draw_segment(void *base, ValueRange range, uint64_t max, const char *text) { ByteUnits size = range.size(); char strBuf[MAX_OBJECT_NAME_LENGTH + 64]; @@ -1115,15 +1111,15 @@ void draw_segment(void *base, ValueRange range, uint32_t max, const ch sprintf_s(strBuf, "%s###%p+%u", text, base, range.front()); begin_selectable_row(strBuf); ImGui::TableSetColumnIndex(2); - ImGui::Text("%08X", range.front()); + ImGui::Text("%016llX", range.front()); ImGui::TableSetColumnIndex(3); - ImGui::Text("%08X", range.back() + 1); + ImGui::Text("%016llX", range.back() + 1); ImGui::TableSetColumnIndex(6); ImGui::Text("%.2f %s", size.units(), size.name()); G_UNUSED(max); } -void draw_segment(void *base, uint32_t from, uint32_t to, uint32_t max, const char *text) +void draw_segment(void *base, uint64_t from, uint64_t to, uint64_t max, const char *text) { ByteUnits size = to - from; char strBuf[MAX_OBJECT_NAME_LENGTH + 64]; @@ -1131,9 +1127,9 @@ void draw_segment(void *base, uint32_t from, uint32_t to, uint32_t max, const ch sprintf_s(strBuf, "%s###%p+%u", text, base, from); begin_selectable_row(strBuf); ImGui::TableSetColumnIndex(2); - ImGui::Text("%08X", from); + ImGui::Text("%016llX", from); ImGui::TableSetColumnIndex(3); - ImGui::Text("%08X", to); + ImGui::Text("%016llX", to); ImGui::TableSetColumnIndex(6); ImGui::Text("%.2f %s", size.units(), size.name()); G_UNUSED(max); @@ -1775,7 +1771,7 @@ MetricsVisualizer::GraphDisplayInfo MetricsVisualizer::drawGraphViewControls(Gra GraphDisplayInfo result = getGraphDisplayInfo(graph); auto m = static_cast(result.mode); 
- ImGui::Combo("Graph Mode", &m, graph_mode_name_table, array_size(graph_mode_name_table)); + ImGui::Combo("Graph Mode", &m, graph_mode_name_table, countof(graph_mode_name_table)); result.mode = static_cast(m); ImGui::SliderScalar("Window size", ImGuiDataType_U64, &result.windowSize, &min_graph_window_size, &max_graph_window_size, nullptr, ImGuiSliderFlags_AlwaysClamp); @@ -1894,7 +1890,7 @@ MetricsVisualizer::PlotData MetricsVisualizer::setupPlotXRange(ConcurrentMetrics void MetricsVisualizer::drawMetricsCaptureControls() { constexpr int max_selectors_per_row = 5; - constexpr int child_element_count = ((array_size(metric_name_table) + max_selectors_per_row - 1) / max_selectors_per_row); + constexpr int child_element_count = ((countof(metric_name_table) + max_selectors_per_row - 1) / max_selectors_per_row); int child_height = (child_element_count + 2) * ImGui::GetFrameHeightWithSpacing(); if (begin_sub_section("DX12-Live-Metrics-Capture-Controls", "Capture metrics", child_height)) @@ -1967,7 +1963,7 @@ void MetricsVisualizer::drawMetricsEvnetsViewFilterControls() constexpr int non_filter_metrics = 3; constexpr int max_selectors_per_row = 5; constexpr int child_element_count = - ((array_size(metric_name_table) - non_filter_metrics + max_selectors_per_row - 1) / max_selectors_per_row); + ((countof(metric_name_table) - non_filter_metrics + max_selectors_per_row - 1) / max_selectors_per_row); int child_height = (child_element_count + 6) * ImGui::GetFrameHeightWithSpacing(); if (begin_sub_section("DX12-Live-Metrics-Event-Filter-Controls", "Events view filters", child_height)) @@ -3793,9 +3789,9 @@ void DebugView::drawUserHeapsTable() auto mem = image->getMemory(); auto offset = heap.memory.calculateOffset(mem); ImGui::TableSetColumnIndex(2); - ImGui::Text("%08X", offset); + ImGui::Text("%016llX", offset); ImGui::TableSetColumnIndex(3); - ImGui::Text("%08X", offset + mem.size()); + ImGui::Text("%016llX", offset + mem.size()); ImGui::TableSetColumnIndex(4); auto sizeUnits 
= size_to_unit_table(mem.size()); ImGui::Text("%.f %s", compute_unit_type_size(mem.size(), sizeUnits), get_unit_name(sizeUnits)); @@ -3812,9 +3808,9 @@ void DebugView::drawUserHeapsTable() auto &mem = buffer.bufferMemory; auto offset = heap.memory.calculateOffset(mem); ImGui::TableSetColumnIndex(2); - ImGui::Text("%08X", offset); + ImGui::Text("%016llX", offset); ImGui::TableSetColumnIndex(3); - ImGui::Text("%08X", offset + mem.size()); + ImGui::Text("%016llX", offset + mem.size()); ImGui::TableSetColumnIndex(4); auto sizeUnits = size_to_unit_table(mem.size()); ImGui::Text("%.f %s", compute_unit_type_size(mem.size(), sizeUnits), get_unit_name(sizeUnits)); @@ -4565,7 +4561,7 @@ void DebugView::drawHeapsTable() // have to get access to buffer heaps before to heaps as otherwise we have a ordering issue and deadlock. auto bufferHeapStateAccess = bufferHeapState.access(); OSSpinlockScopedLock lock{heapGroupMutex}; - for (properties.raw = 0; properties.raw < array_size(groups); ++properties.raw) + for (properties.raw = 0; properties.raw < countof(groups); ++properties.raw) { auto &group = groups[properties.raw]; FragmentationCalculatorContext overalFragmentation; diff --git a/prog/engine/drv/drv3d_DX12/resource_memory_heap.h b/prog/engine/drv/drv3d_DX12/resource_memory_heap.h index 74caa32bf..79a8161cb 100644 --- a/prog/engine/drv/drv3d_DX12/resource_memory_heap.h +++ b/prog/engine/drv/drv3d_DX12/resource_memory_heap.h @@ -1,5 +1,6 @@ #pragma once +#include "value_range.h" #include "free_list_utils.h" #if _TARGET_XBOX @@ -12,26 +13,13 @@ #define HEAP_LOG(...) 
#endif -#include "resource_memory_heap_basic_components.h" -#include "resource_memory_heap_object_components.h" -#include "resource_memory_heap_descriptor_components.h" -#include "resource_memory_heap_heap_components.h" - -#if DX12_USE_ESRAM -#include "resource_memory_heap_esram_components_xbox.h" -#else -namespace drv3d_dx12 -{ -namespace resource_manager -{ -using ESRamPageMappingProvider = ResourceMemoryHeapProvider; -} // namespace resource_manager -} // namespace drv3d_dx12 -#endif - -#include "resource_memory_heap_host_shared_components.h" -#include "resource_memory_heap_buffer_components.h" -#include "resource_memory_heap_rtx_components.h" +#include "resource_manager/object_components.h" +#include "resource_manager/descriptor_components.h" +#include "resource_manager/heap_components.h" +#include "resource_manager/esram_components.h" +#include "resource_manager/host_shared_components.h" +#include "resource_manager/buffer_components.h" +#include "resource_manager/rtx_components.h" namespace drv3d_dx12 @@ -478,7 +466,7 @@ class FrameFinalizer : public SamplerDescriptorProvider #else G_UNUSED(adapter); #endif - for (; info.historyIndex < array_size(finalizerData); ++info.historyIndex) + for (; info.historyIndex < countof(finalizerData); ++info.historyIndex) { BaseType::completeFrameExecution(info, finalizerData[info.historyIndex]); } @@ -499,7 +487,7 @@ class FrameFinalizer : public SamplerDescriptorProvider #else G_UNUSED(adapter); #endif - for (; info.historyIndex < array_size(finalizerData); ++info.historyIndex) + for (; info.historyIndex < countof(finalizerData); ++info.historyIndex) { BaseType::completeFrameExecution(info, finalizerData[info.historyIndex]); } @@ -564,7 +552,7 @@ class DebugViewBase : public FrameFinalizer char *getEventObjectNameFilterBasePointer() { return metricsVisualizerState.eventObjectNameFilter; } - size_t getEventObjectNameFilterMaxLength() { return array_size(metricsVisualizerState.eventObjectNameFilter); } + size_t 
getEventObjectNameFilterMaxLength() { return countof(metricsVisualizerState.eventObjectNameFilter); } bool checkStatusFlag(StatusFlag flag) const { return metricsVisualizerState.statusFlags.test(flag); } @@ -588,7 +576,7 @@ class DebugViewBase : public FrameFinalizer template void iterateGraphDisplayInfos(T clb) { - for (size_t i = 0; i < array_size(metricsVisualizerState.graphDisplayInfos); ++i) + for (size_t i = 0; i < countof(metricsVisualizerState.graphDisplayInfos); ++i) { clb(static_cast(i), metricsVisualizerState.graphDisplayInfos[i]); } diff --git a/prog/engine/drv/drv3d_DX12/resource_state_tracker.h b/prog/engine/drv/drv3d_DX12/resource_state_tracker.h index c14f679d4..8d88bb368 100644 --- a/prog/engine/drv/drv3d_DX12/resource_state_tracker.h +++ b/prog/engine/drv/drv3d_DX12/resource_state_tracker.h @@ -3,6 +3,20 @@ #include #include #include +#include +#include <3d/dag_drv3dConsts.h> + +#include "driver.h" +#include "constants.h" +#include "typed_bit_set.h" +#include "d3d12_utils.h" +#include "format_store.h" +#include "pipeline.h" +#include "stateful_command_buffer.h" +#include "image_global_subresource_id.h" + +#include "resource_manager/image.h" + namespace dag { @@ -18,7 +32,7 @@ DAG_DECLARE_RELOCATABLE(D3D12_RESOURCE_BARRIER); namespace drv3d_dx12 { // Meta stage, stage values of this indicate that any stage be meant (used for resource activation) -static constexpr uint32_t STAGE_ANY = ~uint32_t(0); +inline constexpr uint32_t STAGE_ANY = ~uint32_t(0); #if DX12_REPORT_TRANSITION_INFO #define REPORT debug @@ -556,7 +570,7 @@ inline char *make_resource_barrier_string_from_state(char *str, size_t len, D3D1 buf[ofs++] = '|'; buf[ofs++] = ' '; } - auto ln = array_size(src); + auto ln = countof(src); auto space = ln - 1; if ((len - ofs - 1) < space) return ofs; @@ -2740,6 +2754,21 @@ class ResourceUsageManager : protected ResourceStateTracker texture->getPlaneCount(), D3D12_RESOURCE_STATES_STATIC_TEXTURE_READ_STATE); } + // now only resolving one 
subresource + void useTextureAsResolveSource(BarrierBatcher &barriers, SplitTransitionTracker &stt, Image *texture) + { + G_ASSERT(texture->hasTrackedState()); + transitionTexture(barriers, stt, texture, texture->getGlobalSubResourceIdBase(), SubresourceIndex::make(0), 1, + texture->getSubresourcesPerPlane(), texture->getPlaneCount(), D3D12_RESOURCE_STATE_RESOLVE_SOURCE); + } + + void useTextureAsResolveDestination(BarrierBatcher &barriers, SplitTransitionTracker &stt, Image *texture) + { + G_ASSERT(texture->hasTrackedState()); + transitionTexture(barriers, stt, texture, texture->getGlobalSubResourceIdBase(), SubresourceIndex::make(0), 1, + texture->getSubresourcesPerPlane(), texture->getPlaneCount(), D3D12_RESOURCE_STATE_RESOLVE_DEST); + } + void useTextureAsBlitSource(BarrierBatcher &barriers, SplitTransitionTracker &stt, Image *texture, ImageViewState view) { if (!texture->hasTrackedState()) @@ -4349,6 +4378,17 @@ class ResourceUsageManagerWithHistory : protected ResourceUsageManager BaseType::finishUseTextureAsCopyDestination(barriers, stt, texture, sub_res); } + // TODO: Similar to present (should we track? Its a driver internal thing...) 
+ void useTextureAsResolveSource(BarrierBatcher &barriers, SplitTransitionTracker &stt, Image *texture) + { + BaseType::useTextureAsResolveSource(barriers, stt, texture); + } + + void useTextureAsResolveDestination(BarrierBatcher &barriers, SplitTransitionTracker &stt, Image *texture) + { + BaseType::useTextureAsResolveDestination(barriers, stt, texture); + } + void useTextureAsBlitSource(BarrierBatcher &barriers, SplitTransitionTracker &stt, Image *texture, ImageViewState view) { if (!texture->hasTrackedState()) diff --git a/prog/engine/drv/drv3d_DX12/resource_usage_debugger.cpp b/prog/engine/drv/drv3d_DX12/resource_usage_debugger.cpp index 33925b312..7531ec63b 100644 --- a/prog/engine/drv/drv3d_DX12/resource_usage_debugger.cpp +++ b/prog/engine/drv/drv3d_DX12/resource_usage_debugger.cpp @@ -89,7 +89,7 @@ char *translate_to_string(char (&buf)[N], D3D12_RESOURCE_STATES state) auto start = buf; auto ed = buf + N - 1; auto concat = [&at, ed, start](const auto &s) { - auto len = array_size(s) - 1; + auto len = countof(s) - 1; auto left = ed - at; if (left >= 3 && start != at) { @@ -172,7 +172,7 @@ char *translate_to_string(char (&buf)[N], ResourceBarrier barrier) auto start = buf; auto ed = buf + N - 1; auto concat = [&at, ed, start](const auto &s) { - auto len = array_size(s) - 1; + auto len = countof(s) - 1; auto left = ed - at; if (left >= 3 && start != at) { diff --git a/prog/engine/drv/drv3d_DX12/sampler_state.h b/prog/engine/drv/drv3d_DX12/sampler_state.h new file mode 100644 index 000000000..74a85d943 --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/sampler_state.h @@ -0,0 +1,206 @@ +#pragma once + +#include <3d/dag_sampler.h> + +#include "bitfield.h" +#include "half_float.h" +#include "d3d12_d3d_translation.h" + + +namespace drv3d_dx12 +{ + +BEGIN_BITFIELD_TYPE(SamplerState, uint32_t) + enum + { + BIAS_BITS = 16, + BIAS_OFFSET = 0, + MIP_BITS = 1, + MIP_SHIFT = BIAS_OFFSET + BIAS_BITS, + FILTER_BITS = 1, + FILTER_SHIFT = MIP_SHIFT + MIP_BITS, + // Instead of 
using N bits per coord, we store all coords in one value, this safes 2 bits + COORD_VALUE_COUNT = (D3D12_TEXTURE_ADDRESS_MODE_MIRROR_ONCE - D3D12_TEXTURE_ADDRESS_MODE_WRAP) + 1, + COORD_MAX_VALUE = COORD_VALUE_COUNT * COORD_VALUE_COUNT * COORD_VALUE_COUNT, + COORD_BITS = BitsNeeded::VALUE, + COORD_SHIFT = FILTER_SHIFT + FILTER_BITS, + ANISO_BITS = 3, + ANISO_SHIFT = COORD_SHIFT + COORD_BITS, + BORDER_BITS = 2, + BORDER_SHIFT = ANISO_SHIFT + ANISO_BITS, + IS_COMPARE_BITS = 1, + IS_COMPARE_SHIFT = BORDER_BITS + BORDER_SHIFT, + FLOAT_EXP_BASE = 127, + FLOAT_EXP_SHIFT = 23, + }; + ADD_BITFIELD_MEMBER(mipMapMode, MIP_SHIFT, MIP_BITS) + ADD_BITFIELD_MEMBER(filterMode, FILTER_SHIFT, FILTER_BITS) + ADD_BITFIELD_MEMBER(borderColorMode, BORDER_SHIFT, BORDER_BITS) + ADD_BITFIELD_MEMBER(anisotropicValue, ANISO_SHIFT, ANISO_BITS) + ADD_BITFIELD_MEMBER(coordModes, COORD_SHIFT, COORD_BITS) + ADD_BITFIELD_MEMBER(biasBits, BIAS_OFFSET, BIAS_BITS) + ADD_BITFIELD_MEMBER(isCompare, IS_COMPARE_SHIFT, IS_COMPARE_BITS) + + BEGIN_BITFIELD_TYPE(iee754Float, uint32_t) + float asFloat; + uint32_t asUint; + int32_t asInt; + ADD_BITFIELD_MEMBER(mantissa, 0, 23) + ADD_BITFIELD_MEMBER(exponent, 23, 8) + ADD_BITFIELD_MEMBER(sign, 31, 1) + END_BITFIELD_TYPE() + + D3D12_SAMPLER_DESC asDesc() const + { + D3D12_SAMPLER_DESC result; + + result.MaxAnisotropy = getAnisoInt(); + if (result.MaxAnisotropy > 1) + result.Filter = D3D12_ENCODE_ANISOTROPIC_FILTER(static_cast(isCompare)); + else + result.Filter = D3D12_ENCODE_BASIC_FILTER(getFilter(), getFilter(), getMip(), static_cast(isCompare)); + + result.AddressU = getU(); + result.AddressV = getV(); + result.AddressW = getW(); + result.MipLODBias = getBias(); + result.ComparisonFunc = isCompare ? D3D12_COMPARISON_FUNC_LESS_EQUAL : D3D12_COMPARISON_FUNC_ALWAYS; + result.MinLOD = 0; + result.MaxLOD = FLT_MAX; + result.BorderColor[0] = static_cast(borderColorMode) & 1 ? 1.f : 0.f; + result.BorderColor[1] = static_cast(borderColorMode) & 1 ? 
1.f : 0.f; + result.BorderColor[2] = static_cast(borderColorMode) & 1 ? 1.f : 0.f; + result.BorderColor[3] = static_cast(borderColorMode) & 2 ? 1.f : 0.f; + + return result; + } + + void setMip(D3D12_FILTER_TYPE mip) { mipMapMode = (uint32_t)mip; } + D3D12_FILTER_TYPE getMip() const { return static_cast(static_cast(mipMapMode)); } + void setFilter(D3D12_FILTER_TYPE filter) { filterMode = filter; } + D3D12_FILTER_TYPE getFilter() const { return static_cast(static_cast(filterMode)); } + void setCoordModes(D3D12_TEXTURE_ADDRESS_MODE u, D3D12_TEXTURE_ADDRESS_MODE v, D3D12_TEXTURE_ADDRESS_MODE w) + { + auto rawU = static_cast(u) - D3D12_TEXTURE_ADDRESS_MODE_WRAP; + auto rawV = static_cast(v) - D3D12_TEXTURE_ADDRESS_MODE_WRAP; + auto rawW = static_cast(w) - D3D12_TEXTURE_ADDRESS_MODE_WRAP; + coordModes = rawW * COORD_VALUE_COUNT * COORD_VALUE_COUNT + rawV * COORD_VALUE_COUNT + rawU; + } + void setU(D3D12_TEXTURE_ADDRESS_MODE u) + { + auto oldRawU = coordModes % COORD_VALUE_COUNT; + auto newRawU = static_cast(u) - D3D12_TEXTURE_ADDRESS_MODE_WRAP; + coordModes -= oldRawU; + coordModes += newRawU; + } + void setV(D3D12_TEXTURE_ADDRESS_MODE v) + { + auto oldRawV = (coordModes / COORD_VALUE_COUNT) % COORD_VALUE_COUNT; + auto newRawV = static_cast(v) - D3D12_TEXTURE_ADDRESS_MODE_WRAP; + coordModes -= oldRawV * COORD_VALUE_COUNT; + coordModes += newRawV * COORD_VALUE_COUNT; + } + void setW(D3D12_TEXTURE_ADDRESS_MODE w) + { + auto oldRawW = (coordModes / COORD_VALUE_COUNT / COORD_VALUE_COUNT) % COORD_VALUE_COUNT; + auto newRawW = static_cast(w) - D3D12_TEXTURE_ADDRESS_MODE_WRAP; + coordModes -= oldRawW * COORD_VALUE_COUNT * COORD_VALUE_COUNT; + coordModes += newRawW * COORD_VALUE_COUNT * COORD_VALUE_COUNT; + } + D3D12_TEXTURE_ADDRESS_MODE getU() const + { + auto rawValue = coordModes % COORD_VALUE_COUNT; + return static_cast(D3D12_TEXTURE_ADDRESS_MODE_WRAP + rawValue); + } + D3D12_TEXTURE_ADDRESS_MODE getV() const + { + auto rawValue = (coordModes / COORD_VALUE_COUNT) % 
COORD_VALUE_COUNT; + return static_cast(D3D12_TEXTURE_ADDRESS_MODE_WRAP + rawValue); + } + D3D12_TEXTURE_ADDRESS_MODE getW() const + { + auto rawValue = (coordModes / COORD_VALUE_COUNT) / COORD_VALUE_COUNT; + return static_cast(D3D12_TEXTURE_ADDRESS_MODE_WRAP + rawValue); + } + void setBias(float b) { biasBits = half_float::convert_from_float(b); } + float getBias() const { return half_float::convert_to_float(biasBits); } + void setAniso(float a) + { + // some float magic, falls flat on its face if it is not ieee-754 + // extracts exponent and subtracts the base + // clamps the result into range from 0 to 4 which represents 1,2,4,8 and 16 as floats + // negative values are treated as positive + // values from range 0 - 1 are rounded up + // everything else is rounded down + iee754Float f; + f.asFloat = a; + int32_t value = f.exponent - FLOAT_EXP_BASE; + // clamp from 1 to 16 + value = clamp(value, 0, 4); + anisotropicValue = value; + } + float getAniso() const + { + iee754Float f; + f.exponent = FLOAT_EXP_BASE + anisotropicValue; + return f.asFloat; + } + uint32_t getAnisoInt() const { return 1u << static_cast(anisotropicValue); } + // Same restrictions as with vulkan, either color is white or black and its either fully + // transparent or opaque + void setBorder(E3DCOLOR color) { borderColorMode = ((color.r || color.g || color.b) ? 1 : 0) | (color.a ? 2 : 0); } + E3DCOLOR getBorder() const + { + E3DCOLOR result; + result.r = static_cast(borderColorMode) & 1 ? 0xFF : 0; + result.g = static_cast(borderColorMode) & 1 ? 0xFF : 0; + result.b = static_cast(borderColorMode) & 1 ? 0xFF : 0; + result.a = static_cast(borderColorMode) & 2 ? 
0xFF : 0; + return result; + } + + bool needsBorderColor() const + { + return (D3D12_TEXTURE_ADDRESS_MODE_BORDER == getU()) || (D3D12_TEXTURE_ADDRESS_MODE_BORDER == getV()) || + (D3D12_TEXTURE_ADDRESS_MODE_BORDER == getW()); + } + + bool normalizeSelf() + { + bool wasNormalized = false; + if (!needsBorderColor()) + { + setBorder(0); + wasNormalized = true; + } + return wasNormalized; + } + + SamplerState normalize() const + { + // normalization is when border color is not needed we default to color 0 + SamplerState copy = *this; + copy.normalizeSelf(); + return copy; + } + + static SamplerState fromSamplerInfo(const d3d::SamplerInfo &info); +END_BITFIELD_TYPE() + + +inline SamplerState SamplerState::fromSamplerInfo(const d3d::SamplerInfo &info) +{ + SamplerState state; + state.isCompare = info.filter_mode == d3d::FilterMode::Compare; + state.setFilter(translate_filter_type_to_dx12(static_cast(info.filter_mode))); + state.setMip(translate_mip_filter_type_to_dx12(static_cast(info.mip_map_mode))); + state.setU(translate_texture_address_mode_to_dx12(static_cast(info.address_mode_u))); + state.setV(translate_texture_address_mode_to_dx12(static_cast(info.address_mode_v))); + state.setW(translate_texture_address_mode_to_dx12(static_cast(info.address_mode_w))); + state.setBias(info.mip_map_bias); + state.setAniso(info.anisotropic_max); + state.setBorder(info.border_color); + return state; +} + +} // namespace drv3d_dx12 diff --git a/prog/engine/drv/drv3d_DX12/shader.cpp b/prog/engine/drv/drv3d_DX12/shader.cpp index 362f99721..10dd61832 100644 --- a/prog/engine/drv/drv3d_DX12/shader.cpp +++ b/prog/engine/drv/drv3d_DX12/shader.cpp @@ -497,7 +497,7 @@ void ShaderProgramDatabase::initNullPixelShader(DeviceContext &ctx) { dxil::ShaderHeader nullHeader = {}; nullHeader.shaderType = static_cast(dxil::ShaderStage::PIXEL); - auto nullShader = make_span(null_pixel_shader, array_size(null_pixel_shader)); + auto nullShader = make_span(null_pixel_shader, countof(null_pixel_shader)); 
auto nPSH = newRawPixelShader(ctx, nullHeader, nullShader); nullPixelShader = nPSH; } @@ -532,7 +532,7 @@ ShaderID ShaderProgramDatabase::newRawPixelShader(DeviceContext &ctx, const dxil return id; } -ProgramID ShaderProgramDatabase::newComputeProgram(DeviceContext &ctx, const void *data) +ProgramID ShaderProgramDatabase::newComputeProgram(DeviceContext &ctx, const void *data, CSPreloaded preloaded) { auto basicModule = decode_shader_layout((const uint8_t *)data); if (!basicModule) @@ -551,7 +551,7 @@ ProgramID ShaderProgramDatabase::newComputeProgram(DeviceContext &ctx, const voi ScopedLockWriteTemplate lock(dataGuard); program = shaderProgramGroups.addComputeShaderProgram(); } - ctx.addComputeProgram(program, eastl::move(module)); + ctx.addComputeProgram(program, eastl::move(module), preloaded); return program; } diff --git a/prog/engine/drv/drv3d_DX12/shader.h b/prog/engine/drv/drv3d_DX12/shader.h index 0bc6fd320..b6b30f003 100644 --- a/prog/engine/drv/drv3d_DX12/shader.h +++ b/prog/engine/drv/drv3d_DX12/shader.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -13,9 +14,17 @@ #include #include -static constexpr uint32_t MAX_VERTEX_ATTRIBUTES = 16; -static constexpr uint32_t MAX_VERTEX_INPUT_STREAMS = 4; -static constexpr uint32_t MAX_SEMANTIC_INDEX = VSDR_TEXC14 + 1; +#include "driver.h" +#include "shader_program_id.h" +#include "tagged_handles.h" +#include "dynamic_array.h" +#include "byte_units.h" +#include "bitfield.h" + + +inline constexpr uint32_t MAX_VERTEX_ATTRIBUTES = 16; +inline constexpr uint32_t MAX_VERTEX_INPUT_STREAMS = 4; +inline constexpr uint32_t MAX_SEMANTIC_INDEX = VSDR_TEXC14 + 1; namespace drv3d_dx12 { @@ -1297,7 +1306,7 @@ class ShaderProgramDatabase ShaderID newRawPixelShader(DeviceContext &ctx, const dxil::ShaderHeader &header, dag::ConstSpan byte_code); public: - ProgramID newComputeProgram(DeviceContext &ctx, const void *data); + ProgramID newComputeProgram(DeviceContext &ctx, const void *data, 
CSPreloaded preloaded); ProgramID newGraphicsProgram(DeviceContext &ctx, InputLayoutID vdecl, ShaderID vs, ShaderID ps); InputLayoutID getInputLayoutForGraphicsProgram(ProgramID program); GraphicsProgramUsageInfo getGraphicsProgramForStateUpdate(ProgramID program); @@ -1396,7 +1405,7 @@ class ScriptedShadersBinDumpManager template void enumerateShaderFromHash(const dxil::HashValue &hash, T reciever) const { - for (uint32_t gi = 0; gi < array_size(dumps); ++gi) + for (uint32_t gi = 0; gi < countof(dumps); ++gi) { auto &group = dumps[gi]; if (!group.owner) @@ -1758,7 +1767,7 @@ class ShaderModuleManager : public ScriptedShadersBinDumpManager return ShaderID::make(0, si); } } - for (uint32_t gi = 0; gi < array_size(shaderGroup); ++gi) + for (uint32_t gi = 0; gi < countof(shaderGroup); ++gi) { auto &group = shaderGroup[gi].vertex; for (uint32_t si = 0; si < group.size(); ++si) @@ -1786,7 +1795,7 @@ class ShaderModuleManager : public ScriptedShadersBinDumpManager return ShaderID::make(0, si); } } - for (uint32_t gi = 0; gi < array_size(shaderGroup); ++gi) + for (uint32_t gi = 0; gi < countof(shaderGroup); ++gi) { auto &group = shaderGroup[gi].pixel; for (uint32_t si = 0; si < group.size(); ++si) diff --git a/prog/engine/drv/drv3d_DX12/shader_byte_code_id.h b/prog/engine/drv/drv3d_DX12/shader_byte_code_id.h new file mode 100644 index 000000000..793ea2993 --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/shader_byte_code_id.h @@ -0,0 +1,52 @@ +#pragma once + + +namespace drv3d_dx12 +{ + + +class ShaderByteCodeId +{ + union + { + uint32_t value; + struct + { + uint32_t group : 3; + uint32_t index : 29; + }; + }; + +public: + // ShaderByteCodeId() = default; + // explicit ShaderByteCodeId(uint32_t value) : value{value} {} + explicit operator bool() const { return -1 != value; } + bool operator!() const { return -1 == value; } + + uint32_t getGroup() const { return group; } + uint32_t getIndex() const { return index; } + + int32_t exportValue() const { return value; } + + 
static ShaderByteCodeId Null() + { + ShaderByteCodeId result; + result.value = -1; + return result; + } + + static ShaderByteCodeId make(uint32_t group, uint32_t index) + { + ShaderByteCodeId result; + result.group = group; + result.index = index; + return result; + } + + friend bool operator<(const ShaderByteCodeId l, const ShaderByteCodeId r) { return l.value < r.value; } + friend bool operator>(const ShaderByteCodeId l, const ShaderByteCodeId r) { return l.value > r.value; } + friend bool operator!=(const ShaderByteCodeId l, const ShaderByteCodeId r) { return l.value != r.value; } + friend bool operator==(const ShaderByteCodeId l, const ShaderByteCodeId r) { return l.value == r.value; } +}; + +} // namespace drv3d_dx12 diff --git a/prog/engine/drv/drv3d_DX12/shader_program_id.h b/prog/engine/drv/drv3d_DX12/shader_program_id.h new file mode 100644 index 000000000..4a42eb7aa --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/shader_program_id.h @@ -0,0 +1,196 @@ +#pragma once + +namespace drv3d_dx12 +{ + +class ShaderID +{ + union + { + int32_t value; + struct + { + uint32_t group : 3; + uint32_t index : 29; + }; + }; + +public: + // ShaderID() = default; + // explicit ShaderID(uint32_t value) : value{value} {} + explicit operator bool() const { return -1 != value; } + bool operator!() const { return -1 == value; } + + uint32_t getGroup() const { return group; } + uint32_t getIndex() const { return index; } + + int32_t exportValue() const { return value; } + + static ShaderID Null() + { + ShaderID result; + result.value = -1; + return result; + } + + static ShaderID importValue(int32_t value) + { + ShaderID result; + result.value = value; + return result; + } + + static ShaderID make(uint32_t group, uint32_t index) + { + ShaderID result; + result.group = group; + result.index = index; + return result; + } + + friend bool operator<(const ShaderID l, const ShaderID r) { return l.value < r.value; } + + friend bool operator>(const ShaderID l, const ShaderID r) { return 
l.value > r.value; } + + friend bool operator!=(const ShaderID l, const ShaderID r) { return l.value != r.value; } + + friend bool operator==(const ShaderID l, const ShaderID r) { return l.value == r.value; } +}; + +// comes from ShaderID::group member having 3 bits -> 0-7 +inline constexpr uint32_t max_scripted_shaders_bin_groups = 8; + +class ProgramID +{ + union + { + int32_t value; + struct + { + uint32_t type : 2; + uint32_t group : 3; + uint32_t index : 27; + }; + }; + +public: + bool operator!() const { return -1 == value; } + explicit operator bool() const { return -1 != value; } // true for any id other than Null(); must be the inverse of operator! + + uint32_t getType() const { return type; } + uint32_t getIndex() const { return index; } + uint32_t getGroup() const { return group; } + + int32_t exportValue() const { return value; } + + static constexpr uint32_t type_graphics = 0; + static constexpr uint32_t type_compute = 1; + static constexpr uint32_t type_raytrace = 2; + + bool isGraphics() const { return type_graphics == type; } + bool isCompute() const { return type_compute == type; } + bool isRaytrace() const { return type_raytrace == type; } + + static ProgramID Null() + { + ProgramID result; + result.value = -1; + return result; + } + + static ProgramID importValue(int32_t value) + { + ProgramID result; + result.value = value; + return result; + } + + static ProgramID asGraphicsProgram(uint32_t group, uint32_t index) + { + ProgramID result; + result.type = type_graphics; + result.group = group; + result.index = index; + return result; + } + + static ProgramID asComputeProgram(uint32_t group, uint32_t index) + { + ProgramID result; + result.type = type_compute; + result.group = group; + result.index = index; + return result; + } + + static ProgramID asRaytraceProgram(uint32_t group, uint32_t index) + { + ProgramID result; + result.type = type_raytrace; + result.group = group; + result.index = index; + return result; + } + + friend bool operator<(const ProgramID l, const ProgramID r) { return l.value < r.value; } + + 
friend bool operator>(const ProgramID l, const ProgramID r) { return l.value > r.value; } + + friend bool operator!=(const ProgramID l, const ProgramID r) { return l.value != r.value; } + + friend bool operator==(const ProgramID l, const ProgramID r) { return l.value == r.value; } +}; + +class GraphicsProgramID +{ + union + { + int32_t value; + struct + { + uint32_t group : 3; + uint32_t index : 29; + }; + }; + +public: + bool operator!() const { return -1 == value; } + explicit operator bool() const { return -1 != value; } // true for any id other than Null(); must be the inverse of operator! + + uint32_t getIndex() const { return index; } + uint32_t getGroup() const { return group; } + + int32_t exportValue() const { return value; } + + static GraphicsProgramID Null() + { + GraphicsProgramID result; + result.value = -1; + return result; + } + + static GraphicsProgramID importValue(int32_t value) + { + GraphicsProgramID result; + result.value = value; + return result; + } + + static GraphicsProgramID make(uint32_t group, uint32_t index) + { + GraphicsProgramID result; + result.group = group; + result.index = index; + return result; + } + + friend bool operator<(const GraphicsProgramID l, const GraphicsProgramID r) { return l.value < r.value; } + + friend bool operator>(const GraphicsProgramID l, const GraphicsProgramID r) { return l.value > r.value; } + + friend bool operator!=(const GraphicsProgramID l, const GraphicsProgramID r) { return l.value != r.value; } + + friend bool operator==(const GraphicsProgramID l, const GraphicsProgramID r) { return l.value == r.value; } +}; + +} // namespace drv3d_dx12 diff --git a/prog/engine/drv/drv3d_DX12/stateful_command_buffer.h b/prog/engine/drv/drv3d_DX12/stateful_command_buffer.h index ff2fc548c..17bf001fc 100644 --- a/prog/engine/drv/drv3d_DX12/stateful_command_buffer.h +++ b/prog/engine/drv/drv3d_DX12/stateful_command_buffer.h @@ -1,6 +1,16 @@ #pragma once + +#include + +#include "driver.h" +#include "validation.h" +#include "constants.h" +#include "d3d12_utils.h" #include 
"versioned_com_ptr.h" -#include +#include "render_state.h" +#include "pipeline_cache.h" +#include "command_list.h" + namespace drv3d_dx12 { @@ -297,6 +307,11 @@ class StatefulCommandBuffer { cmd.clearRenderTargetView(view, color, rect_count, rects); } + void resolveSubresource(ID3D12Resource *dst_resource, UINT dst_subresource, ID3D12Resource *src_resource, UINT src_subresource, + DXGI_FORMAT format) + { + cmd.resolveSubresource(dst_resource, dst_subresource, src_resource, src_subresource, format); + } void clearDepthStencilView(D3D12_CPU_DESCRIPTOR_HANDLE view, D3D12_CLEAR_FLAGS flags, FLOAT d, UINT8 s, UINT rect_count, const D3D12_RECT *rects) { diff --git a/prog/engine/drv/drv3d_DX12/swapchain.cpp b/prog/engine/drv/drv3d_DX12/swapchain.cpp index 4e694276d..15258c9ee 100644 --- a/prog/engine/drv/drv3d_DX12/swapchain.cpp +++ b/prog/engine/drv/drv3d_DX12/swapchain.cpp @@ -101,7 +101,6 @@ BaseTex *frontend::Swapchain::getDepthStencilTexture(Device &device, Extent2D ex ii.arrays = ArrayLayerCount::make(1); ii.mips = MipMapCount::make(1); ii.format = getDepthStencilFormat(); - ii.memoryLayout = D3D12_TEXTURE_LAYOUT_UNKNOWN; ii.memoryClass = DeviceMemoryClass::DEVICE_RESIDENT_IMAGE; ii.allocateSubresourceIDs = true; swapchainDepthStencilTex->tex.image = device.createImageNoContextLock(ii, "swapchain depth stencil target"); @@ -132,7 +131,7 @@ void backend::Swapchain::registerSwapchainView(D3DDevice *device, Image *image, if (info.state.isSRV()) { - auto desc = info.state.asSRVDesc(D3D12_RESOURCE_DIMENSION_TEXTURE2D); + auto desc = info.state.asSRVDesc(D3D12_RESOURCE_DIMENSION_TEXTURE2D, image->isMultisampled()); for (auto &buffer : colorTargets) { auto descriptor = swapchainBufferSRVHeap.allocate(device); @@ -154,7 +153,7 @@ void backend::Swapchain::registerSwapchainView(D3DDevice *device, Image *image, } else if (info.state.isRTV()) { - auto desc = info.state.asRTVDesc(D3D12_RESOURCE_DIMENSION_TEXTURE2D); + auto desc = 
info.state.asRTVDesc(D3D12_RESOURCE_DIMENSION_TEXTURE2D, image->isMultisampled()); for (auto &buffer : colorTargets) { auto descriptor = swapchainBufferRTVHeap.allocate(device); @@ -177,7 +176,7 @@ void backend::Swapchain::registerSwapchainView(D3DDevice *device, Image *image, { secondarySwapchainViewSet.push_back(info); - auto desc = info.state.asRTVDesc(D3D12_RESOURCE_DIMENSION_TEXTURE2D); + auto desc = info.state.asRTVDesc(D3D12_RESOURCE_DIMENSION_TEXTURE2D, image->isMultisampled()); for (auto &buffer : secondaryColorTargets) { @@ -623,7 +622,7 @@ void backend::Swapchain::bufferResize(Device &device, const Extent2D &extent, Fo if (!colorTarget) { colorTarget.reset(new Image({}, ComPtr{}, D3D12_RESOURCE_DIMENSION_TEXTURE2D, D3D12_TEXTURE_LAYOUT_UNKNOWN, - color_format, ext, MipMapCount::make(1), ArrayLayerCount::make(1), idBase)); + color_format, ext, MipMapCount::make(1), ArrayLayerCount::make(1), idBase, false)); colorTarget->setGPUChangeable(true); } else diff --git a/prog/engine/drv/drv3d_DX12/swapchain.h b/prog/engine/drv/drv3d_DX12/swapchain.h index 0b2095cf1..fdbb71589 100644 --- a/prog/engine/drv/drv3d_DX12/swapchain.h +++ b/prog/engine/drv/drv3d_DX12/swapchain.h @@ -1,12 +1,23 @@ #pragma once +#include +#include #include +#include +#include + +#include "extents.h" +#include "image_view_state.h" +#include "descriptor_heap.h" +#include "winapi_helpers.h" + namespace drv3d_dx12 { struct BaseTex; class Image; class Device; +class DeviceContext; enum class PresentationMode { diff --git a/prog/engine/drv/drv3d_DX12/tagged_handle.h b/prog/engine/drv/drv3d_DX12/tagged_handle.h new file mode 100644 index 000000000..19d4fefa9 --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/tagged_handle.h @@ -0,0 +1,33 @@ +#pragma once + + +template +class TaggedHandle +{ + H h; // -V730_NOINIT + +public: + bool interlockedIsNull() const { return interlocked_acquire_load(h) == NullValue; } + void interlockedSet(H v) { interlocked_release_store(h, v); } + H get() const { return h; } 
+ explicit TaggedHandle(H a) : h(a) {} + TaggedHandle() {} + bool operator!() const { return h == NullValue; } + friend bool operator==(TaggedHandle l, TaggedHandle r) { return l.get() == r.get(); } + friend bool operator!=(TaggedHandle l, TaggedHandle r) { return l.get() != r.get(); } + friend bool operator<(TaggedHandle l, TaggedHandle r) { return l.get() < r.get(); } + friend bool operator>(TaggedHandle l, TaggedHandle r) { return l.get() > r.get(); } + friend H operator-(TaggedHandle l, TaggedHandle r) { return l.get() - r.get(); } + static TaggedHandle Null() { return TaggedHandle(NullValue); } + static TaggedHandle make(H value) { return TaggedHandle{value}; } +}; + +template +inline TaggedHandle genereate_next_id(TaggedHandle last_id) +{ + auto value = last_id.get() + 1; + if (value == NullValue) + ++value; + G_ASSERT(value != NullValue); + return TaggedHandle{value}; +} diff --git a/prog/engine/drv/drv3d_DX12/tagged_handles.h b/prog/engine/drv/drv3d_DX12/tagged_handles.h new file mode 100644 index 000000000..e09b3aa83 --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/tagged_handles.h @@ -0,0 +1,20 @@ +#pragma once + +#include "tagged_handle.h" + +namespace drv3d_dx12 +{ + +struct InputLayoutIDTag; +struct StaticRenderStateIDTag; +struct InternalInputLayoutIDTag; +struct FramebufferLayoutIDTag; +struct BindlessSetIdTag; + +using InputLayoutID = TaggedHandle; +using StaticRenderStateID = TaggedHandle; +using InternalInputLayoutID = TaggedHandle; +using FramebufferLayoutID = TaggedHandle; +using BindlessSetId = TaggedHandle; + +}; // namespace drv3d_dx12 diff --git a/prog/engine/drv/drv3d_DX12/tagged_type.h b/prog/engine/drv/drv3d_DX12/tagged_type.h new file mode 100644 index 000000000..03de1b458 --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/tagged_type.h @@ -0,0 +1,277 @@ +#pragma once + +#include +#include + + +template +class TaggedIndexType +{ + I value{}; + + constexpr TaggedIndexType(I v) : value{v} {} + +public: + using ValueType = I; + + constexpr 
TaggedIndexType() = default; + ~TaggedIndexType() = default; + + TaggedIndexType(const TaggedIndexType &) = default; + TaggedIndexType &operator=(const TaggedIndexType &) = default; + + static constexpr TaggedIndexType make(I v) { return {v}; } + + constexpr I index() const { return value; } + + friend bool operator==(const TaggedIndexType &l, const TaggedIndexType &r) { return l.value == r.value; } + + friend bool operator!=(const TaggedIndexType &l, const TaggedIndexType &r) { return l.value != r.value; } + + friend bool operator<(const TaggedIndexType &l, const TaggedIndexType &r) { return l.value < r.value; } + + friend bool operator>(const TaggedIndexType &l, const TaggedIndexType &r) { return l.value > r.value; } + + friend bool operator<=(const TaggedIndexType &l, const TaggedIndexType &r) { return l.value <= r.value; } + + friend bool operator>=(const TaggedIndexType &l, const TaggedIndexType &r) { return l.value >= r.value; } + + friend int operator-(const TaggedIndexType &l, const TaggedIndexType &r) { return l.value - r.value; } + + friend TaggedIndexType operator+(const TaggedIndexType &l, I r) { return {I(l.value + r)}; } + + friend TaggedIndexType operator-(const TaggedIndexType &l, I r) { return {I(l.value - r)}; } + + TaggedIndexType &operator+=(I r) + { + value += r; + return *this; + } + + TaggedIndexType &operator-=(I r) + { + value -= r; + return *this; + } + + template + TaggedIndexType &operator+=(C r) + { + *this = *this + r; + return *this; + } + + template + TaggedIndexType &operator-=(C r) + { + *this = *this - r; + return *this; + } + + TaggedIndexType &operator++() + { + ++value; + return *this; + } + + TaggedIndexType &operator--() + { + --value; + return *this; + } + + TaggedIndexType operator++(int) const + { + auto copy = *this; + return ++copy; + } + + TaggedIndexType operator--(int) const + { + auto copy = *this; + return --copy; + } + + operator DagorSafeArg() const { return {index()}; } +}; + +template +class TaggedRangeType : 
private ValueRange +{ + using RangeType = ValueRange; + +public: + using ValueType = typename ValueRange::ValueType; + using RangeType::begin; + using RangeType::end; + using RangeType::isInside; + using RangeType::isValidRange; + using RangeType::size; + using RangeType::ValueRange; + + constexpr IT front() const { return RangeType::front(); } + constexpr IT back() const { return RangeType::back(); } + + constexpr TaggedRangeType front(ValueType offset) const { return {IT(this->start + offset), this->stop}; } + constexpr TaggedRangeType back(ValueType offset) const { return {IT(this->stop - offset), this->stop}; } + + void resize(uint32_t count) { this->stop = this->start + count; } + + constexpr TaggedRangeType subRange(IT offset, uint32_t count) const { return make(this->start + offset, count); } + + constexpr TaggedRangeType subRange(uint32_t offset, uint32_t count) const { return make(this->start + offset, count); } + + static constexpr TaggedRangeType make(IT base, uint32_t count) { return {base, base + count}; } + + static constexpr TaggedRangeType make(uint32_t base, uint32_t count) { return {IT::make(base), IT::make(base + count)}; } + + static constexpr TaggedRangeType make_invalid() { return {IT::make(1), IT::make(0)}; } +}; + +template +class TaggedCountType +{ +public: + using ValueType = IT; + using IndexValueType = typename ValueType::ValueType; + using RangeType = TaggedRangeType; + +private: + IndexValueType value{}; + + constexpr TaggedCountType(IndexValueType v) : value{v} {} + +public: + struct Iterator + { + IndexValueType at{}; + + constexpr Iterator() = default; + ~Iterator() = default; + constexpr Iterator(const Iterator &) = default; + Iterator &operator=(const Iterator &) = default; + constexpr Iterator(IndexValueType v) : at(v) {} + constexpr ValueType operator*() const { return ValueType::make(at); } + Iterator &operator++() + { + ++at; + return *this; + } + Iterator operator++(int) { return at++; } + Iterator &operator--() + { + --at; + 
return *this; + } + Iterator operator--(int) { return at--; } + friend constexpr bool operator==(Iterator l, Iterator r) { return l.at == r.at; } + friend constexpr bool operator!=(Iterator l, Iterator r) { return l.at != r.at; } + }; + constexpr Iterator begin() const { return {0}; } + constexpr Iterator end() const { return {value}; } + + constexpr TaggedCountType() = default; + ~TaggedCountType() = default; + + TaggedCountType(const TaggedCountType &) = default; + TaggedCountType &operator=(const TaggedCountType &) = default; + + static constexpr TaggedCountType make(IndexValueType v) { return {v}; } + + constexpr IndexValueType count() const { return value; } + + constexpr RangeType asRange() const { return RangeType::make(0, value); } + // Allow implicit conversion to range as count is a specialized range + constexpr operator RangeType() const { return asRange(); } + + constexpr RangeType front(ValueType offset) const { return RangeType::make(offset, value - offset.index()); } + constexpr RangeType back(ValueType offset) const { return RangeType::make(value - offset.index(), offset.index()); } + + operator DagorSafeArg() const { return {count()}; } + + friend bool operator==(const TaggedCountType &l, const TaggedCountType &r) { return l.value == r.value; } + + friend bool operator!=(const TaggedCountType &l, const TaggedCountType &r) { return l.value != r.value; } + + friend bool operator<=(const TaggedCountType &l, const TaggedCountType &r) { return l.value <= r.value; } + + friend bool operator>=(const TaggedCountType &l, const TaggedCountType &r) { return l.value >= r.value; } + + friend bool operator<(const TaggedCountType &l, const TaggedCountType &r) { return l.value < r.value; } + + friend bool operator>(const TaggedCountType &l, const TaggedCountType &r) { return l.value > r.value; } + + friend bool operator==(const RangeType &l, const TaggedCountType &r) { return 0 == l.front().index() && l.size() == r.value; } + + friend bool operator!=(const 
RangeType &l, const TaggedCountType &r) { return 0 != l.front().index() || l.size() != r.value; } // exact negation of the range == count operator + + friend bool operator==(const TaggedCountType &l, const RangeType &r) { return 0 == r.front().index() && r.size() == l.value; } + + friend bool operator!=(const TaggedCountType &l, const RangeType &r) { return 0 != r.front().index() || r.size() != l.value; } // exact negation of the count == range operator +}; + +template +inline constexpr bool operator==(const TaggedCountType &l, const typename TaggedCountType::ValueType &r) +{ + return l.count() == r.index(); +} + +template +inline constexpr bool operator!=(const TaggedCountType &l, const typename TaggedCountType::ValueType &r) +{ + return l.count() != r.index(); +} + +template +inline constexpr bool operator<=(const TaggedCountType &l, const typename TaggedCountType::ValueType &r) +{ + return l.count() <= r.index(); +} + +template +inline constexpr bool operator>=(const TaggedCountType &l, const typename TaggedCountType::ValueType &r) +{ + return l.count() >= r.index(); +} + +template +inline constexpr bool operator<(const TaggedCountType &l, const typename TaggedCountType::ValueType &r) +{ + return l.count() < r.index(); +} + +template +inline constexpr bool operator>(const TaggedCountType &l, const typename TaggedCountType::ValueType &r) +{ + return l.count() > r.index(); +} + +template +inline constexpr bool operator==(const typename TaggedCountType::ValueType &l, const TaggedCountType &r) +{ + return l.index() == r.count(); +} + +template +inline constexpr bool operator!=(const typename TaggedCountType::ValueType &l, const TaggedCountType &r) +{ + return l.index() != r.count(); +} + +template +inline constexpr bool operator<=(const typename TaggedCountType::ValueType &l, const TaggedCountType &r) +{ + return l.index() <= r.count(); +} + +template +inline constexpr bool operator>=(const typename TaggedCountType::ValueType &l, const TaggedCountType &r) +{ + return l.index() >= r.count(); +} + +template +inline constexpr bool operator<(const typename 
TaggedCountType::ValueType &l, const TaggedCountType &r) +{ + return l.index() < r.count(); +} diff --git a/prog/engine/drv/drv3d_DX12/tagged_types.h b/prog/engine/drv/drv3d_DX12/tagged_types.h new file mode 100644 index 000000000..a205a363e --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/tagged_types.h @@ -0,0 +1,105 @@ +#pragma once + +#include "tagged_type.h" + + +// Various type-safe strong aliases for ints. +namespace drv3d_dx12 +{ + +struct MipMapIndexTag; +using MipMapIndex = TaggedIndexType; +using MipMapRange = TaggedRangeType; +using MipMapCount = TaggedCountType; + +struct SubresourceIndexTag; +using SubresourceIndex = TaggedIndexType; +using SubresourceRange = TaggedRangeType; +using SubresourceCount = TaggedCountType; + +struct ArrayLayerIndexTag; +using ArrayLayerIndex = TaggedIndexType; +using ArrayLayerRange = TaggedRangeType; +using ArrayLayerCount = TaggedCountType; + +struct FormatPlaneIndexTag; +using FormatPlaneIndex = TaggedIndexType; +using FormatPlaneRange = TaggedRangeType; +using FormatPlaneCount = TaggedCountType; + +class SubresourcePerFormatPlaneCount +{ +public: + using ValueType = SubresourceIndex::ValueType; + +private: + ValueType value{}; + + constexpr SubresourcePerFormatPlaneCount(ValueType v) : value{v} {} + +public: + constexpr SubresourcePerFormatPlaneCount() = default; + ~SubresourcePerFormatPlaneCount() = default; + + SubresourcePerFormatPlaneCount(const SubresourcePerFormatPlaneCount &) = default; + SubresourcePerFormatPlaneCount &operator=(const SubresourcePerFormatPlaneCount &) = default; + + static constexpr SubresourcePerFormatPlaneCount make(ValueType v) { return {v}; } + static constexpr SubresourcePerFormatPlaneCount make(MipMapCount mip, ArrayLayerCount layers) + { + return {ValueType(mip.count() * layers.count())}; + } + static constexpr SubresourcePerFormatPlaneCount make(ArrayLayerCount layers, MipMapCount mip) + { + return {ValueType(mip.count() * layers.count())}; + } + + constexpr ValueType count() const { 
return value; } + + operator DagorSafeArg() const { return {count()}; } +}; + +inline SubresourcePerFormatPlaneCount operator*(const MipMapCount &l, const ArrayLayerCount &r) +{ + return SubresourcePerFormatPlaneCount::make(l, r); +} + +inline SubresourcePerFormatPlaneCount operator*(const ArrayLayerCount &l, const MipMapCount &r) +{ + return SubresourcePerFormatPlaneCount::make(l, r); +} + +// Per plane subres count times the plane index yields the subres range of the plane index +inline SubresourceRange operator*(const SubresourcePerFormatPlaneCount &l, const FormatPlaneIndex &r) +{ + return SubresourceRange::make(l.count() * r.index(), l.count()); +} + +// To keep it associative index times per plane yields also the subres range of the plane index +inline SubresourceRange operator*(const FormatPlaneIndex &l, const SubresourcePerFormatPlaneCount &r) +{ + return SubresourceRange::make(r.count() * l.index(), r.count()); +} + +// Per plane subres count times the plane count yields the total subres count +inline SubresourceCount operator*(const SubresourcePerFormatPlaneCount &l, const FormatPlaneCount &r) +{ + return SubresourceCount::make(l.count() * r.count()); +} + +inline SubresourceCount operator*(const FormatPlaneCount &r, const SubresourcePerFormatPlaneCount &l) +{ + return SubresourceCount::make(l.count() * r.count()); +} + +inline SubresourceIndex calculate_subresource_index(MipMapIndex mip, ArrayLayerIndex array, MipMapCount mips_per_array) +{ + return SubresourceIndex::make(mip.index() + (array.index() * mips_per_array.count())); +} + +inline SubresourceIndex operator+(const SubresourceIndex &l, const SubresourcePerFormatPlaneCount &r) +{ + return SubresourceIndex::make(l.index() + r.count()); +} + +} // namespace drv3d_dx12 diff --git a/prog/engine/drv/drv3d_DX12/texture.cpp b/prog/engine/drv/drv3d_DX12/texture.cpp index e420f414c..590bfc3db 100644 --- a/prog/engine/drv/drv3d_DX12/texture.cpp +++ b/prog/engine/drv/drv3d_DX12/texture.cpp @@ -118,23 +118,20 
@@ bool create_tex2d(D3DTextures &tex, BaseTex *bt_in, uint32_t w, uint32_t h, uint auto &device = get_device(); uint32_t &flg = bt_in->cflg; - G_ASSERT(!((flg & TEXCF_MULTISAMPLED) && initial_data != nullptr)); + G_ASSERT(!((flg & TEXCF_SAMPLECOUNT_MASK) && initial_data != nullptr)); G_ASSERT(!((flg & TEXCF_LOADONCE) && (flg & (TEXCF_DYNAMIC | TEXCF_RTARGET)))); - if ((flg & TEXFMT_MASK) == TEXFMT_MSAA_MAX_SAMPLES) // we always support forced sample count - return false; if (flg & TEXCF_VARIABLE_RATE) { // Check rules for VRS textures G_ASSERTF_RETURN(TEXFMT_R8UI == (flg & TEXFMT_MASK), false, "Variable Rate Textures can only use R8 UINT format"); G_ASSERTF_RETURN(0 == (flg & TEXCF_RTARGET), false, "Variable Rate Textures can not be used as render target"); - G_ASSERTF_RETURN(0 == (flg & TEXCF_MULTISAMPLED), false, "Variable Rate Textures can not be multisampled"); + G_ASSERTF_RETURN(0 == (flg & TEXCF_SAMPLECOUNT_MASK), false, "Variable Rate Textures can not be multisampled"); G_ASSERTF_RETURN(1 == array_size, false, "Variable Rate Textures can not be a arrayed texture"); G_ASSERTF_RETURN(false == cube, false, "Variable Rate Texture can not be a cube map"); G_ASSERTF_RETURN(1 == levels, false, "Variable Rate Texture can only have one mip level"); } - ImageInfo desc; desc.type = D3D12_RESOURCE_DIMENSION_TEXTURE2D; desc.size.width = w; @@ -143,9 +140,9 @@ bool create_tex2d(D3DTextures &tex, BaseTex *bt_in, uint32_t w, uint32_t h, uint desc.mips = MipMapCount::make(levels); desc.arrays = ArrayLayerCount::make((cube ? 6 : 1) * array_size); desc.format = FormatStore::fromCreateFlags(flg); - desc.usage = D3D12_RESOURCE_FLAG_NONE; desc.memoryLayout = (flg & TEXCF_TILED_RESOURCE) ? 
D3D12_TEXTURE_LAYOUT_64KB_UNDEFINED_SWIZZLE : D3D12_TEXTURE_LAYOUT_UNKNOWN; desc.allocateSubresourceIDs = needs_subresource_tracking(flg); + desc.sampleDesc.Count = get_sample_count(flg); flg = BaseTex::update_flags_for_linear_layout(flg, desc.format); desc.memoryClass = BaseTex::get_memory_class(flg); @@ -188,13 +185,6 @@ bool create_tex2d(D3DTextures &tex, BaseTex *bt_in, uint32_t w, uint32_t h, uint G_ASSERT(!(isDepth && (flg & TEXCF_READABLE))); - - if (flg & (TEXCF_MULTISAMPLED | TEXCF_MSAATARGET)) - { - logwarn("DX12: Requested multisampled texture, but DX12 backend has no support"); - flg &= ~(TEXCF_MULTISAMPLED | TEXCF_MSAATARGET); - } - if (!temp_alloc) TEXQL_PRE_CLEAN(bt_in->ressize()); @@ -443,7 +433,7 @@ bool create_tex3d(D3DTextures &tex, BaseTex *bt_in, uint32_t w, uint32_t h, uint auto &device = get_device(); auto &ctx = device.getContext(); - G_ASSERT((flg & TEXCF_MULTISAMPLED) == 0); + G_ASSERT((flg & TEXCF_SAMPLECOUNT_MASK) == 0); G_ASSERT(!((flg & TEXCF_LOADONCE) && (flg & TEXCF_DYNAMIC))); G_ASSERTF_RETURN(0 == (flg & TEXCF_VARIABLE_RATE), false, "Variable Rate Texture can not be a volumetric texture"); @@ -455,7 +445,6 @@ bool create_tex3d(D3DTextures &tex, BaseTex *bt_in, uint32_t w, uint32_t h, uint desc.mips = MipMapCount::make(levels); desc.arrays = ArrayLayerCount::make(1); desc.format = FormatStore::fromCreateFlags(flg); - desc.usage = D3D12_RESOURCE_FLAG_NONE; desc.memoryLayout = (flg & TEXCF_TILED_RESOURCE) ? 
D3D12_TEXTURE_LAYOUT_64KB_UNDEFINED_SWIZZLE : D3D12_TEXTURE_LAYOUT_UNKNOWN; desc.allocateSubresourceIDs = needs_subresource_tracking(flg); @@ -689,6 +678,267 @@ bool create_tex3d(D3DTextures &tex, BaseTex *bt_in, uint32_t w, uint32_t h, uint } } // namespace +ImageViewState BaseTex::getViewInfoUAV(MipMapIndex mip, ArrayLayerIndex layer, bool as_uint) const +{ + ImageViewState result; + if (resType == RES3D_TEX) + { + result.isCubemap = 0; + result.isArray = 0; + result.setSingleArrayRange(ArrayLayerIndex::make(0)); + G_ASSERTF(layer.index() == 0, "UAV view for layer/face %u requested, but texture was 2d and has no layers/faces", layer); + } + else if (resType == RES3D_CUBETEX) + { + result.isCubemap = 1; + result.isArray = 0; + result.setArrayRange(getArrayCount().front(layer)); + G_ASSERTF(layer < getArrayCount(), + "UAV view for layer/face %u requested, but texture was cubemap and has only 6 " + "faces", + layer); + } + else if (resType == RES3D_ARRTEX) + { + result.isArray = 1; + result.isCubemap = isArrayCube(); + result.setArrayRange(getArrayCount().front(layer)); + G_ASSERTF(layer < getArrayCount(), "UAV view for layer/face %u requested, but texture has only %u layers", layer, getArrayCount()); + } + else if (resType == RES3D_VOLTEX) + { + result.isArray = 0; + result.isCubemap = 0; + result.setDepthLayerRange(0, max(1, depth >> mip.index())); + G_ASSERTF(layer.index() == 0, "UAV view for layer/face %u requested, but texture was 3d and has no layers/faces", layer); + } + if (as_uint) + { + G_ASSERT(getFormat().getBytesPerPixelBlock() == 4 || + (getFormat().getBytesPerPixelBlock() == 8 && d3d::get_driver_desc().caps.hasShader64BitIntegerResources)); + if (getFormat().getBytesPerPixelBlock() == 4) + result.setFormat(FormatStore::fromCreateFlags(TEXFMT_R32UI)); + else if (getFormat().getBytesPerPixelBlock() == 8) + result.setFormat(FormatStore::fromCreateFlags(TEXFMT_R32G32UI)); + } + else + { + result.setFormat(getFormat().getLinearVariant()); + } + 
result.setSingleMipMapRange(mip); + result.setUAV(); + return result; +} + +ImageViewState BaseTex::getViewInfoRenderTarget(MipMapIndex mip, ArrayLayerIndex layer, bool as_const) const +{ + FormatStore format = allowSrgbWrite() ? getFormat() : getFormat().getLinearVariant(); + ImageViewState result; + result.isArray = resType == RES3D_ARRTEX; + result.isCubemap = resType == RES3D_CUBETEX; + result.setFormat(format); + result.setSingleMipMapRange(mip); + + if (layer.index() < d3d::RENDER_TO_WHOLE_ARRAY) + { + result.setSingleArrayRange(layer); + } + else + { + if (RES3D_VOLTEX == resType) + { + result.setDepthLayerRange(0, max(1, depth >> mip.index())); + } + else + { + result.setArrayRange(getArrayCount()); + } + } + + if (format.isColor()) + { + result.setRTV(); + } + else + { + result.setDSV(as_const); + } + + return result; +} + +ImageViewState BaseTex::getViewInfo() const +{ + ImageViewState result; + result.setFormat(allowSrgbRead() ? getFormat() : getFormat().getLinearVariant()); + result.isArray = resType == RES3D_ARRTEX ? 1 : 0; + result.isCubemap = resType == RES3D_CUBETEX ? 1 : (resType == RES3D_ARRTEX ? int(isArrayCube()) : 0); + int32_t baseMip = clamp(maxMipLevel, 0, max(0, (int32_t)tex.realMipLevels - 1)); + int32_t mipCount = (minMipLevel - maxMipLevel) + 1; + if (mipCount <= 0 || baseMip + mipCount > tex.realMipLevels) + { + mipCount = tex.realMipLevels - baseMip; + } + if (isStub()) + { + baseMip = 0; + mipCount = 1; + } + result.setMipBase(MipMapIndex::make(baseMip)); + result.setMipCount(max(mipCount, 1)); + result.setArrayRange(getArrayCount()); + result.setSRV(); + result.sampleStencil = sampleStencil(); + return result; +} + +FormatStore BaseTex::getFormat() const { return tex.image ? 
tex.image->getFormat() : fmt; } + +void BaseTex::updateDeviceSampler() +{ + sampler = get_device().getSampler(samplerState); + lastSamplerState = samplerState; +} + +D3D12_CPU_DESCRIPTOR_HANDLE BaseTex::getDeviceSampler() +{ + if (!sampler.ptr || samplerState != lastSamplerState) + { + updateDeviceSampler(); + } + + return sampler; +} + +Extent3D BaseTex::getMipmapExtent(uint32_t level) const +{ + Extent3D result; + result.width = max(width >> level, 1); + result.height = max(height >> level, 1); + result.depth = resType == RES3D_VOLTEX ? max(depth >> level, 1) : 1; + return result; +} + +void BaseTex::updateTexName() +{ + // don't propagate down to stub images + if (isStub()) + return; + if (tex.image) + { + get_device().setTexName(tex.image, getResName()); + } +} + +void BaseTex::notifySamplerChange() +{ + for (uint32_t s = 0; s < STAGE_MAX_EXT; ++s) + { + if (srvBindingStages[s].any()) + { + dirty_sampler(this, s, srvBindingStages[s]); + } + } +} + +void BaseTex::notifySRViewChange() +{ + for (uint32_t s = 0; s < STAGE_MAX_EXT; ++s) + { + if (srvBindingStages[s].any()) + { + dirty_srv(this, s, srvBindingStages[s]); + } + } +} + +void BaseTex::notifyTextureReplaceFinish() +{ + // if we are ending up here, we are still holding the lock of the state tracker + // to avoid a deadlock by reentering we have to do the dirtying without a lock + for (uint32_t s = 0; s < STAGE_MAX_EXT; ++s) + { + if (srvBindingStages[s].any()) + { + dirty_srv_and_sampler_no_lock(this, s, srvBindingStages[s]); + } + } +} + +void BaseTex::dirtyBoundSRVsNoLock() +{ + for (uint32_t s = 0; s < STAGE_MAX_EXT; ++s) + { + if (srvBindingStages[s].any()) + { + dirty_srv_no_lock(this, s, srvBindingStages[s]); + } + } +} + +void BaseTex::dirtyBoundUAVsNoLock() +{ + for (uint32_t s = 0; s < STAGE_MAX_EXT; ++s) + { + if (uavBindingStages[s].any()) + { + dirty_uav_no_lock(this, s, uavBindingStages[s]); + } + } +} + +void BaseTex::setUAVBinding(uint32_t stage, uint32_t index, bool s) +{ + 
uavBindingStages[stage].set(index, s); + stateBitSet.set(acitve_binding_used_offset); + if (s) + { + stateBitSet.reset(active_binding_was_copied_to_stage_offset); + } +} + +void BaseTex::setSRVBinding(uint32_t stage, uint32_t index, bool s) +{ + srvBindingStages[stage].set(index, s); + stateBitSet.set(acitve_binding_used_offset); +} + +void BaseTex::setRTVBinding(uint32_t index, bool s) +{ + G_ASSERT(index < Driver3dRenderTarget::MAX_SIMRT); + stateBitSet.set(active_binding_rtv_offset + index, s); + stateBitSet.set(acitve_binding_used_offset); + if (s) + { + stateBitSet.reset(active_binding_was_copied_to_stage_offset); + stateBitSet.set(active_binding_dirty_rt); + } +} + +void BaseTex::setDSVBinding(bool s) +{ + stateBitSet.set(active_binding_dsv_offset, s); + stateBitSet.set(acitve_binding_used_offset); + if (s) + { + stateBitSet.reset(active_binding_was_copied_to_stage_offset); + stateBitSet.set(active_binding_dirty_rt); + } +} + +eastl::bitset BaseTex::getRTVBinding() const +{ + eastl::bitset ret; + ret.from_uint64((stateBitSet >> active_binding_rtv_offset).to_uint64()); + return ret; +} + +void BaseTex::setUsedWithBindless() +{ + stateBitSet.set(active_binding_bindless_used_offset); + stateBitSet.set(acitve_binding_used_offset); +} + void BaseTex::setParams(int w, int h, int d, int levels, const char *stat_name) { G_ASSERT(levels > 0); @@ -702,6 +952,19 @@ void BaseTex::setParams(int w, int h, int d, int levels, const char *stat_name) setTexName(stat_name); } +ArrayLayerCount BaseTex::getArrayCount() const +{ + if (resType == RES3D_CUBETEX) + { + return ArrayLayerCount::make(6); + } + else if (resType == RES3D_ARRTEX) + { + return ArrayLayerCount::make((isArrayCube() ? 
6 : 1) * depth); + } + return ArrayLayerCount::make(1); +} + void BaseTex::setResApiName(const char *name) const { G_UNUSED(name); @@ -711,8 +974,28 @@ void BaseTex::setResApiName(const char *name) const #endif } +BaseTex::BaseTex(int res_type, uint32_t cflg_) : + resType(res_type), cflg(cflg_), minMipLevel(20), maxMipLevel(0), lockFlags(0), depth(1), width(0), height(0) +{ + samplerState.setBias(default_lodbias); + samplerState.setAniso(default_aniso); + samplerState.setW(D3D12_TEXTURE_ADDRESS_MODE_WRAP); + samplerState.setV(D3D12_TEXTURE_ADDRESS_MODE_WRAP); + samplerState.setU(D3D12_TEXTURE_ADDRESS_MODE_WRAP); + samplerState.setMip(D3D12_FILTER_TYPE_LINEAR); + samplerState.setFilter(D3D12_FILTER_TYPE_LINEAR); + + if (RES3D_CUBETEX == resType) + { + samplerState.setV(D3D12_TEXTURE_ADDRESS_MODE_CLAMP); + samplerState.setU(D3D12_TEXTURE_ADDRESS_MODE_CLAMP); + } +} + BaseTex::~BaseTex() { setRld(nullptr); } +void BaseTex::resolve(Image *dst) { get_device().getContext().resolveMultiSampleImage(tex.image, dst); } + BaseTexture *BaseTex::makeTmpTexResCopy(int w, int h, int d, int l, bool staging_tex) { STORE_RETURN_ADDRESS(); @@ -815,6 +1098,15 @@ bool BaseTex::allocateTex() return false; } +void BaseTex::discardTex() +{ + if (stubTexIdx >= 0) + { + releaseTex(); + recreate(); + } +} + bool BaseTex::recreate() { STORE_RETURN_ADDRESS(); @@ -1075,8 +1367,15 @@ int BaseTex::update(BaseTexture *src) #endif STORE_RETURN_ADDRESS(); ScopedCommitLock ctxLock{get_device().getContext()}; - // passing no regions is fast whole resource copy - get_device().getContext().copyImage(sTex->tex.image, tex.image, make_whole_resource_copy_info()); + if (sTex->isMultisampled()) + { + sTex->resolve(tex.image); + } + else + { + // passing no regions is fast whole resource copy + get_device().getContext().copyImage(sTex->tex.image, tex.image, make_whole_resource_copy_info()); + } return 1; } @@ -1198,6 +1497,12 @@ int BaseTex::generateMips() return 1; } +bool 
BaseTex::setReloadCallback(IReloadData *_rld) +{ + setRld(_rld); + return true; +} + static constexpr int TEX_COPIED = 1 << 30; void D3DTextures::release(uint64_t progress) @@ -1936,6 +2241,117 @@ int BaseTex::ressize() const getArrayCount().count()); } +int BaseTex::getinfo(TextureInfo &ti, int level) const +{ + level = clamp(level, 0, mipLevels - 1); + + ti.w = max(1u, width >> level); + ti.h = max(1u, height >> level); + switch (resType) + { + case RES3D_CUBETEX: + ti.d = 1; + ti.a = 6; + break; + case RES3D_CUBEARRTEX: + case RES3D_ARRTEX: + ti.d = 1; + ti.a = getArrayCount().count(); + break; + case RES3D_VOLTEX: + ti.d = max(1u, getDepthSlices() >> level); + ti.a = 1; + break; + default: + ti.d = 1; + ti.a = 1; + break; + } + + ti.mipLevels = mipLevels; + ti.resType = resType; + ti.cflg = cflg; + return 1; +} + +int BaseTex::texaddr(int a) +{ + samplerState.setW(translate_texture_address_mode_to_dx12(a)); + samplerState.setV(translate_texture_address_mode_to_dx12(a)); + samplerState.setU(translate_texture_address_mode_to_dx12(a)); + notifySamplerChange(); + return 1; +} + +int BaseTex::texaddru(int a) +{ + samplerState.setU(translate_texture_address_mode_to_dx12(a)); + notifySamplerChange(); + return 1; +} + +int BaseTex::texaddrv(int a) +{ + samplerState.setV(translate_texture_address_mode_to_dx12(a)); + notifySamplerChange(); + return 1; +} + +int BaseTex::texaddrw(int a) +{ + if (RES3D_VOLTEX == resType) + { + samplerState.setW(translate_texture_address_mode_to_dx12(a)); + notifySamplerChange(); + return 1; + } + return 0; +} + +int BaseTex::texbordercolor(E3DCOLOR c) +{ + samplerState.setBorder(c); + notifySamplerChange(); + return 1; +} + +int BaseTex::texfilter(int m) +{ + samplerState.isCompare = m == TEXFILTER_COMPARE; + samplerState.setFilter(translate_filter_type_to_dx12(m)); + notifySamplerChange(); + return 1; +} + +int BaseTex::texmipmap(int m) +{ + samplerState.setMip(translate_mip_filter_type_to_dx12(m)); + notifySamplerChange(); + return 1; +} 
+ +int BaseTex::texlod(float mipmaplod) +{ + samplerState.setBias(mipmaplod); + notifySamplerChange(); + return 1; +} + +int BaseTex::texmiplevel(int minlevel, int maxlevel) +{ + maxMipLevel = (minlevel >= 0) ? minlevel : 0; + minMipLevel = (maxlevel >= 0) ? maxlevel : (mipLevels - 1); + notifySRViewChange(); + return 1; +} + +int BaseTex::setAnisotropy(int level) +{ + samplerState.setAniso(clamp(level, 1, 16)); + notifySamplerChange(); + return 1; +} + static Texture *create_tex_internal(TexImage32 *img, int w, int h, int flg, int levels, const char *stat_name, Texture *baseTexture) { G_ASSERT_RETURN(d3d::check_texformat(flg), nullptr); diff --git a/prog/engine/drv/drv3d_DX12/texture.h b/prog/engine/drv/drv3d_DX12/texture.h index 8663d3d20..1fdf93513 100644 --- a/prog/engine/drv/drv3d_DX12/texture.h +++ b/prog/engine/drv/drv3d_DX12/texture.h @@ -1,7 +1,18 @@ #pragma once +#include #include <3d/tql.h> +#include <3d/dag_drv3d.h> #include +#include + +#include "device_memory_class.h" +#include "host_device_shared_memory_region.h" +#include "format_store.h" +#include "extents.h" +#include "image_view_state.h" +#include "sampler_state.h" + namespace ddsx { @@ -40,8 +51,8 @@ void dirty_srv_and_sampler_no_lock(BaseTex *texture, uint32_t stage, eastl::bits void dirty_uav_no_lock(BaseTex *texture, uint32_t stage, eastl::bitset slots); void dirty_rendertarget_no_lock(BaseTex *texture, eastl::bitset slots); -static constexpr uint32_t MAX_MIPMAPS = 16; -static constexpr uint32_t TEXTURE_TILE_SIZE = 64 * 1024; +inline constexpr uint32_t MAX_MIPMAPS = 16; +inline constexpr uint32_t TEXTURE_TILE_SIZE = 64 * 1024; struct BaseTex final : public BaseTexture { @@ -61,136 +72,22 @@ struct BaseTex final : public BaseTexture bool allowSrgbRead() const { return (cflg & TEXCF_SRGBREAD) != 0; } bool isRenderTarget() const { return 0 != (cflg & TEXCF_RTARGET); } bool isUAV() const { return 0 != (cflg & TEXCF_UNORDERED); } + bool isMultisampled() const { return 0 != (cflg & 
TEXCF_SAMPLECOUNT_MASK); } virtual void setResApiName(const char *name) const override; + void resolve(Image *dst); + FormatStore getFormat() const; Image *getDeviceImage() const { return tex.image; } - __forceinline ImageViewState getViewInfoUAV(MipMapIndex mip, ArrayLayerIndex layer, bool as_uint) const - { - ImageViewState result; - if (resType == RES3D_TEX) - { - result.isCubemap = 0; - result.isArray = 0; - result.setSingleArrayRange(ArrayLayerIndex::make(0)); - G_ASSERTF(layer.index() == 0, "UAV view for layer/face %u requested, but texture was 2d and has no layers/faces", layer); - } - else if (resType == RES3D_CUBETEX) - { - result.isCubemap = 1; - result.isArray = 0; - result.setArrayRange(getArrayCount().front(layer)); - G_ASSERTF(layer < getArrayCount(), - "UAV view for layer/face %u requested, but texture was cubemap and has only 6 " - "faces", - layer); - } - else if (resType == RES3D_ARRTEX) - { - result.isArray = 1; - result.isCubemap = isArrayCube(); - result.setArrayRange(getArrayCount().front(layer)); - G_ASSERTF(layer < getArrayCount(), "UAV view for layer/face %u requested, but texture has only %u layers", layer, - getArrayCount()); - } - else if (resType == RES3D_VOLTEX) - { - result.isArray = 0; - result.isCubemap = 0; - result.setDepthLayerRange(0, max(1, depth >> mip.index())); - G_ASSERTF(layer.index() == 0, "UAV view for layer/face %u requested, but texture was 3d and has no layers/faces", layer); - } - if (as_uint) - { - G_ASSERT(getFormat().getBytesPerPixelBlock() == 4 || - (getFormat().getBytesPerPixelBlock() == 8 && d3d::get_driver_desc().caps.hasShader64BitIntegerResources)); - if (getFormat().getBytesPerPixelBlock() == 4) - result.setFormat(FormatStore::fromCreateFlags(TEXFMT_R32UI)); - else if (getFormat().getBytesPerPixelBlock() == 8) - result.setFormat(FormatStore::fromCreateFlags(TEXFMT_R32G32UI)); - } - else - { - result.setFormat(getFormat().getLinearVariant()); - } - result.setSingleMipMapRange(mip); - result.setUAV(); - 
return result; - } - __forceinline ImageViewState getViewInfoRenderTarget(MipMapIndex mip, ArrayLayerIndex layer, bool as_const) const - { - FormatStore format = allowSrgbWrite() ? getFormat() : getFormat().getLinearVariant(); - ImageViewState result; - result.isArray = resType == RES3D_ARRTEX; - result.isCubemap = resType == RES3D_CUBETEX; - result.setFormat(format); - result.setSingleMipMapRange(mip); - - if (layer.index() < d3d::RENDER_TO_WHOLE_ARRAY) - { - result.setSingleArrayRange(layer); - } - else - { - if (RES3D_VOLTEX == resType) - { - result.setDepthLayerRange(0, max(1, depth >> mip.index())); - } - else - { - result.setArrayRange(getArrayCount()); - } - } - - if (format.isColor()) - { - result.setRTV(); - } - else - { - result.setDSV(as_const); - } - - return result; - } - __forceinline ImageViewState getViewInfo() const - { - ImageViewState result; - result.setFormat(allowSrgbRead() ? getFormat() : getFormat().getLinearVariant()); - result.isArray = resType == RES3D_ARRTEX ? 1 : 0; - result.isCubemap = resType == RES3D_CUBETEX ? 1 : (resType == RES3D_ARRTEX ? 
int(isArrayCube()) : 0); - int32_t baseMip = clamp(maxMipLevel, 0, max(0, (int32_t)tex.realMipLevels - 1)); - int32_t mipCount = (minMipLevel - maxMipLevel) + 1; - if (mipCount <= 0 || baseMip + mipCount > tex.realMipLevels) - { - mipCount = tex.realMipLevels - baseMip; - } - if (isStub()) - { - baseMip = 0; - mipCount = 1; - } - result.setMipBase(MipMapIndex::make(baseMip)); - result.setMipCount(max(mipCount, 1)); - result.setArrayRange(getArrayCount()); - result.setSRV(); - result.sampleStencil = sampleStencil(); - return result; - } + ImageViewState getViewInfoUAV(MipMapIndex mip, ArrayLayerIndex layer, bool as_uint) const; + ImageViewState getViewInfoRenderTarget(MipMapIndex mip, ArrayLayerIndex layer, bool as_const) const; + ImageViewState getViewInfo() const; void updateDeviceSampler(); D3D12_CPU_DESCRIPTOR_HANDLE getDeviceSampler(); - Extent3D getMipmapExtent(uint32_t level) const - { - Extent3D result; - result.width = max(width >> level, 1); - result.height = max(height >> level, 1); - result.depth = resType == RES3D_VOLTEX ? 
max(depth >> level, 1) : 1; - return result; - } - + Extent3D getMipmapExtent(uint32_t level) const; uint32_t getMemorySize() const { return tex.memSize; } struct ReloadDataHandler @@ -225,111 +122,19 @@ struct BaseTex final : public BaseTexture eastl::bitset uavBindingStages[STAGE_MAX_EXT]; eastl::bitset stateBitSet; - void notifySamplerChange() - { - for (uint32_t s = 0; s < STAGE_MAX_EXT; ++s) - { - if (srvBindingStages[s].any()) - { - dirty_sampler(this, s, srvBindingStages[s]); - } - } - } - - void notifySRViewChange() - { - for (uint32_t s = 0; s < STAGE_MAX_EXT; ++s) - { - if (srvBindingStages[s].any()) - { - dirty_srv(this, s, srvBindingStages[s]); - } - } - } - - void notifyTextureReplaceFinish() - { - // if we are ending up here, we are still holding the lock of the state tracker - // to avoid a deadlock by reentering we have to do the dirtying without a lock - for (uint32_t s = 0; s < STAGE_MAX_EXT; ++s) - { - if (srvBindingStages[s].any()) - { - dirty_srv_and_sampler_no_lock(this, s, srvBindingStages[s]); - } - } - } - - void dirtyBoundSRVsNoLock() - { - for (uint32_t s = 0; s < STAGE_MAX_EXT; ++s) - { - if (srvBindingStages[s].any()) - { - dirty_srv_no_lock(this, s, srvBindingStages[s]); - } - } - } - - void dirtyBoundUAVsNoLock() - { - for (uint32_t s = 0; s < STAGE_MAX_EXT; ++s) - { - if (uavBindingStages[s].any()) - { - dirty_uav_no_lock(this, s, uavBindingStages[s]); - } - } - } - + void notifySamplerChange(); + void notifySRViewChange(); + void notifyTextureReplaceFinish(); + void dirtyBoundSRVsNoLock(); + void dirtyBoundUAVsNoLock(); void dirtyBoundRTVsNoLock() { dirty_rendertarget_no_lock(this, getRTVBinding()); } - - void setUAVBinding(uint32_t stage, uint32_t index, bool s) - { - uavBindingStages[stage].set(index, s); - stateBitSet.set(acitve_binding_used_offset); - if (s) - { - stateBitSet.reset(active_binding_was_copied_to_stage_offset); - } - } - - void setSRVBinding(uint32_t stage, uint32_t index, bool s) - { - 
srvBindingStages[stage].set(index, s); - stateBitSet.set(acitve_binding_used_offset); - } - - void setRTVBinding(uint32_t index, bool s) - { - G_ASSERT(index < Driver3dRenderTarget::MAX_SIMRT); - stateBitSet.set(active_binding_rtv_offset + index, s); - stateBitSet.set(acitve_binding_used_offset); - if (s) - { - stateBitSet.reset(active_binding_was_copied_to_stage_offset); - stateBitSet.set(active_binding_dirty_rt); - } - } - - void setDSVBinding(bool s) - { - stateBitSet.set(active_binding_dsv_offset, s); - stateBitSet.set(acitve_binding_used_offset); - if (s) - { - stateBitSet.reset(active_binding_was_copied_to_stage_offset); - stateBitSet.set(active_binding_dirty_rt); - } - } + void setUAVBinding(uint32_t stage, uint32_t index, bool s); + void setSRVBinding(uint32_t stage, uint32_t index, bool s); + void setRTVBinding(uint32_t index, bool s); + void setDSVBinding(bool s); eastl::bitset getUAVBinding(uint32_t stage) const { return uavBindingStages[stage]; } eastl::bitset getSRVBinding(uint32_t stage) const { return srvBindingStages[stage]; } - eastl::bitset getRTVBinding() const - { - eastl::bitset ret; - ret.from_uint64((stateBitSet >> active_binding_rtv_offset).to_uint64()); - return ret; - } + eastl::bitset getRTVBinding() const; bool getDSVBinding() const { return stateBitSet.test(active_binding_dsv_offset); } bool wasUsed() const { return stateBitSet.test(acitve_binding_used_offset); } void setWasUsed() { stateBitSet.set(acitve_binding_used_offset); } @@ -337,27 +142,15 @@ struct BaseTex final : public BaseTexture void setDelayedCreate(bool s) { stateBitSet.set(active_binding_delayed_create_offset, s); } bool preallocBeforeLoad() const { return stateBitSet.test(active_binding_prealloc_before_load_offset); } void setPreallocBeforeLoad(bool s) { stateBitSet.set(active_binding_prealloc_before_load_offset, s); } - bool sampleStencil() const { return stateBitSet.test(active_binding_sample_stencil); } - void setSampleStencil(bool s) { 
stateBitSet.set(active_binding_sample_stencil, s); } - bool isArrayCube() const { return stateBitSet.test(active_binding_is_array_cube_offset); } - void setIsArrayCube(bool s) { stateBitSet.set(active_binding_is_array_cube_offset, s); } - bool wasCopiedToStage() const { return stateBitSet.test(active_binding_was_copied_to_stage_offset); } void setWasCopiedToStage(bool s) { stateBitSet.set(active_binding_was_copied_to_stage_offset, s); } - bool dirtyRt() const { return stateBitSet.test(active_binding_dirty_rt); } void setDirtyRty(bool s) { stateBitSet.set(active_binding_dirty_rt, s); } - - void setUsedWithBindless() - { - stateBitSet.set(active_binding_bindless_used_offset); - stateBitSet.set(acitve_binding_used_offset); - } - + void setUsedWithBindless(); bool wasUsedWithBindless() const { return stateBitSet.test(active_binding_bindless_used_offset); } uint16_t width = 0; @@ -384,18 +177,7 @@ struct BaseTex final : public BaseTexture void setParams(int w, int h, int d, int levels, const char *stat_name); - ArrayLayerCount getArrayCount() const - { - if (resType == RES3D_CUBETEX) - { - return ArrayLayerCount::make(6); - } - else if (resType == RES3D_ARRTEX) - { - return ArrayLayerCount::make((isArrayCube() ? 
6 : 1) * depth); - } - return ArrayLayerCount::make(1); - } + ArrayLayerCount getArrayCount() const; uint32_t getDepthSlices() const { @@ -404,153 +186,31 @@ struct BaseTex final : public BaseTexture return 1; } - BaseTex(int res_type, uint32_t cflg_) : - resType(res_type), cflg(cflg_), minMipLevel(20), maxMipLevel(0), lockFlags(0), depth(1), width(0), height(0) - { - samplerState.setBias(default_lodbias); - samplerState.setAniso(default_aniso); - samplerState.setW(D3D12_TEXTURE_ADDRESS_MODE_WRAP); - samplerState.setV(D3D12_TEXTURE_ADDRESS_MODE_WRAP); - samplerState.setU(D3D12_TEXTURE_ADDRESS_MODE_WRAP); - samplerState.setMip(D3D12_FILTER_TYPE_LINEAR); - samplerState.setFilter(D3D12_FILTER_TYPE_LINEAR); - - if (RES3D_CUBETEX == resType) - { - samplerState.setV(D3D12_TEXTURE_ADDRESS_MODE_CLAMP); - samplerState.setU(D3D12_TEXTURE_ADDRESS_MODE_CLAMP); - } - } - + BaseTex(int res_type, uint32_t cflg_); ~BaseTex() override; /// ->> void setReadStencil(bool on) override { setSampleStencil(on && getFormat().isStencil()); } int restype() const override { return resType; } int ressize() const override; - - int getinfo(TextureInfo &ti, int level = 0) const override - { - level = clamp(level, 0, mipLevels - 1); - - ti.w = max(1u, width >> level); - ti.h = max(1u, height >> level); - switch (resType) - { - case RES3D_CUBETEX: - ti.d = 1; - ti.a = 6; - break; - case RES3D_CUBEARRTEX: - case RES3D_ARRTEX: - ti.d = 1; - ti.a = getArrayCount().count(); - break; - case RES3D_VOLTEX: - ti.d = max(1u, getDepthSlices() >> level); - ti.a = 1; - break; - default: - ti.d = 1; - ti.a = 1; - break; - } - - ti.mipLevels = mipLevels; - ti.resType = resType; - ti.cflg = cflg; - return 1; - } - + int getinfo(TextureInfo &ti, int level = 0) const override; bool addDirtyRect(const RectInt &) override { return true; } - int level_count() const override { return mipLevels; } - int texaddr(int a) override - { - samplerState.setW(translate_texture_address_mode_to_dx12(a)); - 
samplerState.setV(translate_texture_address_mode_to_dx12(a)); - samplerState.setU(translate_texture_address_mode_to_dx12(a)); - notifySamplerChange(); - return 1; - } - - int texaddru(int a) override - { - samplerState.setU(translate_texture_address_mode_to_dx12(a)); - notifySamplerChange(); - return 1; - } - - int texaddrv(int a) override - { - samplerState.setV(translate_texture_address_mode_to_dx12(a)); - notifySamplerChange(); - return 1; - } - - int texaddrw(int a) override - { - if (RES3D_VOLTEX == resType) - { - samplerState.setW(translate_texture_address_mode_to_dx12(a)); - notifySamplerChange(); - return 1; - } - return 0; - } - - int texbordercolor(E3DCOLOR c) override - { - samplerState.setBorder(c); - notifySamplerChange(); - return 1; - } - - int texfilter(int m) override - { - samplerState.isCompare = m == TEXFILTER_COMPARE; - samplerState.setFilter(translate_filter_type_to_dx12(m)); - notifySamplerChange(); - return 1; - } - - int texmipmap(int m) override - { - samplerState.setMip(translate_mip_filter_type_to_dx12(m)); - notifySamplerChange(); - return 1; - } - - int texlod(float mipmaplod) override - { - samplerState.setBias(mipmaplod); - notifySamplerChange(); - return 1; - } - - int texmiplevel(int minlevel, int maxlevel) override - { - maxMipLevel = (minlevel >= 0) ? minlevel : 0; - minMipLevel = (maxlevel >= 0) ? 
maxlevel : (mipLevels - 1); - notifySRViewChange(); - return 1; - } - - int setAnisotropy(int level) override - { - samplerState.setAniso(clamp(level, 1, 16)); - notifySamplerChange(); - return 1; - } + int texaddr(int a) override; + int texaddru(int a) override; + int texaddrv(int a) override; + int texaddrw(int a) override; + int texbordercolor(E3DCOLOR c) override; + int texfilter(int m) override; + int texmipmap(int m) override; + int texlod(float mipmaplod) override; + int texmiplevel(int minlevel, int maxlevel) override; + int setAnisotropy(int level) override; int generateMips() override; - bool setReloadCallback(IReloadData *_rld) override - { - setRld(_rld); - return true; - } + bool setReloadCallback(IReloadData *_rld) override; void resetTex(); void releaseTex(); @@ -581,14 +241,8 @@ struct BaseTex final : public BaseTexture } bool allocateTex() override; - void discardTex() override - { - if (stubTexIdx >= 0) - { - releaseTex(); - recreate(); - } - } + void discardTex() override; + bool isTexResEqual(BaseTexture *bt) const { return bt && ((BaseTex *)bt)->tex.image == tex.image; } bool isCubeArray() const { return isArrayCube(); } @@ -602,7 +256,7 @@ struct BaseTex final : public BaseTexture static DeviceMemoryClass get_memory_class(uint32_t cflags); }; -static inline BaseTex *getbasetex(/*const*/ BaseTexture *t) { return t ? (BaseTex *)t : nullptr; } +static inline BaseTex *getbasetex(BaseTexture *t) { return t ? (BaseTex *)t : nullptr; } static inline const BaseTex *getbasetex(const BaseTexture *t) { return t ? 
(const BaseTex *)t : nullptr; } diff --git a/prog/engine/drv/drv3d_DX12/texture_subresource_util.h b/prog/engine/drv/drv3d_DX12/texture_subresource_util.h new file mode 100644 index 000000000..84aab274e --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/texture_subresource_util.h @@ -0,0 +1,26 @@ +#pragma once + +#include "driver.h" + + +namespace drv3d_dx12 +{ + +inline constexpr UINT calculate_subresource_index(UINT mip_slice, UINT array_slice, UINT plane_slice, UINT mip_size, UINT array_size) +{ + return mip_slice + (array_slice * mip_size) + (plane_slice * mip_size * array_size); +} + +inline constexpr UINT calculate_mip_slice_from_index(UINT index, UINT mip_size) { return index % mip_size; } + +inline constexpr UINT calculate_array_slice_from_index(UINT index, UINT mip_size, UINT array_size) +{ + return (index / mip_size) % array_size; +} + +inline constexpr UINT calculate_plane_slice_from_index(UINT index, UINT mip_size, UINT array_size) +{ + return (index / mip_size) / array_size; +} + +} // namespace drv3d_dx12 diff --git a/prog/engine/drv/drv3d_DX12/typed_bit_set.h b/prog/engine/drv/drv3d_DX12/typed_bit_set.h new file mode 100644 index 000000000..4e39fea04 --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/typed_bit_set.h @@ -0,0 +1,90 @@ +#pragma once + +#include + + +template (T::COUNT)> +class TypedBitSet : private eastl::bitset +{ + using BaseType = eastl::bitset; + +public: + using BaseType::all; + using BaseType::any; + using BaseType::count; + using BaseType::flip; + using BaseType::none; + // using BaseType::to_string; + using BaseType::to_ulong; + // using BaseType::to_ullong; + using BaseType::size; + using typename BaseType::reference; + + TypedBitSet() = default; + TypedBitSet(const TypedBitSet &) = default; + ~TypedBitSet() = default; + + TypedBitSet &operator=(const TypedBitSet &) = default; + + bool operator[](T index) const { return (*this)[static_cast(index)]; } + typename BaseType::reference operator[](T index) { return 
(*this)[static_cast(index)]; } + bool test(T index) const { return BaseType::test(static_cast(index)); } + + TypedBitSet &set() + { + BaseType::set(); + return *this; + } + + TypedBitSet &set(T index, bool value = true) + { + BaseType::set(static_cast(index), value); + return *this; + } + + TypedBitSet &reset() + { + BaseType::reset(); + return *this; + } + + TypedBitSet &reset(T index) + { + BaseType::reset(static_cast(index)); + return *this; + } + + TypedBitSet operator~() const + { + auto cpy = *this; + cpy.flip(); + return cpy; + } + + bool operator==(const TypedBitSet &other) const { return BaseType::operator==(other); } + + bool operator!=(const TypedBitSet &other) const { return BaseType::operator!=(other); } + + // extended stuff + template + TypedBitSet &set(T0 v0, T1 v1, Ts... vs) + { + set(v0); + set(v1, vs...); + return *this; + } + + template + TypedBitSet &reset(T0 v0, T1 v1, Ts... vs) + { + reset(v0); + reset(v1, vs...); + return *this; + } + + template + TypedBitSet(T0 v0, Ts... 
vs) + { + set(v0, vs...); + } +}; diff --git a/prog/engine/drv/drv3d_DX12/util.h b/prog/engine/drv/drv3d_DX12/util.h new file mode 100644 index 000000000..4b273c125 --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/util.h @@ -0,0 +1,60 @@ +#pragma once + +#include +#include + + +template +inline eastl::bitset &or_bit(eastl::bitset &s, size_t i, bool b = true) +{ + return s.set(i, s.test(i) || b); +} + +template +inline char *append_literal(char *at, char *ed, const char (&lit)[N]) +{ + auto d = static_cast(ed - at); + auto c = min(d, N - 1); + memcpy(at, lit, c); + return at + c; +} + +template +inline char *append_or_mask_value_name(char *beg, char *at, char *ed, const char (&name)[N]) +{ + if (beg != at) + { + at = append_literal(at, ed, " | "); + } + return append_literal(at, ed, name); +} + +template +inline T align_value(T value, T alignment) +{ + return (value + alignment - 1) & ~(alignment - 1); +} + +template +inline eastl::span string_literal_span(const char (&sl)[N]) +{ + return {sl, N - 1}; +} + +template +inline eastl::span string_literal_span(const wchar_t (&sl)[N]) +{ + return {sl, N - 1}; +} + +// Applies function object 'f' to each bit index of each set bit in bit mask 'm'. 
+template +inline void for_each_set_bit(uint32_t m, F f) +{ + while (0 != m) + { + uint32_t i = __bsf_unsafe(m); + m ^= 1u << i; + f(i); + } +} diff --git a/prog/engine/drv/drv3d_DX12/variant_vector.h b/prog/engine/drv/drv3d_DX12/variant_vector.h index c6931807f..3cc77210c 100644 --- a/prog/engine/drv/drv3d_DX12/variant_vector.h +++ b/prog/engine/drv/drv3d_DX12/variant_vector.h @@ -5,8 +5,13 @@ #include #include #include +#include #include #include +#include + +#include "driver.h" + template struct TypePack; diff --git a/prog/engine/drv/drv3d_DX12/viewport_state.h b/prog/engine/drv/drv3d_DX12/viewport_state.h new file mode 100644 index 000000000..0dc0ac9ae --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/viewport_state.h @@ -0,0 +1,103 @@ +#pragma once + +#include "driver.h" + + +namespace drv3d_dx12 +{ + +struct ViewportState +{ + int x; + int y; + int width; + int height; + float minZ; + float maxZ; + + ViewportState() = default; + + ViewportState(const D3D12_VIEWPORT &vp) + { + x = vp.TopLeftX; + y = vp.TopLeftY; + width = vp.Width; + height = vp.Height; + minZ = vp.MinDepth; + maxZ = vp.MaxDepth; + } + + D3D12_RECT asRect() const + { + D3D12_RECT result; + result.left = x; + result.top = y; + result.right = x + width; + result.bottom = y + height; + + return result; + } + + operator D3D12_VIEWPORT() const + { + D3D12_VIEWPORT result; + result.TopLeftX = x; + result.TopLeftY = y; + result.Width = width; + result.Height = height; + result.MinDepth = minZ; + result.MaxDepth = maxZ; + return result; + } +}; + +inline bool operator==(const ViewportState &l, const ViewportState &r) +{ +#define CMP_P(n) (l.n == r.n) + return CMP_P(x) && CMP_P(y) && CMP_P(width) && CMP_P(height) && CMP_P(minZ) && CMP_P(maxZ); +#undef CMP_P +} +inline bool operator!=(const ViewportState &l, const ViewportState &r) { return !(l == r); } +enum class RegionDifference +{ + EQUAL, + SUBSET, + SUPERSET +}; + +inline RegionDifference classify_viewport_diff(const ViewportState &from, const 
ViewportState &to) +{ + const int32_t dX = to.x - from.x; + const int32_t dY = to.y - from.y; + const int32_t dW = (to.width + to.x) - (from.width + from.x); + const int32_t dH = (to.height + to.y) - (from.height + from.y); + + RegionDifference rectDif = RegionDifference::EQUAL; + // if all zero, then they are the same + if (dX | dY | dW | dH) + { + // can be either subset or completely different + if (dX >= 0 && dY >= 0 && dW <= 0 && dH <= 0) + { + rectDif = RegionDifference::SUBSET; + } + else + { + rectDif = RegionDifference::SUPERSET; + } + } + + if (RegionDifference::SUPERSET != rectDif) + { + // min/max z only affect viewport but not render regions, so it is always a subset if it has + // changed + if (to.maxZ != from.maxZ || to.minZ != from.minZ) + { + return RegionDifference::SUBSET; + } + } + return rectDif; +} + + +} // namespace drv3d_dx12 diff --git a/prog/engine/drv/drv3d_DX12/winapi_helpers.h b/prog/engine/drv/drv3d_DX12/winapi_helpers.h new file mode 100644 index 000000000..1841657d9 --- /dev/null +++ b/prog/engine/drv/drv3d_DX12/winapi_helpers.h @@ -0,0 +1,40 @@ +#pragma once + +#include +#include + + +struct UnloadLibHandler +{ + typedef HMODULE pointer; + void operator()(HMODULE lib) + { + if (lib) + { + FreeLibrary(lib); + } + } +}; + +using LibPointer = eastl::unique_ptr; + +struct GenericHandleHandler +{ + typedef HANDLE pointer; + void operator()(pointer h) + { + if (h != nullptr && h != INVALID_HANDLE_VALUE) + CloseHandle(h); + } +}; + +using HandlePointer = eastl::unique_ptr; +using EventPointer = eastl::unique_ptr; + +struct VirtaulAllocMemoryHandler +{ + void operator()(void *ptr) { VirtualFree(ptr, 0, MEM_RELEASE); } +}; + +template +using VirtualAllocPtr = eastl::unique_ptr; diff --git a/prog/engine/drv/drv3d_Metal/d3d_buffers.mm b/prog/engine/drv/drv3d_Metal/d3d_buffers.mm index f1849162d..278639ff4 100644 --- a/prog/engine/drv/drv3d_Metal/d3d_buffers.mm +++ b/prog/engine/drv/drv3d_Metal/d3d_buffers.mm @@ -11,6 +11,8 @@ #include 
<3d/dag_drv3d.h> +#include + #include "render.h" using namespace drv3d_metal; @@ -56,20 +58,20 @@ Sbuffer *d3d::create_vb(int size, int flg, const char* name) { flg |= SBCF_BIND_VERTEX; - d3d::validate_sbuffer_flags(flg, name); + validate_sbuffer_flags(flg, name); return new Buffer(size, 0, flg, 0, name); } Sbuffer *d3d::create_ib(int size, int flg, const char *name) { flg |= SBCF_BIND_INDEX; - d3d::validate_sbuffer_flags(flg, name); + validate_sbuffer_flags(flg, name); return new Buffer(size, 0, flg, 0, name); } Sbuffer *d3d::create_sbuffer(int struct_size, int elements, unsigned flags, unsigned format, const char* name) { - d3d::validate_sbuffer_flags(flags, name); + validate_sbuffer_flags(flags, name); return new Buffer(elements, struct_size, flags, format, name); } diff --git a/prog/engine/drv/drv3d_Metal/d3d_rtarget.mm b/prog/engine/drv/drv3d_Metal/d3d_rtarget.mm index 940db9d6c..607b90515 100644 --- a/prog/engine/drv/drv3d_Metal/d3d_rtarget.mm +++ b/prog/engine/drv/drv3d_Metal/d3d_rtarget.mm @@ -298,7 +298,7 @@ bool set_render_target_impl(int rt_index, ::BaseTexture* rt, int level, int laye bool d3d::set_render_target() { nextRtState.setBackbufColor(); - nextRtState.setBackbufDepth(); + nextRtState.removeDepth(); nextRtState.changed = true; vp.used = false; diff --git a/prog/engine/drv/drv3d_Metal/d3d_shaders.mm b/prog/engine/drv/drv3d_Metal/d3d_shaders.mm index 0194fbd6d..d2cb25324 100644 --- a/prog/engine/drv/drv3d_Metal/d3d_shaders.mm +++ b/prog/engine/drv/drv3d_Metal/d3d_shaders.mm @@ -66,7 +66,7 @@ } //if strides & streams are unset, will get them from VDECL -PROGRAM d3d::create_program_cs(const uint32_t *cs_native) +PROGRAM d3d::create_program_cs(const uint32_t *cs_native, CSPreloaded) { return render.createComputeProgram((const uint8_t*)cs_native); } diff --git a/prog/engine/drv/drv3d_Metal/d3d_texture.mm b/prog/engine/drv/drv3d_Metal/d3d_texture.mm index e161a9877..6c6226017 100644 --- a/prog/engine/drv/drv3d_Metal/d3d_texture.mm +++ 
b/prog/engine/drv/drv3d_Metal/d3d_texture.mm @@ -340,6 +340,17 @@ bool check_texformat(int cflg, int resType) return check_texformat(cflg, RES3D_TEX); } +int d3d::get_max_sample_count(int cflg) +{ + for (int samples = get_sample_count(TEXCF_SAMPLECOUNT_MAX); samples; samples >>= 1) + { + if ([render.device supportsTextureSampleCount:samples]) + return samples; + } + + return 1; +} + /// check whether this cube texture format is available /// returns false if cube texture of the specified format can't be created bool d3d::check_cubetexformat(int cflg) diff --git a/prog/engine/drv/drv3d_Metal/render.mm b/prog/engine/drv/drv3d_Metal/render.mm index 29e5f30fd..7e03439b5 100644 --- a/prog/engine/drv/drv3d_Metal/render.mm +++ b/prog/engine/drv/drv3d_Metal/render.mm @@ -735,7 +735,7 @@ static void PatchShader(int shader, int num_reg, int num_tex) { float clear_mesh[8] = { -1,-1, -1, 1, 1,-1, 1, 1 }; - clear_mesh_buffer = d3d::create_vb(sizeof(clear_mesh), SBCF_MAYBELOST, "metal system clear"); + clear_mesh_buffer = d3d::create_vb(sizeof(clear_mesh), 0, "metal system clear"); float *verts = NULL; clear_mesh_buffer->lock(0, 0, (void**)&verts, VBLOCK_WRITEONLY); @@ -2188,7 +2188,7 @@ static void fillAttachement(MTLRenderPassAttachmentDescriptor* desc, int i, Text #endif { desc.texture = tex ? tex->apiTex->rt_texture : nil; - desc.storeAction = tex && (tex->cflg & TEXFMT_MASK) == TEXFMT_MSAA_MAX_SAMPLES ? MTLStoreActionDontCare : MTLStoreActionStore; + desc.storeAction = tex && (tex->cflg & TEXCF_SAMPLECOUNT_MASK) ? 
MTLStoreActionDontCare : MTLStoreActionStore; desc.level = level; } } diff --git a/prog/engine/drv/drv3d_Metal/shadersPreCache.mm b/prog/engine/drv/drv3d_Metal/shadersPreCache.mm index c74d4fca4..2e862c070 100644 --- a/prog/engine/drv/drv3d_Metal/shadersPreCache.mm +++ b/prog/engine/drv/drv3d_Metal/shadersPreCache.mm @@ -9,6 +9,7 @@ #include #include #include +#include namespace drv3d_metal { @@ -551,13 +552,21 @@ static uint64_t buildRenderStateHash(const Program::RenderState& rstate) id ShadersPreCache::compileShader(const QueuedShader& shader) { + bool is_binary = shader.data.size() > 4 && memcmp(shader.data.data(), "MTLB", 4) == 0; +#if DAGOR_DBGLEVEL > 0 + auto newline = eastl::find(shader.data.begin(), shader.data.end(), '\n'); + eastl::string name = is_binary || newline == shader.data.end() ? shader.entry + : eastl::string((char*)shader.data.data(), eastl::distance(shader.data.begin(), newline)); + TIME_PROFILE_NAME(compile_shader, name.c_str()); +#else TIME_PROFILE(compile_shader); +#endif id func = nil; id lib = nil; NSError* err = nil; - if (shader.data.size() > 4 && memcmp(shader.data.data(), "MTLB", 4) == 0) + if (is_binary) { dispatch_data_t buffer = dispatch_data_create(shader.data.data(), shader.data.size(), nil, DISPATCH_DATA_DESTRUCTOR_DEFAULT); lib = [drv3d_metal::render.device newLibraryWithData:buffer error:&err]; diff --git a/prog/engine/drv/drv3d_Metal/texture.mm b/prog/engine/drv/drv3d_Metal/texture.mm index 6f8821c79..e0c4197bd 100644 --- a/prog/engine/drv/drv3d_Metal/texture.mm +++ b/prog/engine/drv/drv3d_Metal/texture.mm @@ -89,7 +89,6 @@ static bool add_texture_to_list(drv3d_metal::Texture::ApiTexture *t) //case TEXFMT_V16U16: return MTLPixelFormatBC1_RGBA; case TEXFMT_L16: return MTLPixelFormatR16Unorm; case TEXFMT_A8: return MTLPixelFormatA8Unorm; - case TEXFMT_MSAA_MAX_SAMPLES: case TEXFMT_L8: return MTLPixelFormatR8Unorm; #if _TARGET_PC_MACOSX case TEXFMT_A1R5G5B5: return MTLPixelFormatBGRA8Unorm; @@ -315,7 +314,6 @@ static bool 
add_texture_to_list(drv3d_metal::Texture::ApiTexture *t) } case TEXFMT_A8: case TEXFMT_L8: - case TEXFMT_MSAA_MAX_SAMPLES: { break; } @@ -582,10 +580,10 @@ static void setTexName(id tex, const char* name) width = w; height = h; - if (fmt == TEXFMT_MSAA_MAX_SAMPLES) + if (fmt & TEXCF_SAMPLECOUNT_MASK) { memoryless = true; - samples = [render.device supportsTextureSampleCount:8] ? 8 : 4; + samples = get_sample_count(fmt); } if (l < 1) diff --git a/prog/engine/drv/drv3d_commonCode/free_list_utils.h b/prog/engine/drv/drv3d_commonCode/free_list_utils.h index 5f5ec9278..5c96bb032 100644 --- a/prog/engine/drv/drv3d_commonCode/free_list_utils.h +++ b/prog/engine/drv/drv3d_commonCode/free_list_utils.h @@ -1,5 +1,10 @@ #pragma once +#include +#include +#include + + // Generic free list insert algorithm. A free list is a container that // is ValueRange compatible and sorted by increasing 'front' member value. // The algorithm linearly searches for the position at the container that could diff --git a/prog/engine/drv/drv3d_commonCode/genericSbufferImplementation.h b/prog/engine/drv/drv3d_commonCode/genericSbufferImplementation.h index c5c29bcbe..80db9d68c 100644 --- a/prog/engine/drv/drv3d_commonCode/genericSbufferImplementation.h +++ b/prog/engine/drv/drv3d_commonCode/genericSbufferImplementation.h @@ -1,5 +1,13 @@ #pragma once +#include +#include <3d/dag_drv3d.h> +#include +#include + +#include "drv_returnAddrStore.h" + + template class GenericBufferErrorHandler { @@ -57,15 +65,6 @@ class GenericBufferErrorHandler logerr("indirect buffer can't be structured one in DX11, check <%s>", name); } - static void errorUAVWithSysmemCopy(const char *name) - { -#if DAGOR_DBGLEVEL > 0 - logerr("Unordered access buffer, shouldn't have system copy, add SBCF_MAYBELOST to <%s>", name); -#else - G_UNUSED(name); -#endif - } - static void errorDiscardWithoutPointer(const char *name) { logerr("%s: Discarded buffer '%s' without providing output pointer", name); @@ -96,7 +95,6 @@ class 
GenericBufferErrorHandler static void errorInvalidStructSizeForRawView(const char *, uint32_t) {} static void errorAllocationOfBufferFailed(const char *, uint32_t, uint32_t, uint32_t) {} static void errorStructuredIndirect(const char *) {} - static void errorUAVWithSysmemCopy(const char *) {} static void errorDiscardWithoutPointer(const char *) {} static void errorFormatUsedWithInvalidUsageFlags(const char *) {} }; @@ -169,7 +167,7 @@ class GenericBufferMemoryArchitecture protected: bool allocateHostCopyMemory(uint32_t buf_flags, uint32_t size) { - if (!(buf_flags & (SBCF_MAYBELOST | SBCF_DYNAMIC)) && + if (!(buf_flags & SBCF_DYNAMIC) && ((buf_flags & SBCF_BIND_MASK) == SBCF_BIND_VERTEX || (buf_flags & SBCF_BIND_MASK) == SBCF_BIND_INDEX)) { localBuffer = eastl::make_unique(size); @@ -618,11 +616,6 @@ class GenericSbufferImplementation final : public GenericBufferMemoryArchitectur { setResName(stat_name); - if ((bufFlags & SBCF_BIND_UNORDERED) && !(bufFlags & SBCF_MAYBELOST)) // fixup - { - BufferErrorHandler::errorUAVWithSysmemCopy(stat_name); - bufFlags |= SBCF_MAYBELOST; - } BufferMemoryArchitecture::allocateHostCopyMemory(bufFlags, bufSize); validate_buffer_properties(bufFlags, format_flags, stat_name); @@ -1207,16 +1200,7 @@ class GenericSbufferImplementation final : public GenericBufferMemoryArchitectur if (!T::isValidBuffer(buffer)) return; - if (!BufferReloadImplementation::executeReload(this)) - { - if (!(bufFlags & SBCF_MAYBELOST) && BufferMemoryArchitecture::hasHostCopy()) - { - void *p = nullptr; - d3d_err(lock(0, 0, &p, VBLOCK_WRITEONLY)); - // no need to copy, returned pointer is host memory - d3d_err(unlock()); - } - } + BufferReloadImplementation::executeReload(this); } void deviceWritesTo() { diff --git a/prog/engine/drv/drv3d_commonCode/init_d3di.inc.cpp b/prog/engine/drv/drv3d_commonCode/init_d3di.inc.cpp index 8749bc15b..ec2f81b2a 100644 --- a/prog/engine/drv/drv3d_commonCode/init_d3di.inc.cpp +++ 
b/prog/engine/drv/drv3d_commonCode/init_d3di.inc.cpp @@ -41,6 +41,7 @@ bool d3d::fill_interface_table(D3dInterfaceTable &d3dit) FILL_ENTRY(get_texformat_usage); FILL_ENTRY(check_texformat); + FILL_ENTRY(get_max_sample_count); FILL_ENTRY(issame_texformat); FILL_ENTRY(check_cubetexformat); FILL_ENTRY(issame_cubetexformat); diff --git a/prog/engine/drv/drv3d_commonCode/validate_sbuf_flags.h b/prog/engine/drv/drv3d_commonCode/validate_sbuf_flags.h new file mode 100644 index 000000000..5ffee4a41 --- /dev/null +++ b/prog/engine/drv/drv3d_commonCode/validate_sbuf_flags.h @@ -0,0 +1,7 @@ +#pragma once + +inline void validate_sbuffer_flags(unsigned flags, const char *name) +{ + G_UNUSED(flags); + G_UNUSED(name); +} diff --git a/prog/engine/drv/drv3d_null/d3d_stub.cpp b/prog/engine/drv/drv3d_null/d3d_stub.cpp index ca69360db..9064982a2 100644 --- a/prog/engine/drv/drv3d_null/d3d_stub.cpp +++ b/prog/engine/drv/drv3d_null/d3d_stub.cpp @@ -61,6 +61,7 @@ bool d3d::get_event_query_status(D3dEventQuery *q, bool force_flush) { return fa unsigned d3d::get_texformat_usage(int cflg, int restype) { return 0; } bool d3d::check_texformat(int cflg) { return false; } +int d3d::get_max_sample_count(int cflg) { return 0; } bool d3d::issame_texformat(int cflg1, int cflg2) { return false; } bool d3d::check_cubetexformat(int cflg) { return false; } bool d3d::issame_cubetexformat(int cflg1, int cflg2) { return false; } @@ -117,7 +118,7 @@ PROGRAM d3d::create_program(const uint32_t *vpr_native, const uint32_t *fsh_nati return BAD_PROGRAM; } -PROGRAM d3d::create_program_cs(const uint32_t *cs_native) { return BAD_PROGRAM; } +PROGRAM d3d::create_program_cs(const uint32_t *cs_native, CSPreloaded) { return BAD_PROGRAM; } bool d3d::set_program(PROGRAM) { return false; } void d3d::delete_program(PROGRAM) {} diff --git a/prog/engine/drv/drv3d_stub/d3d_stub.cpp b/prog/engine/drv/drv3d_stub/d3d_stub.cpp index dbd93827c..56191dbd0 100644 --- a/prog/engine/drv/drv3d_stub/d3d_stub.cpp +++ 
b/prog/engine/drv/drv3d_stub/d3d_stub.cpp @@ -894,6 +894,7 @@ unsigned d3d::get_texformat_usage(int cflg, int /*restype*/) return ret; } bool d3d::check_texformat(int /*cflg*/) { return true; } +int d3d::get_max_sample_count(int) { return 1; } bool d3d::issame_texformat(int cflg1, int cflg2) { return cflg1 == cflg2; } bool d3d::check_cubetexformat(int /*cflg*/) { return true; } bool d3d::issame_cubetexformat(int cflg1, int cflg2) { return cflg1 == cflg2; } @@ -1037,7 +1038,7 @@ PROGRAM d3d::create_program(VPROG, FSHADER, VDECL, unsigned *, unsigned) { retur PROGRAM d3d::create_program(const uint32_t *, const uint32_t *, VDECL, unsigned *, unsigned) { return 1; } -PROGRAM d3d::create_program_cs(const uint32_t * /*cs_native*/) { return 1; } +PROGRAM d3d::create_program_cs(const uint32_t * /*cs_native*/, CSPreloaded) { return 1; } bool d3d::set_program(PROGRAM) { return true; } void d3d::delete_program(PROGRAM) {} @@ -1092,7 +1093,7 @@ void d3d::insert_wait_on_fence(GPUFENCEHANDLE & /*fence*/, GpuPipeline /*gpu_pip bool d3d::set_render_target() { currentRtState.setBackbufColor(); - currentRtState.setBackbufDepth(); + currentRtState.removeDepth(); return true; } diff --git a/prog/engine/drv/drv3d_use_d3di/d3d_wrap.cpp b/prog/engine/drv/drv3d_use_d3di/d3d_wrap.cpp index 37e8892de..d202e2a84 100644 --- a/prog/engine/drv/drv3d_use_d3di/d3d_wrap.cpp +++ b/prog/engine/drv/drv3d_use_d3di/d3d_wrap.cpp @@ -26,6 +26,7 @@ bool should_use_compute_for_image_processing(std::initializer_list for } bool check_texformat(int cflg) { return d3di.check_texformat(cflg); } +int d3d::get_max_sample_count(int cflg) { return d3di.get_max_sample_count(cflg); } unsigned get_texformat_usage(int cflg, int restype) { return d3di.get_texformat_usage(cflg, restype); } bool issame_texformat(int cflg1, int cflg2) { return d3di.issame_texformat(cflg1, cflg2); } bool check_cubetexformat(int cflg) { return d3di.check_cubetexformat(cflg); } @@ -112,7 +113,7 @@ PROGRAM create_program(const uint32_t 
*vpr_native, const uint32_t *fsh_native, V return d3di.create_program_1(vpr_native, fsh_native, vdecl, strides, streams); } -PROGRAM create_program_cs(const uint32_t *cs_native) { return d3di.create_program_cs(cs_native); } +PROGRAM create_program_cs(const uint32_t *cs_native, CSPreloaded preloaded) { return d3di.create_program_cs(cs_native, preloaded); } bool set_program(PROGRAM p) { return d3di.set_program(p); } void delete_program(PROGRAM p) { return d3di.delete_program(p); } diff --git a/prog/engine/drv/drv3d_vulkan/d3d_framebuffer.cpp b/prog/engine/drv/drv3d_vulkan/d3d_framebuffer.cpp index 1e51eec5b..fb09f9cfc 100644 --- a/prog/engine/drv/drv3d_vulkan/d3d_framebuffer.cpp +++ b/prog/engine/drv/drv3d_vulkan/d3d_framebuffer.cpp @@ -117,7 +117,7 @@ bool d3d::set_render_target() la.pipeState.set({0, Bind::back_buffer}); la.pipeState.set( - {MRT_INDEX_DEPTH_STENCIL, Bind::back_buffer}); + {MRT_INDEX_DEPTH_STENCIL, Bind::empty}); la.pipeState.set(false); for (uint32_t i = 1; i < Driver3dRenderTarget::MAX_SIMRT; ++i) la.pipeState.set({i, Bind::empty}); diff --git a/prog/engine/drv/drv3d_vulkan/device.cpp b/prog/engine/drv/drv3d_vulkan/device.cpp index 963f91b36..8af3337df 100644 --- a/prog/engine/drv/drv3d_vulkan/device.cpp +++ b/prog/engine/drv/drv3d_vulkan/device.cpp @@ -583,6 +583,15 @@ VkFormatFeatureFlags Device::getFormatFeatures(VkFormat format) return props.optimalTilingFeatures; } +VkSampleCountFlags Device::getFormatSamples(VkFormat format, VkImageType type, VkImageTiling tiling, VkImageUsageFlags usage) +{ + VkImageFormatProperties properties; + VkResult result = VULKAN_CHECK_RESULT(device.getInstance().vkGetPhysicalDeviceImageFormatProperties(physicalDeviceInfo.device, + format, type, tiling, usage, 0, &properties)); + + return VULKAN_OK(result) ? 
properties.sampleCounts : 0; +} + bool Device::init(VulkanInstance &inst, const Config &ucfg, const PhysicalDeviceSet &set, const VkDeviceCreateInfo &config, const SwapchainMode &swc_info, const DeviceQueueGroup::Info &queue_info) { @@ -637,6 +646,7 @@ bool Device::init(VulkanInstance &inst, const Config &ucfg, const PhysicalDevice context.getBackend().contextState.frame.start(); shaders::RenderState emptyRS{}; context.getBackend().contextState.renderStateSystem.setRenderStateData((shaders::DriverRenderStateId)0, emptyRS, *this); + context.getBackend().pipelineCompiler.init(); context.initTimingRecord(); context.initTempBuffersConfiguration(); context.initMode(execMode); @@ -804,6 +814,14 @@ const DataBlock *Device::getPerDriverPropertyBlock(const char *prop_name) ret = entry.getBlockByNameEx("data"); }); + if (ret == &DataBlock::emptyBlock) + ret = ::dgs_get_settings() + ->getBlockByNameEx("vulkan") + ->getBlockByNameEx("vendor") + ->getBlockByNameEx("default") + ->getBlockByNameEx("driverProps") + ->getBlockByNameEx(prop_name); + return ret; } @@ -1162,25 +1180,6 @@ uint64_t Device::getGpuTimestampFrequency() const return uint64_t(1000000000.0 / physicalDeviceInfo.properties.limits.timestampPeriod); } -VkSampleCountFlagBits Device::calcMSAAQuality() const -{ - auto quality = ::dgs_get_settings()->getBlockByNameEx("vulkan")->getInt("msaaQuality", 4); - auto qualityLimit = physicalDeviceInfo.properties.limits.framebufferColorSampleCounts & - physicalDeviceInfo.properties.limits.framebufferDepthSampleCounts; - - while ((quality & qualityLimit) == 0 && quality > 1) - quality /= 2; - - static int lastQuality = -1; - if (quality != lastQuality) - { - debug("vulkan: using %d samples for MSAA", quality); - lastQuality = quality; - } - - return VkSampleCountFlagBits(quality); -} - Image *Device::createImage(const ImageCreateInfo &ii) { Image::Description::TrimmedCreateInfo ici; diff --git a/prog/engine/drv/drv3d_vulkan/device.h b/prog/engine/drv/drv3d_vulkan/device.h 
index d2cbda3ea..b3ee8ff17 100644 --- a/prog/engine/drv/drv3d_vulkan/device.h +++ b/prog/engine/drv/drv3d_vulkan/device.h @@ -51,7 +51,7 @@ #endif #include "resource_manager.h" -#include "timeline.h" +#include "timelines.h" #include "execution_markers.h" namespace drv3d_vulkan @@ -168,6 +168,7 @@ class Device bool checkFormatSupport(VkFormat format, VkImageType type, VkImageTiling tiling, VkImageUsageFlags usage, VkImageCreateFlags flags, VkSampleCountFlags samples); VkFormatFeatureFlags getFormatFeatures(VkFormat format); + VkSampleCountFlags getFormatSamples(VkFormat format, VkImageType type, VkImageTiling tiling, VkImageUsageFlags usage); bool init(VulkanInstance &inst, const Config &ucfg, const PhysicalDeviceSet &set, const VkDeviceCreateInfo &config, const SwapchainMode &swc_info, const DeviceQueueGroup::Info &queue_info); bool isInitialized() const; @@ -216,8 +217,6 @@ class Device bool hasCreateRenderPass2() const; uint32_t getCurrentAvailableMemoryKb(); - VkSampleCountFlagBits calcMSAAQuality() const; - VulkanShaderModuleHandle makeVkModule(const ShaderModuleBlob *module); VkDevice getDevice() const { return device.get(); } diff --git a/prog/engine/drv/drv3d_vulkan/device_context.cpp b/prog/engine/drv/drv3d_vulkan/device_context.cpp index b03559912..8226c987e 100644 --- a/prog/engine/drv/drv3d_vulkan/device_context.cpp +++ b/prog/engine/drv/drv3d_vulkan/device_context.cpp @@ -607,6 +607,7 @@ void DeviceContext::shutdown() executionMode = ExecutionMode::INVALID; back.contextState.bindlessManagerBackend.shutdown(vkDev); + back.pipelineCompiler.shutdown(); back.contextState.frame.end(); front.replayRecord.end(); } @@ -732,7 +733,7 @@ void DeviceContext::setPipelineCompilationTimeBudget(unsigned usecs) uint32_t DeviceContext::getPiplineCompilationQueueLength() { VULKAN_LOCK_FRONT(); - return front.replayRecord->skippedGraphicsPipelines; + return back.pipelineCompiler.getQueueLength(); } size_t DeviceContext::getCurrentWorkItemId() diff --git 
a/prog/engine/drv/drv3d_vulkan/device_context.h b/prog/engine/drv/drv3d_vulkan/device_context.h index a0660c4ee..6ad1a3c9d 100644 --- a/prog/engine/drv/drv3d_vulkan/device_context.h +++ b/prog/engine/drv/drv3d_vulkan/device_context.h @@ -14,7 +14,7 @@ #include "render_work.h" #include "temp_buffers.h" #include "frame_info.h" -#include "timeline.h" +#include "timelines.h" #include "image_resource.h" #include #include "execution_state.h" @@ -23,6 +23,7 @@ #include "util/fault_report.h" #include "bindless.h" #include "execution_sync.h" +#include "pipeline/compiler.h" namespace drv3d_vulkan { @@ -146,8 +147,7 @@ struct ContextBackend }; eastl::vector delayedDiscards; - unsigned pipelineCompilationTime = 0; - unsigned pipelineCompilationTimeBudget = unsigned(-1); + PipelineCompiler pipelineCompiler; int64_t gpuWaitDuration = 0; int64_t acquireBackBufferDuration = 0; @@ -157,7 +157,7 @@ struct ContextBackend int64_t lastMemoryStatTime = 0; int64_t memoryStatisticsPeriod = 0; - ContextBackend(TimelineManager &tl_man) : contextState(tl_man) + ContextBackend(TimelineManager &tl_man) : contextState(tl_man), pipelineCompiler(tl_man) { executionState.reset(); pipelineState.reset(); diff --git a/prog/engine/drv/drv3d_vulkan/device_context/base_context.cpp b/prog/engine/drv/drv3d_vulkan/device_context/base_context.cpp index 70f6e151c..5dd83cff9 100644 --- a/prog/engine/drv/drv3d_vulkan/device_context/base_context.cpp +++ b/prog/engine/drv/drv3d_vulkan/device_context/base_context.cpp @@ -11,7 +11,6 @@ ExecutionContext::ExecutionContext(RenderWork &work_item) : vkDev(get_device().getVkDevice()) { back.executionState.setExecutionContext(this); - back.pipelineCompilationTime = 0; #if VULKAN_VALIDATION_COLLECT_CALLER > 0 tlsDbgActiveInstance = this; #endif diff --git a/prog/engine/drv/drv3d_vulkan/device_context/execution_context.cpp b/prog/engine/drv/drv3d_vulkan/device_context/execution_context.cpp index 59e87f6af..4a8172588 100644 --- 
a/prog/engine/drv/drv3d_vulkan/device_context/execution_context.cpp +++ b/prog/engine/drv/drv3d_vulkan/device_context/execution_context.cpp @@ -289,7 +289,7 @@ void ExecutionContext::flushImageUploads() bool anyBindless = false; for (auto &&upload : data.imageUploads) { - if (!upload.image->isGPUWritable() && (upload.image->getUsage() & VK_IMAGE_USAGE_SAMPLED_BIT)) + if (!upload.image->isSampledSRV()) { upload.image->layout.roSealTargetLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; upload.image->requestRoSeal(data.id); @@ -482,8 +482,9 @@ void ExecutionContext::flush(ThreadedFence *fence) back.contextState.cmdListsToSubmit.clear(); - onFrameCoreReset(); + + back.pipelineCompiler.processQueued(); flushProcessed = true; } @@ -1217,6 +1218,16 @@ void ExecutionContext::copyImage(Image *src, Image *dst, uint32_t src_mip, uint3 VULKAN_LOG_CALL(vkDev.vkCmdCopyImage(frameCore, src->getHandle(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst->getHandle(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, region_count, data.imageCopyInfos.data() + first_region)); + + if (dst->isSampledSRV()) + dst->layout.roSealTargetLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + + // we can't know is it readed or not, so assume readed + if (dst->isUsedInBindless()) + { + trackBindlessRead(dst); + back.syncTrack.completeNeeded(frameCore, vkDev); + } } void ExecutionContext::blitImage(Image *src, Image *dst, const VkImageBlit ®ion) @@ -1583,4 +1594,4 @@ void ExecutionContext::trackBindlessRead(Image *img) back.syncTrack.addImageAccess(ExecutionSyncTracker::LogicAddress::forImageBindlessRead(), img, srvLayout, {0, img->getMipLevels(), 0, img->getArrayLayers()}); -} \ No newline at end of file +} diff --git a/prog/engine/drv/drv3d_vulkan/device_context_cmd.inc b/prog/engine/drv/drv3d_vulkan/device_context_cmd.inc index 81c0d85a4..03db5adc0 100644 --- a/prog/engine/drv/drv3d_vulkan/device_context_cmd.inc +++ b/prog/engine/drv/drv3d_vulkan/device_context_cmd.inc @@ -135,8 +135,7 @@ 
VULKAN_END_CONTEXT_COMMAND VULKAN_BEGIN_CONTEXT_COMMAND(CompileComputePipeline) #if VULKAN_CONTEXT_COMMAND_IMPLEMENTATION - ctx.back.executionState.set(ActiveExecutionStage::COMPUTE); - ctx.flushComputeState(); +// nothing to do #endif VULKAN_END_CONTEXT_COMMAND @@ -1009,7 +1008,7 @@ VULKAN_END_CONTEXT_COMMAND VULKAN_BEGIN_CONTEXT_COMMAND(PipelineCompilationTimeBudget) VULKAN_CONTEXT_COMMAND_PARAM(unsigned, usecs) #if VULKAN_CONTEXT_COMMAND_IMPLEMENTATION - ctx.back.pipelineCompilationTimeBudget = usecs; + drv3d_vulkan::get_device().pipeMan.setAsyncCompile(usecs == 0); #endif VULKAN_END_CONTEXT_COMMAND diff --git a/prog/engine/drv/drv3d_vulkan/execution_sync.cpp b/prog/engine/drv/drv3d_vulkan/execution_sync.cpp index 1e662893d..b3fa095de 100644 --- a/prog/engine/drv/drv3d_vulkan/execution_sync.cpp +++ b/prog/engine/drv/drv3d_vulkan/execution_sync.cpp @@ -101,7 +101,9 @@ struct OpsProcessAlgorithm // add other parts if any while (scratch.coverageMap.getArea(cachedArea)) { - ops.arr.push_back(srcOp); + ops.arr.push_back_uninitialized(); + // read from proper memory if vector was reallocated + ops.arr.back() = ops.arr[srcOpIndex]; ops.arr.back().area = cachedArea; } @@ -431,6 +433,19 @@ void ExecutionSyncTracker::completeAll(VulkanCommandBufferHandle cmd_buffer, con if (op.laddr.isWrite() && op.obj->isUsedInBindless()) logerr("vulkan: sync: image: incompleted write while registered in bindless, must handle it! 
%s", op.format()); + if (!op.laddr.isWrite() && op.obj->layout.roSealTargetLayout != VK_IMAGE_LAYOUT_UNDEFINED) + { + bool canSeal = true; + for (VkImageLayout i : op.obj->layout.data) + if (i != op.obj->layout.roSealTargetLayout) + { + canSeal = false; + break; + } + if (canSeal) + op.obj->optionallyActivateRoSeal(gpu_work_id); + } + srcLA.merge(op.laddr); } diff --git a/prog/engine/drv/drv3d_vulkan/image_resource.h b/prog/engine/drv/drv3d_vulkan/image_resource.h index e0860cde9..baa0e552a 100644 --- a/prog/engine/drv/drv3d_vulkan/image_resource.h +++ b/prog/engine/drv/drv3d_vulkan/image_resource.h @@ -87,6 +87,7 @@ struct ImageLayoutInfo void init(uint32_t mips, uint32_t layers, VkImageLayout initial) { + roSealTargetLayout = VK_IMAGE_LAYOUT_UNDEFINED; mipLevels = mips; data.resize(mips * layers); for (auto &&s : data) @@ -215,6 +216,8 @@ class Image : public ImageImplBase, public ResourceExecutionSyncableExtend (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_STORAGE_BIT); } + bool isSampledSRV() { return !isGPUWritable() && (getUsage() & VK_IMAGE_USAGE_SAMPLED_BIT); } + void addBindlessSlot(uint32_t slot) { bindlessSlots.push_back(slot); } void removeBindlessSlot(uint32_t slot) diff --git a/prog/engine/drv/drv3d_vulkan/jamfile b/prog/engine/drv/drv3d_vulkan/jamfile index bd75760f6..e2129d3b3 100644 --- a/prog/engine/drv/drv3d_vulkan/jamfile +++ b/prog/engine/drv/drv3d_vulkan/jamfile @@ -49,7 +49,7 @@ Sources = device_context/base_context.cpp device_context/render_pass.cpp render_work.cpp - timeline.cpp + timelines.cpp frame_info.cpp temp_buffers.cpp cleanup_queue.cpp @@ -73,6 +73,7 @@ Sources = pipeline/stage_state_base.cpp pipeline/main_pipelines.cpp pipeline/variated_graphics.cpp + pipeline/compiler.cpp render_state_system.cpp #resources & memory diff --git a/prog/engine/drv/drv3d_vulkan/physical_device_set.h b/prog/engine/drv/drv3d_vulkan/physical_device_set.h index 4f2847e17..ca9963e0e 100644 --- 
a/prog/engine/drv/drv3d_vulkan/physical_device_set.h +++ b/prog/engine/drv/drv3d_vulkan/physical_device_set.h @@ -17,13 +17,6 @@ struct PhysicalDeviceSet VkPhysicalDeviceSubgroupProperties subgroupProperties{}; eastl::vector extensions; - struct MsaaMaxSamplesDesc - { - VkFormat format; - VkSampleCountFlagBits samples; - }; - MsaaMaxSamplesDesc maxSamplesFormat = {VK_FORMAT_UNDEFINED, VK_SAMPLE_COUNT_1_BIT}; - #if VK_KHR_imageless_framebuffer VkPhysicalDeviceImagelessFramebufferFeaturesKHR imagelessFramebufferFeature = // {VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGELESS_FRAMEBUFFER_FEATURES_KHR, nullptr, false}; @@ -582,42 +575,11 @@ struct PhysicalDeviceSet formatProperties[i] = fp.formatProperties; } - setupMaxSamplesFormat(instance); - deviceLocalHeapSizeKb = calculateTotalAvailableDeviceLocalMemoryKb(); return true; } - void setupMaxSamplesFormat(VulkanInstance &instance) - { - VkFormat maxMsaaFormats[] = {VK_FORMAT_R8_UNORM, VK_FORMAT_R16_UNORM, VK_FORMAT_R8G8_UNORM, VK_FORMAT_B8G8R8A8_UNORM}; - int bestSamples = 1; - const int maxSamples = 8; - VkImageFormatProperties fmtProp; - - for (int formatNo = 0; formatNo < sizeof(maxMsaaFormats) / sizeof(maxMsaaFormats[0]); formatNo++) - { - for (int samples = maxSamples; samples > bestSamples; samples--) - { - VkResult result = instance.vkGetPhysicalDeviceImageFormatProperties(device, maxMsaaFormats[formatNo], VK_IMAGE_TYPE_2D, - VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, 0, &fmtProp); - if (VULKAN_FAIL(result)) - continue; - - if (fmtProp.sampleCounts & (1 << samples)) - { - maxSamplesFormat.format = maxMsaaFormats[formatNo]; - bestSamples = samples; - maxSamplesFormat.samples = (VkSampleCountFlagBits)(1 << samples); - break; - } - } - } - debug("vulkan: TEXFMT_MSAA_MAX_SAMPLES { samples %d, fmt %s } ", maxSamplesFormat.samples, - FormatStore::fromVkFormat(maxSamplesFormat.format).getNameString()); - } - void initUnextended(VulkanInstance &instance) { 
VULKAN_LOG_CALL(instance.vkGetPhysicalDeviceFeatures(device, &features)); diff --git a/prog/engine/drv/drv3d_vulkan/pipeline/base_pipeline.h b/prog/engine/drv/drv3d_vulkan/pipeline/base_pipeline.h index 202d91797..9c15975ef 100644 --- a/prog/engine/drv/drv3d_vulkan/pipeline/base_pipeline.h +++ b/prog/engine/drv/drv3d_vulkan/pipeline/base_pipeline.h @@ -7,6 +7,7 @@ #include #include #include +#include namespace drv3d_vulkan { @@ -153,7 +154,9 @@ template class BasePipeline : public CustomPipeline { public: - BasePipeline(PipelineLayoutType *iLayout) : CustomPipeline(iLayout), handle() {} + BasePipeline(PipelineLayoutType *iLayout) : + CustomPipeline(iLayout), handle(), compiledHandle() + {} void shutdown(VulkanDevice &device) { @@ -161,10 +164,67 @@ class BasePipeline : public CustomPipeline class TargetPipelineType> diff --git a/prog/engine/drv/drv3d_vulkan/pipeline/compiler.cpp b/prog/engine/drv/drv3d_vulkan/pipeline/compiler.cpp new file mode 100644 index 000000000..69557389d --- /dev/null +++ b/prog/engine/drv/drv3d_vulkan/pipeline/compiler.cpp @@ -0,0 +1,270 @@ +#include "compiler.h" +#include "main_pipelines.h" +#include "timelines.h" +#include "device.h" + +using namespace drv3d_vulkan; + +namespace +{ +// keep compiler as global in module, to not have overhead on storing its pointer in multiple places +static PipelineCompiler *g_compiler = nullptr; +} // namespace + +void PipelineCompileQueueItem::compile() +{ + switch (type) + { + case PipelineCompileQueueItemType::CS: cs->compile(); break; + case PipelineCompileQueueItemType::GR: gr->compile(); break; + default: G_ASSERTF(0, "vulkan: unknown compile queue item type %u", (uint8_t)type); break; + } + g_compiler->onItemCompiled(); +} + +bool PipelineCompileQueueItem::completed() +{ + switch (type) + { + case PipelineCompileQueueItemType::CS: return cs->checkCompiled(); break; + case PipelineCompileQueueItemType::GR: return gr->checkCompiled(); break; + default: G_ASSERTF(0, "vulkan: unknown compile queue item 
type %u", (uint8_t)type); break; + } + return false; +} + +void PipelineCompilerWork::init() {} + +void PipelineCompilerWork::submit() {} + +void PipelineCompilerWork::acquire(size_t) {} + +void PipelineCompilerWork::wait() { g_compiler->waitSecondaryWorkers(); } + +void PipelineCompilerWork::cleanup() { queue.clear(); } + +void PipelineCompilerWork::process() { g_compiler->compileBlock(queue); } + +void PipelineCompilerWork::shutdown() +{ + queue.clear(); + queue.shrink_to_fit(); +} + +void PipelineCompiler::PrimaryWorkerThread::execute() +{ + TIME_PROFILE_THREAD(getCurrentThreadName()); + compiler.startSecondaryWorkers(); + auto &compileQueue = get_device().timelineMan.get(); + while (!interlocked_acquire_load(terminating)) + { + // wait for at least one work item to be processed + if (compileQueue.waitSubmit(1, 1)) + compileQueue.advance(); + } +} + +void PipelineCompiler::SecondaryWorkerThread::execute() +{ + TIME_PROFILE_THREAD(getCurrentThreadName()); + + // initial startup + compiler.notifyWorkerCompleted(); + + while (!interlocked_acquire_load(terminating)) + { + if (os_event_wait(&wakeEvent, OS_WAIT_INFINITE) != OS_WAIT_OK) + continue; + + compiler.asyncCompileLoop(); + compiler.notifyWorkerCompleted(); + } +} + +void PipelineCompiler::loadConfig() +{ + int coreCount = cpujobs::get_core_count(); + const DataBlock *cfgBlk = get_device().getPerDriverPropertyBlock("pipelineCompiler"); + cfg.maxSecondaryThreads = min(coreCount, cfgBlk->getInt("maxSecondaryThreads", max(2, coreCount - 2))); + cfg.maxSecondaryThreads = min(cfg.maxSecondaryThreads, MAX_THREADS); + cfg.secondarySpawnThreshold = cfgBlk->getInt("secondarySpawnThreshold", 16); + cfg.minItemsPerSecondary = cfgBlk->getInt("minItemsPerSecondary", 4); + debug("vulkan: pipeline compiler: %u threads %u secondary threshold %u min items per secondary", cfg.maxSecondaryThreads, + cfg.secondarySpawnThreshold, cfg.minItemsPerSecondary); +} + +PipelineCompiler::PipelineCompiler(TimelineManager &tl_man) : 
timeBlock(tl_man), primaryWorker(*this) { g_compiler = this; } + +void PipelineCompiler::init() +{ + loadConfig(); + + os_event_create(&secondaryWorkersFinishEvent); + + timeBlock.start(); + primaryWorker.start(); +} + +void PipelineCompiler::shutdown() +{ + processQueuedBlocked(); + primaryWorker.terminate(true /*wait*/); + shutdownSecondaryWorkers(); + + os_event_destroy(&secondaryWorkersFinishEvent); + + cfg.maxSecondaryThreads = 0; + auto &compileQueue = get_device().timelineMan.get(); + while (compileQueue.waitSubmit(1, 1)) + compileQueue.advance(); + timeBlock.end(); +} + +void PipelineCompiler::queue(ComputePipeline *compute_pipe) +{ + PipelineCompileQueueItem qi{PipelineCompileQueueItemType::CS}; + qi.cs = compute_pipe; + timeBlock->queue.push_back(qi); + ++queueLength; +} + +void PipelineCompiler::queue(GraphicsPipeline *graphics_pipe) +{ + PipelineCompileQueueItem qi{PipelineCompileQueueItemType::GR}; + qi.gr = graphics_pipe; + timeBlock->queue.push_back(qi); + ++queueLength; +} + +void PipelineCompiler::onItemCompiled() { --queueLength; } + +void PipelineCompiler::waitFor(ComputePipeline *compute_pipe) +{ +#if VULKAN_LOAD_SHADER_EXTENDED_DEBUG_DATA + // if we waiting for CS pipeline that is async compiled - we doing something wrong in user code + logerr("vulkan: pipe compiler: blocked wait for CS %s", compute_pipe->printDebugInfoBuffered()); +#endif + TIME_PROFILE(vulkan_cs_pipe_wait); + processQueuedBlocked(); + ComputePipeline *pipe = compute_pipe; + spin_wait([pipe]() { return !pipe->checkCompiled(); }); +} + +void PipelineCompiler::waitFor(GraphicsPipeline *graphics_pipe) +{ +#if VULKAN_LOAD_SHADER_EXTENDED_DEBUG_DATA + // if we waiting for graphics pipeline that is async compiled - we doing something wrong in user code + // name is not known here unfortunately, but should be relatively visible in profiler + logerr("vulkan: pipe compiler: blocked wait for graphics pipeline"); +#endif + TIME_PROFILE(vulkan_gr_pipe_wait); + processQueuedBlocked(); + 
GraphicsPipeline *pipe = graphics_pipe; + spin_wait([pipe]() { return !pipe->checkCompiled(); }); +} + +void PipelineCompiler::processQueuedBlocked() +{ + if (!processQueued()) + { + TIME_PROFILE(vulkan_pipe_compiler_queue_block); + auto &compileQueue = get_device().timelineMan.get(); + constexpr size_t maxWaitCycles = 1000000; + while (!compileQueue.waitAcquireSpace(maxWaitCycles)) + logwarn("vulkan: long pipe compiler wait"); + if (!processQueued()) + fatal("vulkan: can't complete blocked wait for compiler timeline"); + } +} + +bool PipelineCompiler::processQueued() +{ + if (!timeBlock->queue.size()) + return true; + + auto &compileQueue = get_device().timelineMan.get(); + if (!compileQueue.waitAcquireSpace(0)) + return false; + timeBlock.restart(); + return true; +} + +size_t PipelineCompiler::getQueueLength() { return queueLength.load(); } + +void PipelineCompiler::compileBlock(eastl::vector &block) +{ + endItem = block.end(); + currentItem = block.begin(); + if (block.size() > cfg.secondarySpawnThreshold && cfg.maxSecondaryThreads) + { + TIME_PROFILE(vulkan_pipe_compiler_wake); + size_t threadsToStart = min(block.size() / cfg.minItemsPerSecondary, cfg.maxSecondaryThreads); + setPendingWorkers(threadsToStart); + for (int i = 0; i < threadsToStart; ++i) + secondaryWorkers[i]->wake(); + } + else + asyncCompileLoop(); +} + +void PipelineCompiler::asyncCompileLoop() +{ + TIME_PROFILE(vulkan_pipe_compiler_loop); + PipelineCompileQueueItem *acquiredItem = currentItem.fetch_add(1); + PipelineCompileQueueItem *end = endItem; + while (acquiredItem < end) + { + G_ASSERTF(!acquiredItem->completed(), "vulkan: trying to compile already completed item"); + acquiredItem->compile(); + acquiredItem = currentItem.fetch_add(1); + } +} + +void PipelineCompiler::startSecondaryWorkers() +{ + setPendingWorkers(cfg.maxSecondaryThreads); + for (int i = 0; i < cfg.maxSecondaryThreads; ++i) + { + secondaryWorkers[i] = eastl::make_unique(*this, i); + secondaryWorkers[i]->start(); + } + // 
wait threads to start + waitSecondaryWorkers(); +} + +void PipelineCompiler::notifyWorkerCompleted() +{ + if (--pendingWorkers == 0) + os_event_set(&secondaryWorkersFinishEvent); +} + +void PipelineCompiler::setPendingWorkers(size_t v) +{ + if (!v) + return; + pendingWorkers = v; + shouldWaitThreads = true; +} + +void PipelineCompiler::waitSecondaryWorkers() +{ + if (!shouldWaitThreads) + return; + + TIME_PROFILE(vulkan_pipe_compiler_wait_secondary); + if (os_event_wait(&secondaryWorkersFinishEvent, OS_WAIT_INFINITE) != OS_WAIT_OK) + fatal("vulkan: pipe compiler secondary wait failed"); + shouldWaitThreads = false; +} + +void PipelineCompiler::shutdownSecondaryWorkers() +{ + endItem = nullptr; + currentItem = nullptr; + setPendingWorkers(cfg.maxSecondaryThreads); + for (int i = 0; i < cfg.maxSecondaryThreads; ++i) + secondaryWorkers[i]->wakeAndTerminate(); + waitSecondaryWorkers(); + for (int i = 0; i < cfg.maxSecondaryThreads; ++i) + secondaryWorkers[i].reset(); +} \ No newline at end of file diff --git a/prog/engine/drv/drv3d_vulkan/pipeline/compiler.h b/prog/engine/drv/drv3d_vulkan/pipeline/compiler.h new file mode 100644 index 000000000..52b6d7014 --- /dev/null +++ b/prog/engine/drv/drv3d_vulkan/pipeline/compiler.h @@ -0,0 +1,134 @@ +#pragma once +#include +#include +#include +#include +#include "compiler_scratch_data.h" +#include + +namespace drv3d_vulkan +{ + +class ComputePipeline; +class GraphicsPipeline; +class TimelineManager; +class PipelineCompiler; + +enum class PipelineCompileQueueItemType : uint8_t +{ + CS, + GR +}; + +struct PipelineCompileQueueItem +{ + PipelineCompileQueueItemType type; + union + { + ComputePipeline *cs; + GraphicsPipeline *gr; + }; + void compile(); + bool completed(); +}; + +struct PipelineCompilerWork +{ + static constexpr size_t RING_SIZE = 128; + eastl::vector queue; + + void init(); + void submit(); + void acquire(size_t timeline_abs_idx); + void wait(); + void cleanup(); + void process(); + void shutdown(); +}; + +struct 
PipelineCompileTimelineSync : public TimelineSyncPartLockFree, + public TimelineSyncPartSingleWriterSingleReader, + public TimelineSyncPartEventWaitable +{}; + +typedef Timeline PipelineCompileTimeline; + +class PipelineCompiler +{ + struct PrimaryWorkerThread : public DaThread + { + PrimaryWorkerThread(PipelineCompiler &c) : DaThread("VkPipeCompilerPrimary"), compiler(c) {} + void execute() override; + + private: + PipelineCompiler &compiler; + }; + + struct SecondaryWorkerThread : public DaThread + { + static const char *getWorkerName(int idx); + + SecondaryWorkerThread(PipelineCompiler &c, int idx) : DaThread(String(32, "VkPipeCompilerSecondary%u", idx)), compiler(c) + { + os_event_create(&wakeEvent); + } + ~SecondaryWorkerThread() { os_event_destroy(&wakeEvent); } + void wakeAndTerminate() { terminate(true /*wait*/, -1, &wakeEvent); } + void execute() override; + void wake() { os_event_set(&wakeEvent); } + + private: + PipelineCompiler &compiler; + os_event_t wakeEvent; //-V730_NOINIT + }; + + struct Config + { + uint8_t maxSecondaryThreads; + size_t secondarySpawnThreshold; + size_t minItemsPerSecondary; + }; + + Config cfg = {0, 0, 0}; + PrimaryWorkerThread primaryWorker; + static constexpr uint8_t MAX_THREADS = 16; + + eastl::unique_ptr secondaryWorkers[MAX_THREADS] = {}; + TimelineSpan timeBlock; + std::atomic currentItem{nullptr}; + PipelineCompileQueueItem *endItem = nullptr; + bool shouldWaitThreads = false; + std::atomic pendingWorkers{0}; + std::atomic queueLength = 0; + os_event_t secondaryWorkersFinishEvent; //-V730_NOINIT + void loadConfig(); + +public: + PipelineCompiler(TimelineManager &tl_man); + ~PipelineCompiler() = default; + PipelineCompiler(const PipelineCompiler &) = delete; + + void init(); + void shutdown(); + + void queue(ComputePipeline *compute_pipe); + void queue(GraphicsPipeline *graphics_pipe); + void onItemCompiled(); + void waitFor(ComputePipeline *compute_pipe); + void waitFor(GraphicsPipeline *graphics_pipe); + void 
processQueuedBlocked(); + bool processQueued(); + + size_t getQueueLength(); + + void compileBlock(eastl::vector &block); + void asyncCompileLoop(); + + void startSecondaryWorkers(); + void setPendingWorkers(size_t v); + void notifyWorkerCompleted(); + void waitSecondaryWorkers(); + void shutdownSecondaryWorkers(); +}; + +} // namespace drv3d_vulkan diff --git a/prog/engine/drv/drv3d_vulkan/pipeline/compiler_scratch_data.h b/prog/engine/drv/drv3d_vulkan/pipeline/compiler_scratch_data.h new file mode 100644 index 000000000..6966478b0 --- /dev/null +++ b/prog/engine/drv/drv3d_vulkan/pipeline/compiler_scratch_data.h @@ -0,0 +1,54 @@ +#pragma once +#include "driver.h" + +namespace drv3d_vulkan +{ + +class GraphicsPipeline; +class RenderPassResource; +struct ComputePipelineCompileScratchData +{ + VulkanShaderModuleHandle vkModule; + VulkanPipelineLayoutHandle vkLayout; + VulkanPipelineCacheHandle vkCache; +#if VULKAN_LOAD_SHADER_EXTENDED_DEBUG_DATA + String name; +#endif + int progIdx; + bool allocated; +}; + +struct GraphicsPipelineCompileScratchData +{ + GraphicsPipeline *parentPipe; + VulkanPipelineCacheHandle vkCache; +#if VULKAN_LOAD_SHADER_EXTENDED_DEBUG_DATA + String shortDebugName; + String fullDebugName; +#endif + int progIdx; + int varIdx; + int varTotal; + bool allocated; + RenderPassResource *nativeRP; + + // for actual create pipe API + carray inputStreams; + carray inputAttribs; + VkPipelineVertexInputStateCreateInfo vertexInput; + VkPipelineTessellationStateCreateInfo tesselation; + VkPipelineRasterizationStateCreateInfo raster; + VkPipelineMultisampleStateCreateInfo multisample; + VkPipelineDepthStencilStateCreateInfo depthStencil; + carray attachmentStates; + VkPipelineColorBlendStateCreateInfo colorBlendState; + VkPipelineDynamicStateCreateInfo dynamicStates; +#if VK_EXT_conservative_rasterization + VkPipelineRasterizationConservativeStateCreateInfoEXT conservativeRasterStateCI; +#endif + VkPipelineShaderStageCreateInfo 
stages[spirv::graphics::MAX_SETS]; + VkPipelineInputAssemblyStateCreateInfo piasci; + VkGraphicsPipelineCreateInfo gpci; +}; + +} // namespace drv3d_vulkan diff --git a/prog/engine/drv/drv3d_vulkan/pipeline/main_pipelines.cpp b/prog/engine/drv/drv3d_vulkan/pipeline/main_pipelines.cpp index 410293e33..e64eb3a31 100644 --- a/prog/engine/drv/drv3d_vulkan/pipeline/main_pipelines.cpp +++ b/prog/engine/drv/drv3d_vulkan/pipeline/main_pipelines.cpp @@ -38,17 +38,50 @@ namespace drv3d_vulkan template <> void ComputePipeline::onDelayedCleanupFinish() { + if (!checkCompiled()) + get_device().getContext().getBackend().pipelineCompiler.waitFor(this); shutdown(get_device().getVkDevice()); delete this; } } // namespace drv3d_vulkan -ComputePipeline::ComputePipeline(VulkanDevice &device, ProgramID prog, VulkanPipelineCacheHandle cache, LayoutType *l, +ComputePipeline::ComputePipeline(VulkanDevice &, ProgramID prog, VulkanPipelineCacheHandle cache, LayoutType *l, const CreationInfo &info) : DebugAttachedPipeline(l) { - VulkanShaderModuleHandle shader = get_device().makeVkModule(info.sci); + ComputePipelineCompileScratchData localScratch; + compileScratch = info.allowAsyncCompile ? 
new ComputePipelineCompileScratchData() : &localScratch; + compileScratch->allocated = info.allowAsyncCompile; + + compileScratch->vkModule = get_device().makeVkModule(info.sci); + compileScratch->vkLayout = layout->handle; + compileScratch->vkCache = cache; +#if VULKAN_LOAD_SHADER_EXTENDED_DEBUG_DATA + compileScratch->name = info.sci->name; +#endif + compileScratch->progIdx = prog.get(); + + if (info.allowAsyncCompile) + get_device().getContext().getBackend().pipelineCompiler.queue(this); + else + compile(); +} + +void ComputePipeline::bind(VulkanDevice &vk_dev, VulkanCommandBufferHandle cmd_buffer) +{ + if (!checkCompiled()) + get_device().getContext().getBackend().pipelineCompiler.waitFor(this); +#if VULKAN_LOG_PIPELINE_ACTIVITY > 1 + debug("vulkan: bind compute cs %s", debugInfo.cs().name); +#endif + VULKAN_LOG_CALL(vk_dev.vkCmdBindPipeline(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, getHandle())); +} + +void ComputePipeline::compile() +{ + Device &vkDev = get_device(); + VulkanDevice &device = vkDev.getVkDevice(); VkComputePipelineCreateInfo cpci = {VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, NULL}; cpci.flags = 0; @@ -56,7 +89,7 @@ ComputePipeline::ComputePipeline(VulkanDevice &device, ProgramID prog, VulkanPip cpci.stage.pNext = NULL; cpci.stage.flags = 0; cpci.stage.stage = VK_SHADER_STAGE_COMPUTE_BIT; - cpci.stage.module = shader; + cpci.stage.module = compileScratch->vkModule; cpci.stage.pName = "main"; cpci.stage.pSpecializationInfo = NULL; cpci.layout = layout->handle; @@ -68,55 +101,59 @@ ComputePipeline::ComputePipeline(VulkanDevice &device, ProgramID prog, VulkanPip int64_t compilationTime = 0; VkResult compileResult = VK_ERROR_UNKNOWN; + VulkanPipelineHandle retHandle; { #if VULKAN_LOAD_SHADER_EXTENDED_DEBUG_DATA - TIME_PROFILE_NAME(vulkan_cs_pipeline_compile, info.sci->name) + TIME_PROFILE_NAME(vulkan_cs_pipeline_compile, compileScratch->name) #else TIME_PROFILE(vulkan_cs_pipeline_compile) #endif ScopedTimer compilationTimer(compilationTime); - 
compileResult = device.vkCreateComputePipelines(device.get(), cache, 1, &cpci, NULL, ptr(handle)); + compileResult = device.vkCreateComputePipelines(device.get(), compileScratch->vkCache, 1, &cpci, NULL, ptr(retHandle)); } - if (is_null(handle) && VULKAN_OK(compileResult)) + if (is_null(retHandle) && VULKAN_OK(compileResult)) { - debug("vulkan: pipeline [compute:%u] not compiled but result was ok (%u)", prog.get(), compileResult); + debug("vulkan: pipeline [compute:%u] not compiled but result was ok (%u)", compileScratch->progIdx, compileResult); compileResult = VK_ERROR_UNKNOWN; } #if VULKAN_LOAD_SHADER_EXTENDED_DEBUG_DATA if (VULKAN_FAIL(compileResult)) - debug("vulkan: pipeline [compute:%u] cs: %s failed to compile", prog.get(), info.sci->name); - get_device().setPipelineName(handle, info.sci->name); - get_device().setPipelineLayoutName(getLayout()->handle, info.sci->name); + debug("vulkan: pipeline [compute:%u] cs: %s failed to compile", compileScratch->progIdx, compileScratch->name); + get_device().setPipelineName(retHandle, compileScratch->name); + get_device().setPipelineLayoutName(getLayout()->handle, compileScratch->name); totalCompilationTime = compilationTime; variantCount = 1; #endif VULKAN_EXIT_ON_FAIL(compileResult); #if VULKAN_LOG_PIPELINE_ACTIVITY < 1 - if (compilationTime > PIPELINE_COMPILATION_LONG_THRESHOLD) + if (compilationTime > PIPELINE_COMPILATION_LONG_THRESHOLD && !compileScratch->allocated) #endif { - debug("vulkan: pipeline [compute:%u] compiled in %u us", prog.get(), compilationTime); + debug("vulkan: pipeline [compute:%u] compiled in %u us", compileScratch->progIdx, compilationTime); crFeedback.logFeedback(); #if VULKAN_LOAD_SHADER_EXTENDED_DEBUG_DATA - debug("vulkan: with cs %s , handle %p", info.sci->name, generalize(handle)); + debug("vulkan: with cs %s , handle %p", compileScratch->name, generalize(retHandle)); #endif } // no need to keep the shader module, delete it to save memory - 
VULKAN_LOG_CALL(device.vkDestroyShaderModule(device.get(), shader, NULL)); -} + VULKAN_LOG_CALL(device.vkDestroyShaderModule(device.get(), compileScratch->vkModule, NULL)); -void ComputePipeline::bind(VulkanDevice &vk_dev, VulkanCommandBufferHandle cmd_buffer) -{ -#if VULKAN_LOG_PIPELINE_ACTIVITY > 1 - debug("vulkan: bind compute cs %s", debugInfo.cs().name); -#endif - VULKAN_LOG_CALL(vk_dev.vkCmdBindPipeline(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, handle)); + if (compileScratch->allocated) + { + delete compileScratch; + setCompiledHandle(retHandle); + } + else + setHandle(retHandle); + compileScratch = nullptr; } +bool ComputePipeline::pendingCompilation() { return !checkCompiled(); } + static VkSampleCountFlagBits checkSampleCount(unsigned int count, uint8_t colorMask, uint8_t hasDepth) { G_UNUSED(colorMask); @@ -145,20 +182,23 @@ static VkSampleCountFlagBits checkSampleCount(unsigned int count, uint8_t colorM return ret; } +static VkDynamicState grPipeDynamicStateList[] = // + {VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR, VK_DYNAMIC_STATE_DEPTH_BIAS, VK_DYNAMIC_STATE_DEPTH_BOUNDS, + VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, VK_DYNAMIC_STATE_STENCIL_REFERENCE, + VK_DYNAMIC_STATE_BLEND_CONSTANTS}; + +static const VkRect2D grPipeStaticRect = {{0, 0}, {1, 1}}; +static const VkViewport grPipeStaticViewport = {0.f, 0.f, 1.f, 1.f, 0.f, 1.f}; +// no need for unique states per variant, they are all the same +static const VkPipelineViewportStateCreateInfo grPipeViewportStates = // + {VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, NULL, 0, 1, &grPipeStaticViewport, 1, &grPipeStaticRect}; + GraphicsPipeline::GraphicsPipeline(VulkanDevice &device, VulkanPipelineCacheHandle cache, LayoutType *l, const CreationInfo &info) : BasePipeline(l), dynStateMask(info.dynStateMask) { - carray inputStreams; - carray inputAttribs; - VkPipelineVertexInputStateCreateInfo vertexInput; - VkPipelineTessellationStateCreateInfo tesselation; - 
VkPipelineRasterizationStateCreateInfo raster; - VkPipelineMultisampleStateCreateInfo multisample; - VkPipelineDepthStencilStateCreateInfo depthStencil; - carray attachmentStates; - VkPipelineColorBlendStateCreateInfo colorBlendState; - VkPipelineDynamicStateCreateInfo dynamicStates; - VkGraphicsPipelineCreateInfo gpci; + GraphicsPipelineCompileScratchData &csd = *info.scratch; + compileScratch = info.scratch; + csd.vkCache = cache; // deal with render pass dependencies VulkanRenderPassHandle renderPassHandle; @@ -166,12 +206,14 @@ GraphicsPipeline::GraphicsPipeline(VulkanDevice &device, VulkanPipelineCacheHand bool forceNoZWrite = false; uint32_t rpColorTargetMask = 0; VkSampleCountFlagBits sampleCount; + csd.nativeRP = info.nativeRP; if (info.nativeRP) { sampleCount = info.nativeRP->getMSAASamples(info.varDsc.subpass); // TODO: add MSAA test and make it work renderPassHandle = info.nativeRP->getHandle(); hasDepth = info.nativeRP->hasDepthAtSubpass(info.varDsc.subpass); rpColorTargetMask = info.nativeRP->getColorWriteMaskAtSubpass(info.varDsc.subpass); + csd.nativeRP->addPipelineCompileRef(); } else { @@ -193,100 +235,99 @@ GraphicsPipeline::GraphicsPipeline(VulkanDevice &device, VulkanPipelineCacheHand for (int32_t i = 0; i < inputLayout.attribs.size(); ++i) if (inputLayout.attribs[i].used) - inputAttribs[attribs++] = inputLayout.attribs[i].toVulkan(); + csd.inputAttribs[attribs++] = inputLayout.attribs[i].toVulkan(); for (uint32_t i = 0; i < MAX_VERTEX_INPUT_STREAMS; ++i) if (inputLayout.streams.used[i]) - inputStreams[lss++] = inputLayout.streams.toVulkan(i, info.varDsc.state.strides[i]); - - vertexInput.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; - vertexInput.pNext = NULL; - vertexInput.flags = 0; - vertexInput.vertexBindingDescriptionCount = lss; - vertexInput.pVertexBindingDescriptions = inputStreams.data(); - vertexInput.vertexAttributeDescriptionCount = attribs; - vertexInput.pVertexAttributeDescriptions = inputAttribs.data(); - - 
tesselation.sType = VK_STRUCTURE_TYPE_PIPELINE_TESSELLATION_STATE_CREATE_INFO; - tesselation.pNext = NULL; - tesselation.flags = 0; - tesselation.patchControlPoints = 4; - - raster.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; - raster.pNext = NULL; - raster.flags = 0; + csd.inputStreams[lss++] = inputLayout.streams.toVulkan(i, info.varDsc.state.strides[i]); + + csd.vertexInput.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; + csd.vertexInput.pNext = NULL; + csd.vertexInput.flags = 0; + csd.vertexInput.vertexBindingDescriptionCount = lss; + csd.vertexInput.pVertexBindingDescriptions = csd.inputStreams.data(); + csd.vertexInput.vertexAttributeDescriptionCount = attribs; + csd.vertexInput.pVertexAttributeDescriptions = csd.inputAttribs.data(); + + csd.tesselation.sType = VK_STRUCTURE_TYPE_PIPELINE_TESSELLATION_STATE_CREATE_INFO; + csd.tesselation.pNext = NULL; + csd.tesselation.flags = 0; + csd.tesselation.patchControlPoints = 4; + + csd.raster.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + csd.raster.pNext = NULL; + csd.raster.flags = 0; #if !_TARGET_ANDROID - raster.depthClampEnable = staticState.depthClipEnable ? VK_FALSE : VK_TRUE; + csd.raster.depthClampEnable = staticState.depthClipEnable ? 
VK_FALSE : VK_TRUE; #else - raster.depthClampEnable = VK_FALSE; + csd.raster.depthClampEnable = VK_FALSE; #endif - raster.rasterizerDiscardEnable = VK_FALSE; - raster.polygonMode = static_cast((uint32_t)info.varDsc.state.polygonLine); + csd.raster.rasterizerDiscardEnable = VK_FALSE; + csd.raster.polygonMode = static_cast((uint32_t)info.varDsc.state.polygonLine); uint32_t cull_mode = staticState.cullMode; if (!cull_mode) { - raster.cullMode = 0; + csd.raster.cullMode = 0; } else if (cull_mode == (CULL_CW - CULL_NONE)) { - raster.cullMode = VK_CULL_MODE_FRONT_BIT; + csd.raster.cullMode = VK_CULL_MODE_FRONT_BIT; } else if (cull_mode == (CULL_CCW - CULL_NONE)) { - raster.cullMode = VK_CULL_MODE_BACK_BIT; + csd.raster.cullMode = VK_CULL_MODE_BACK_BIT; } - raster.frontFace = VK_FRONT_FACE_CLOCKWISE; - raster.depthBiasEnable = VK_TRUE; - raster.depthBiasConstantFactor = 0.f; - raster.depthBiasClamp = 0.f; - raster.depthBiasSlopeFactor = 0.f; - raster.lineWidth = 1.f; + csd.raster.frontFace = VK_FRONT_FACE_CLOCKWISE; + csd.raster.depthBiasEnable = VK_TRUE; + csd.raster.depthBiasConstantFactor = 0.f; + csd.raster.depthBiasClamp = 0.f; + csd.raster.depthBiasSlopeFactor = 0.f; + csd.raster.lineWidth = 1.f; #if VK_EXT_conservative_rasterization - VkPipelineRasterizationConservativeStateCreateInfoEXT conservativeRasterStateCI{}; if (staticState.conservativeRasterEnable && device.hasExtension()) { - conservativeRasterStateCI.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_CONSERVATIVE_STATE_CREATE_INFO_EXT; - conservativeRasterStateCI.conservativeRasterizationMode = VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT; - conservativeRasterStateCI.extraPrimitiveOverestimationSize = 0; - chain_structs(raster, conservativeRasterStateCI); + csd.conservativeRasterStateCI.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_CONSERVATIVE_STATE_CREATE_INFO_EXT; + csd.conservativeRasterStateCI.conservativeRasterizationMode = VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT; + 
csd.conservativeRasterStateCI.extraPrimitiveOverestimationSize = 0; + chain_structs(csd.raster, csd.conservativeRasterStateCI); } #endif auto forcedSamplerCount = staticState.getForcedSamplerCount(); - multisample.rasterizationSamples = + csd.multisample.rasterizationSamples = forcedSamplerCount == 0 ? sampleCount : checkSampleCount(staticState.getForcedSamplerCount(), rpColorTargetMask, hasDepth); - multisample.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; - multisample.pNext = NULL; - multisample.flags = 0; - multisample.sampleShadingEnable = VK_FALSE; - multisample.minSampleShading = 1.f; - multisample.pSampleMask = NULL; - multisample.alphaToCoverageEnable = staticState.alphaToCoverage ? VK_TRUE : VK_FALSE; - multisample.alphaToOneEnable = VK_FALSE; - - G_ASSERTF((multisample.alphaToCoverageEnable == VK_FALSE) || - ((multisample.alphaToCoverageEnable == VK_TRUE) && !(multisample.rasterizationSamples & VK_SAMPLE_COUNT_1_BIT)), + csd.multisample.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + csd.multisample.pNext = NULL; + csd.multisample.flags = 0; + csd.multisample.sampleShadingEnable = VK_FALSE; + csd.multisample.minSampleShading = 1.f; + csd.multisample.pSampleMask = NULL; + csd.multisample.alphaToCoverageEnable = staticState.alphaToCoverage ? 
VK_TRUE : VK_FALSE; + csd.multisample.alphaToOneEnable = VK_FALSE; + + G_ASSERTF((csd.multisample.alphaToCoverageEnable == VK_FALSE) || + ((csd.multisample.alphaToCoverageEnable == VK_TRUE) && !(csd.multisample.rasterizationSamples & VK_SAMPLE_COUNT_1_BIT)), "vulkan: alpha to coverage must be used with MSAA"); - depthStencil.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; - depthStencil.pNext = NULL; - depthStencil.flags = 0; - depthStencil.depthTestEnable = staticState.depthTestEnable; - depthStencil.depthWriteEnable = staticState.depthWriteEnable && !forceNoZWrite; - depthStencil.depthCompareOp = (VkCompareOp)(uint32_t)staticState.depthTestFunc; - depthStencil.depthBoundsTestEnable = staticState.depthBoundsEnable; - depthStencil.stencilTestEnable = staticState.stencilTestEnable; - depthStencil.front.failOp = (VkStencilOp)(uint32_t)staticState.stencilTestOpStencilFail; - depthStencil.front.passOp = (VkStencilOp)(uint32_t)staticState.stencilTestOpPass; - depthStencil.front.depthFailOp = (VkStencilOp)(uint32_t)staticState.stencilTestOpDepthFail; - depthStencil.front.compareOp = (VkCompareOp)(uint32_t)staticState.stencilTestFunc; - depthStencil.front.compareMask = 0xFF; - depthStencil.front.writeMask = 0xFF; - depthStencil.front.reference = 0xFF; - depthStencil.back = depthStencil.front; - depthStencil.minDepthBounds = 0.f; - depthStencil.maxDepthBounds = 1.f; + csd.depthStencil.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; + csd.depthStencil.pNext = NULL; + csd.depthStencil.flags = 0; + csd.depthStencil.depthTestEnable = staticState.depthTestEnable; + csd.depthStencil.depthWriteEnable = staticState.depthWriteEnable && !forceNoZWrite; + csd.depthStencil.depthCompareOp = (VkCompareOp)(uint32_t)staticState.depthTestFunc; + csd.depthStencil.depthBoundsTestEnable = staticState.depthBoundsEnable; + csd.depthStencil.stencilTestEnable = staticState.stencilTestEnable; + csd.depthStencil.front.failOp = 
(VkStencilOp)(uint32_t)staticState.stencilTestOpStencilFail; + csd.depthStencil.front.passOp = (VkStencilOp)(uint32_t)staticState.stencilTestOpPass; + csd.depthStencil.front.depthFailOp = (VkStencilOp)(uint32_t)staticState.stencilTestOpDepthFail; + csd.depthStencil.front.compareOp = (VkCompareOp)(uint32_t)staticState.stencilTestFunc; + csd.depthStencil.front.compareMask = 0xFF; + csd.depthStencil.front.writeMask = 0xFF; + csd.depthStencil.front.reference = 0xFF; + csd.depthStencil.back = csd.depthStencil.front; + csd.depthStencil.minDepthBounds = 0.f; + csd.depthStencil.maxDepthBounds = 1.f; uint32_t attachmentCount = 0; @@ -310,7 +351,7 @@ GraphicsPipeline::GraphicsPipeline(VulkanDevice &device, VulkanPipelineCacheHand continue; } - auto &state = attachmentStates[attachmentCount]; + auto &state = csd.attachmentStates[attachmentCount]; uint32_t blendStateId = staticState.indenpendentBlendEnabled && (i < shaders::RenderState::NumIndependentBlendParameters) ? i : 0; @@ -352,96 +393,149 @@ GraphicsPipeline::GraphicsPipeline(VulkanDevice &device, VulkanPipelineCacheHand } } - colorBlendState.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; - colorBlendState.pNext = NULL; - colorBlendState.flags = 0; - colorBlendState.logicOpEnable = VK_FALSE; - colorBlendState.logicOp = VK_LOGIC_OP_COPY; - colorBlendState.attachmentCount = attachmentCount; - colorBlendState.pAttachments = attachmentStates.data(); - memset(colorBlendState.blendConstants, 0, sizeof(colorBlendState.blendConstants)); + csd.colorBlendState.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; + csd.colorBlendState.pNext = NULL; + csd.colorBlendState.flags = 0; + csd.colorBlendState.logicOpEnable = VK_FALSE; + csd.colorBlendState.logicOp = VK_LOGIC_OP_COPY; + csd.colorBlendState.attachmentCount = attachmentCount; + csd.colorBlendState.pAttachments = csd.attachmentStates.data(); + memset(csd.colorBlendState.blendConstants, 0, sizeof(csd.colorBlendState.blendConstants)); if 
(!hasDepth) { - depthStencil.depthTestEnable = VK_FALSE; - depthStencil.depthWriteEnable = VK_FALSE; - depthStencil.depthBoundsTestEnable = VK_FALSE; - depthStencil.stencilTestEnable = VK_FALSE; + csd.depthStencil.depthTestEnable = VK_FALSE; + csd.depthStencil.depthWriteEnable = VK_FALSE; + csd.depthStencil.depthBoundsTestEnable = VK_FALSE; + csd.depthStencil.stencilTestEnable = VK_FALSE; - raster.depthBiasEnable = VK_FALSE; - raster.depthClampEnable = VK_FALSE; + csd.raster.depthBiasEnable = VK_FALSE; + csd.raster.depthClampEnable = VK_FALSE; } - VkDynamicState dynamicStateList[] = // - {VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR, VK_DYNAMIC_STATE_DEPTH_BIAS, VK_DYNAMIC_STATE_DEPTH_BOUNDS, - VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, VK_DYNAMIC_STATE_STENCIL_REFERENCE, - VK_DYNAMIC_STATE_BLEND_CONSTANTS}; - - dynamicStates.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; - dynamicStates.pNext = NULL; - dynamicStates.flags = 0; - dynamicStates.dynamicStateCount = array_size(dynamicStateList); - dynamicStates.pDynamicStates = dynamicStateList; + csd.dynamicStates.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; + csd.dynamicStates.pNext = NULL; + csd.dynamicStates.flags = 0; + csd.dynamicStates.dynamicStateCount = array_size(grPipeDynamicStateList); + csd.dynamicStates.pDynamicStates = grPipeDynamicStateList; - VkPipelineInputAssemblyStateCreateInfo piasci = // + csd.piasci = // {VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, NULL, 0, info.varDsc.topology, VK_FALSE}; - static const VkRect2D rect = {{0, 0}, {1, 1}}; - static const VkViewport viewport = {0.f, 0.f, 1.f, 1.f, 0.f, 1.f}; - // no need for unique states per variant, they are all the same - static const VkPipelineViewportStateCreateInfo viewportStates = // - {VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, NULL, 0, 1, &viewport, 1, &rect}; - - VkPipelineShaderStageCreateInfo stages[spirv::graphics::MAX_SETS] = {}; 
unsigned stagesCount = 0; for (size_t i = 0; i < spirv::graphics::MAX_SETS; ++i) { const ShaderModule *shModule = info.modules.list[i]; if (shModule) { - stages[stagesCount].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; - stages[stagesCount].stage = LayoutType::ShaderConfiguration::stages[i]; - stages[stagesCount].module = shModule->module; - stages[stagesCount].pName = "main"; + csd.stages[stagesCount].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + csd.stages[stagesCount].stage = LayoutType::ShaderConfiguration::stages[i]; + csd.stages[stagesCount].module = shModule->module; + csd.stages[stagesCount].pName = "main"; stagesCount++; } } - gpci.stageCount = stagesCount; - - gpci.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; - gpci.pNext = NULL; - gpci.pVertexInputState = &vertexInput; - gpci.pTessellationState = layout->hasTC() ? &tesselation : NULL; - gpci.pRasterizationState = &raster; - gpci.pMultisampleState = &multisample; - gpci.pDepthStencilState = &depthStencil; - gpci.pColorBlendState = &colorBlendState; - gpci.pDynamicState = &dynamicStates; - gpci.pInputAssemblyState = &piasci; - gpci.pViewportState = &viewportStates; - gpci.pStages = stages; - gpci.layout = layout->handle; - gpci.renderPass = renderPassHandle; - gpci.subpass = info.varDsc.subpass; - gpci.basePipelineIndex = 0; - gpci.basePipelineHandle = VK_NULL_HANDLE; - gpci.flags = VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT; - - if (!is_null(info.parentPipeline)) + csd.gpci.stageCount = stagesCount; + + csd.gpci.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; + csd.gpci.pNext = NULL; + csd.gpci.pVertexInputState = &csd.vertexInput; + csd.gpci.pTessellationState = layout->hasTC() ? 
&csd.tesselation : NULL; + csd.gpci.pRasterizationState = &csd.raster; + csd.gpci.pMultisampleState = &csd.multisample; + csd.gpci.pDepthStencilState = &csd.depthStencil; + csd.gpci.pColorBlendState = &csd.colorBlendState; + csd.gpci.pDynamicState = &csd.dynamicStates; + csd.gpci.pInputAssemblyState = &csd.piasci; + csd.gpci.pViewportState = &grPipeViewportStates; + csd.gpci.pStages = csd.stages; + csd.gpci.layout = layout->handle; + csd.gpci.renderPass = renderPassHandle; + csd.gpci.subpass = info.varDsc.subpass; + csd.gpci.basePipelineIndex = 0; + csd.gpci.basePipelineHandle = VK_NULL_HANDLE; + csd.gpci.flags = VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT; + csd.parentPipe = info.parentPipeline; +} + +void GraphicsPipeline::bind(VulkanDevice &vk_dev, VulkanCommandBufferHandle cmd_buffer) const +{ + VULKAN_LOG_CALL(vk_dev.vkCmdBindPipeline(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, getHandle())); +} + +void GraphicsPipeline::compile() +{ + int64_t compilationTime = 0; + CreationFeedback crFeedback; + VulkanPipelineHandle retHandle; { - gpci.flags |= VK_PIPELINE_CREATE_DERIVATIVE_BIT; - gpci.basePipelineIndex = -1; - gpci.basePipelineHandle = info.parentPipeline; + ScopedTimer compileTimer(compilationTime); +#if VULKAN_LOAD_SHADER_EXTENDED_DEBUG_DATA + TIME_PROFILE_NAME(vulkan_gr_pipeline_compile, compileScratch->shortDebugName) +#else + TIME_PROFILE(vulkan_gr_pipeline_compile) +#endif + retHandle = createPipelineObject(crFeedback); } - info.crFeedback.chainWith(gpci, device); + if (is_null(retHandle)) + { + logerr("vulkan: pipeline [gfx:%u:%u(%u)] not compiled but result was ok", compileScratch->progIdx, compileScratch->varIdx, + compileScratch->varTotal); +#if VULKAN_LOAD_SHADER_EXTENDED_DEBUG_DATA + logerr("vulkan: with\n %s", compileScratch->fullDebugName); +#endif + } + else + { +#if VULKAN_LOAD_SHADER_EXTENDED_DEBUG_DATA + get_device().setPipelineName(retHandle, compileScratch->fullDebugName.c_str()); + if (compileScratch->varIdx == 0) + 
get_device().setPipelineLayoutName(getLayout()->handle, compileScratch->fullDebugName.c_str()); +#endif - VULKAN_EXIT_ON_FAIL(device.vkCreateGraphicsPipelines(device.get(), cache, 1, &gpci, NULL, ptr(handle))); +#if VULKAN_LOG_PIPELINE_ACTIVITY < 1 + if (compilationTime > PIPELINE_COMPILATION_LONG_THRESHOLD && !compileScratch->allocated) +#endif + { + debug("vulkan: pipeline [gfx:%u:%u(%u)] compiled in %u us", compileScratch->progIdx, compileScratch->varIdx, + compileScratch->varTotal, compilationTime); + crFeedback.logFeedback(); +#if VULKAN_LOAD_SHADER_EXTENDED_DEBUG_DATA + debug("vulkan: with\n %s handle: %p", compileScratch->fullDebugName, generalize(retHandle)); +#endif + } + } + + if (compileScratch->nativeRP) + compileScratch->nativeRP->releasePipelineCompileRef(); + + if (compileScratch->allocated) + { + delete compileScratch; + setCompiledHandle(retHandle); + } + else + setHandle(retHandle); + compileScratch = nullptr; } -void GraphicsPipeline::bind(VulkanDevice &vk_dev, VulkanCommandBufferHandle cmd_buffer) const +VulkanPipelineHandle GraphicsPipeline::createPipelineObject(CreationFeedback &cr_feedback) { - VULKAN_LOG_CALL(vk_dev.vkCmdBindPipeline(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, handle)); + VulkanDevice &device = get_device().getVkDevice(); + + if (compileScratch->parentPipe && !is_null(compileScratch->parentPipe->getCompiledHandle())) + { + compileScratch->gpci.flags |= VK_PIPELINE_CREATE_DERIVATIVE_BIT; + compileScratch->gpci.basePipelineIndex = -1; + compileScratch->gpci.basePipelineHandle = compileScratch->parentPipe->getHandle(); + } + + cr_feedback.chainWith(compileScratch->gpci, device); + VulkanPipelineHandle ret; + VULKAN_EXIT_ON_FAIL( + device.vkCreateGraphicsPipelines(device.get(), compileScratch->vkCache, 1, &compileScratch->gpci, NULL, ptr(ret))); + return ret; } void GraphicsPipelineDynamicStateMask::from(RenderStateSystem::Backend &rs_backend, const GraphicsPipelineVariantDescription &desc, diff --git 
a/prog/engine/drv/drv3d_vulkan/pipeline/main_pipelines.h b/prog/engine/drv/drv3d_vulkan/pipeline/main_pipelines.h index f99785579..1f9cc69d1 100644 --- a/prog/engine/drv/drv3d_vulkan/pipeline/main_pipelines.h +++ b/prog/engine/drv/drv3d_vulkan/pipeline/main_pipelines.h @@ -9,6 +9,7 @@ #include "render_state_system.h" #include #include "render_pass_resource.h" +#include "compiler_scratch_data.h" namespace drv3d_vulkan { @@ -158,6 +159,7 @@ class ComputePipeline : public DebugAttachedPipeline &modules; - CreationFeedback &crFeedback; - VulkanPipelineHandle parentPipeline; + GraphicsPipeline *parentPipeline; RenderPassResource *nativeRP; + GraphicsPipelineCompileScratchData *scratch; CreationInfo() = delete; }; @@ -325,6 +332,8 @@ class GraphicsPipeline : public BasePipeline PipelineType &get(ProgramID id) @@ -208,6 +210,8 @@ class PipelineManager #endif void unloadAll(VulkanDevice &device); void prepareRemoval(ProgramID program); + void setAsyncCompile(bool allowed) { asyncCompileAllowed = allowed; } + bool asyncCompileEnabled() { return asyncCompileAllowed; } }; } // namespace drv3d_vulkan \ No newline at end of file diff --git a/prog/engine/drv/drv3d_vulkan/pipeline/stage_state_base.cpp b/prog/engine/drv/drv3d_vulkan/pipeline/stage_state_base.cpp index b418296be..f220ea35c 100644 --- a/prog/engine/drv/drv3d_vulkan/pipeline/stage_state_base.cpp +++ b/prog/engine/drv/drv3d_vulkan/pipeline/stage_state_base.cpp @@ -213,6 +213,7 @@ void PipelineStageStateBase::setBbuffer(uint32_t unit, BufferRef buffer) { getConstBufferRegister(unit).buffer.buffer = buffer.getHandle(); getConstBufferRegister(unit).buffer.range = buffer.dataSize(); + getConstBufferRegister(unit).buffer.offset = 0; getConstBufferRegister(unit).type = VkAnyDescriptorInfo::TYPE_BUF; } else diff --git a/prog/engine/drv/drv3d_vulkan/pipeline/variated_graphics.cpp b/prog/engine/drv/drv3d_vulkan/pipeline/variated_graphics.cpp index 2dbd4cf31..98901ac96 100644 --- 
a/prog/engine/drv/drv3d_vulkan/pipeline/variated_graphics.cpp +++ b/prog/engine/drv/drv3d_vulkan/pipeline/variated_graphics.cpp @@ -50,6 +50,31 @@ GraphicsPipelineVariationStorage::ExtendedVariantDescription &GraphicsPipelineVa } } +bool VariatedGraphicsPipeline::pendingCompilation() +{ + for (const auto &[_, pipeline] : items) + { + if (!pipeline->checkCompiled()) + return true; + } + return false; +} + +void VariatedGraphicsPipeline::shutdown(VulkanDevice &device) +{ + for (const auto &[_, pipeline] : items) + { + if (!pipeline->release()) + { + if (!pipeline->checkCompiled()) + get_device().getContext().getBackend().pipelineCompiler.waitFor(pipeline); + pipeline->shutdown(device); + delete pipeline; + } + } + items.clear(); +} + GraphicsPipeline *VariatedGraphicsPipeline::findVariant(const GraphicsPipelineVariantDescription &dsc) { auto hash = dsc.getHash(); @@ -64,7 +89,7 @@ GraphicsPipeline *VariatedGraphicsPipeline::findVariant(const GraphicsPipelineVa } GraphicsPipeline *VariatedGraphicsPipeline::compileNewVariant(CompilationContext &comp_ctx, - const GraphicsPipelineVariantDescription &dsc, unsigned &inOutTotalCompilationTime) + const GraphicsPipelineVariantDescription &dsc) { auto hash = dsc.getHash(); auto eDsc = variations.get(dsc, hash, comp_ctx.rsBackend, comp_ctx.nativeRP); @@ -76,67 +101,45 @@ GraphicsPipeline *VariatedGraphicsPipeline::compileNewVariant(CompilationContext { ScopedTimer compileTimer(compilationTime); - VulkanPipelineHandle parentPipe; + GraphicsPipeline *parentPipe = nullptr; if (!items.empty()) - parentPipe = items[0].second->getHandle(); + parentPipe = items[0].second; - { + bool async = get_device().pipeMan.asyncCompileEnabled(); + GraphicsPipelineCompileScratchData localCompileData; + GraphicsPipelineCompileScratchData *csd = async ? 
new GraphicsPipelineCompileScratchData() : &localCompileData; + memset(csd, 0, sizeof(GraphicsPipelineCompileScratchData)); + csd->allocated = async; + + csd->varIdx = eDsc.index; + csd->varTotal = items.size(); + csd->progIdx = program.get(); #if VULKAN_LOAD_SHADER_EXTENDED_DEBUG_DATA - String shortDebugName(128, "%s-%s", debugInfo.vs().name, debugInfo.fs().name); - TIME_PROFILE_NAME(vulkan_gr_pipeline_compile, shortDebugName) -#else - TIME_PROFILE(vulkan_gr_pipeline_compile) + csd->shortDebugName = String(128, "%s-%s", debugInfo.vs().name, debugInfo.fs().name); + csd->fullDebugName = String(512, "vs: %s\nps: %s\nvaridx: %u", debugInfo.vs().debugName, debugInfo.fs().debugName, eDsc.index); #endif - ret = new GraphicsPipeline(comp_ctx.dev, comp_ctx.pipeCache, layout, - {comp_ctx.passMan, comp_ctx.rsBackend, eDsc.base, eDsc.mask, modules, crFeedback, parentPipe, comp_ctx.nativeRP}); - } - - items.push_back(eastl::make_pair(hash, ret)); - } - inOutTotalCompilationTime += compilationTime; + ret = new GraphicsPipeline(comp_ctx.dev, comp_ctx.pipeCache, layout, + {comp_ctx.passMan, comp_ctx.rsBackend, eDsc.base, eDsc.mask, modules, parentPipe, comp_ctx.nativeRP, csd}); -#if VULKAN_LOAD_SHADER_EXTENDED_DEBUG_DATA - String fullDebugName(512, "vs: %s\nps: %s\nvaridx: %u", debugInfo.vs().debugName, debugInfo.fs().debugName, eDsc.index); -#endif + if (async) + get_device().getContext().getBackend().pipelineCompiler.queue(ret); + else + ret->compile(); - if (is_null(ret->getHandle())) - { - logerr("vulkan: pipeline [gfx:%u:%u(%u)] not compiled but result was ok", program.get(), eDsc.index, items.size()); -#if VULKAN_LOAD_SHADER_EXTENDED_DEBUG_DATA - logerr("vulkan: with\n %s", fullDebugName); -#endif - return ret; + items.push_back(eastl::make_pair(hash, ret)); } - #if VULKAN_LOAD_SHADER_EXTENDED_DEBUG_DATA - get_device().setPipelineName(ret->getHandle(), fullDebugName.c_str()); - if (items.size() == 1) - get_device().setPipelineLayoutName(getLayout()->handle, 
fullDebugName.c_str()); totalCompilationTime += compilationTime; ++variantCount; #endif -#if VULKAN_LOG_PIPELINE_ACTIVITY < 1 - if (compilationTime > PIPELINE_COMPILATION_LONG_THRESHOLD) -#endif - { - debug("vulkan: pipeline [gfx:%u:%u(%u)] compiled in %u us", program.get(), eDsc.index, items.size(), compilationTime); - crFeedback.logFeedback(); -#if VULKAN_LOAD_SHADER_EXTENDED_DEBUG_DATA - debug("vulkan: with\n %s handle: %p", fullDebugName, generalize(ret->getHandle())); -#endif - } - return ret; } -GraphicsPipeline *VariatedGraphicsPipeline::getVariant(CompilationContext &comp_ctx, const GraphicsPipelineVariantDescription &dsc, - bool compilationTimeout, unsigned &inOutTotalCompilationTime) +GraphicsPipeline *VariatedGraphicsPipeline::getVariant(CompilationContext &comp_ctx, const GraphicsPipelineVariantDescription &dsc) { GraphicsPipeline *pipe = findVariant(dsc); - if (compilationTimeout && !pipe) - return nullptr; #if VULKAN_ENABLE_DEBUG_FLUSHING_SUPPORT if (!isUsageAllowed()) @@ -161,7 +164,7 @@ GraphicsPipeline *VariatedGraphicsPipeline::getVariant(CompilationContext &comp_ // masking did not changed layout, no need to map dsc to something if (originalLayout.isSame(maskedLayout)) - pipe = compileNewVariant(comp_ctx, dsc, inOutTotalCompilationTime); + pipe = compileNewVariant(comp_ctx, dsc); else { // register/find new input layout @@ -170,7 +173,7 @@ GraphicsPipeline *VariatedGraphicsPipeline::getVariant(CompilationContext &comp_ pipe = findVariant(modDsc); if (!pipe) - pipe = compileNewVariant(comp_ctx, modDsc, inOutTotalCompilationTime); + pipe = compileNewVariant(comp_ctx, modDsc); // add to list as mapped to another desc auto origHash = dsc.getHash(); @@ -180,10 +183,8 @@ GraphicsPipeline *VariatedGraphicsPipeline::getVariant(CompilationContext &comp_ } } - if (is_null(pipe->getHandle())) - { + if (!pipe->checkCompiled()) return nullptr; - } return pipe; } diff --git a/prog/engine/drv/drv3d_vulkan/pipeline/variated_graphics.h 
b/prog/engine/drv/drv3d_vulkan/pipeline/variated_graphics.h index 1ba5a914c..772cc0894 100644 --- a/prog/engine/drv/drv3d_vulkan/pipeline/variated_graphics.h +++ b/prog/engine/drv/drv3d_vulkan/pipeline/variated_graphics.h @@ -52,8 +52,7 @@ class VariatedGraphicsPipeline : public DebugAttachedPipelinerelease()) - { - i.second->shutdown(device); - delete i.second; - } - } - items.clear(); - } + void shutdown(VulkanDevice &device); bool hasGeometryStage() const { return layout->hasGS(); } @@ -110,6 +97,8 @@ class VariatedGraphicsPipeline : public DebugAttachedPipelinehasTE(); } + bool pendingCompilation(); + private: GraphicsPipelineShaderSet modules; diff --git a/prog/engine/drv/drv3d_vulkan/pipeline_state.cpp b/prog/engine/drv/drv3d_vulkan/pipeline_state.cpp index f14ff4228..bb84bb1b7 100644 --- a/prog/engine/drv/drv3d_vulkan/pipeline_state.cpp +++ b/prog/engine/drv/drv3d_vulkan/pipeline_state.cpp @@ -53,18 +53,31 @@ bool PipelineState::handleObjectRemoval(ProgramID object) // unfortunately it can't be verified at use-place // so just fill with empty programs (so it can be noticed) and hope for the best bool ret = false; - if (get() == object) + int progType = get_program_type(object); + if (progType == program_type_graphics) { - // logerr("vulkan: removing active graphics program %u", object.get()); - set(ProgramID::Null()); - ret |= true; + if (get() == object) + { + // logerr("vulkan: removing active graphics program %u", object.get()); + set(ProgramID::Null()); + ret |= true; + } + + if (get_device().pipeMan.get(object).pendingCompilation()) + ret |= true; } - if (get() == object) + if (progType == program_type_compute) { - // logerr("vulkan: removing active compute program %u", object.get()); - set(ProgramID::Null()); - ret |= true; + if (get() == object) + { + // logerr("vulkan: removing active compute program %u", object.get()); + set(ProgramID::Null()); + ret |= true; + } + + if (get_device().pipeMan.get(object).pendingCompilation()) + ret |= true; } return 
ret; @@ -73,6 +86,8 @@ bool PipelineState::handleObjectRemoval(ProgramID object) template <> bool PipelineState::handleObjectRemoval(RenderPassResource *object) { + if (object->isPipelineCompileReferenced()) + return true; FrontRenderPassState &rp = get(); return rp.handleObjectRemoval(object); } @@ -99,13 +114,32 @@ bool PipelineState::isReferenced(Buffer *object) const template <> bool PipelineState::isReferenced(ProgramID object) const { - return (getRO() == object) | - (getRO() == object); + int progType = get_program_type(object); + if (progType == program_type_graphics) + { + if (getRO() == object) + return true; + if (get_device().pipeMan.get(object).pendingCompilation()) + return true; + } + + if (progType == program_type_compute) + { + if (getRO() == object) + return true; + + if (get_device().pipeMan.get(object).pendingCompilation()) + return true; + } + + return false; } template <> bool PipelineState::isReferenced(RenderPassResource *object) const { + if (object->isPipelineCompileReferenced()) + return true; const FrontRenderPassState &rp = getRO(); return rp.isReferenced(object); } diff --git a/prog/engine/drv/drv3d_vulkan/render_pass_resource.h b/prog/engine/drv/drv3d_vulkan/render_pass_resource.h index b2b9d6b99..5d35cb6e8 100644 --- a/prog/engine/drv/drv3d_vulkan/render_pass_resource.h +++ b/prog/engine/drv/drv3d_vulkan/render_pass_resource.h @@ -207,6 +207,8 @@ class RenderPassResource : public RenderPassResourceImplBase Tab compiledFBs; + std::atomic pipelineCompileRefs = 0; + public: RenderPassResource(const Description &in_desc, bool manage = true); @@ -230,6 +232,10 @@ class RenderPassResource : public RenderPassResourceImplBase void bindInputAttachments(ExecutionContext &ctx, PipelineStageStateBase &tgt, uint32_t input_index, uint32_t register_index, const VariatedGraphicsPipeline *pipeline); + + void addPipelineCompileRef() { ++pipelineCompileRefs; } + void releasePipelineCompileRef() { --pipelineCompileRefs; } + bool 
isPipelineCompileReferenced() { return pipelineCompileRefs.load() != 0; } }; } // namespace drv3d_vulkan diff --git a/prog/engine/drv/drv3d_vulkan/render_pass_resource_convert.cpp b/prog/engine/drv/drv3d_vulkan/render_pass_resource_convert.cpp index d0f25034c..034eac201 100644 --- a/prog/engine/drv/drv3d_vulkan/render_pass_resource_convert.cpp +++ b/prog/engine/drv/drv3d_vulkan/render_pass_resource_convert.cpp @@ -551,8 +551,7 @@ void RenderPassResource::fillAttachmentDescription(const RenderPassDesc &rp_desc desc.flags = extDesc.aliased ? VK_ATTACHMENT_DESCRIPTION_MAY_ALIAS_BIT : 0; desc.format = FormatStore::fromCreateFlags(texCf).asVkFormat(); - bool multisample = texCf & (TEXCF_MULTISAMPLED | TEXCF_MSAATARGET); - desc.samples = multisample ? get_device().calcMSAAQuality() : VkSampleCountFlagBits::VK_SAMPLE_COUNT_1_BIT; + desc.samples = VkSampleCountFlagBits(get_sample_count(texCf)); uint32_t loadActions = 0; uint32_t storeActions = 0; @@ -802,12 +801,12 @@ void RenderPassResource::storeSubpassAttachmentInfos() colorWriteMask |= 1 << bind.slot; unsigned texcf = getAttachmentTexcf(rpDesc, bind.target); - const bool hasMultisampleFlags = texcf & (TEXCF_MULTISAMPLED | TEXCF_MSAATARGET); - VkSampleCountFlagBits samples = - hasMultisampleFlags ? 
get_device().calcMSAAQuality() : VkSampleCountFlagBits::VK_SAMPLE_COUNT_1_BIT; + const bool hasMultisampleFlags = texcf & TEXCF_SAMPLECOUNT_MASK; + VkSampleCountFlagBits samples = VkSampleCountFlagBits(get_sample_count(texcf)); + bool isMultisampled = samples > VkSampleCountFlagBits::VK_SAMPLE_COUNT_1_BIT; msaaSamples = eastl::max(msaaSamples, samples); const bool isResolve = bind.action & RP_TA_SUBPASS_RESOLVE; - const bool isMultisampled = samples != VkSampleCountFlagBits::VK_SAMPLE_COUNT_1_BIT; + if (!isMultisampled && !isResolve) { diff --git a/prog/engine/drv/drv3d_vulkan/render_work.cpp b/prog/engine/drv/drv3d_vulkan/render_work.cpp index c0d00e720..0567834a5 100644 --- a/prog/engine/drv/drv3d_vulkan/render_work.cpp +++ b/prog/engine/drv/drv3d_vulkan/render_work.cpp @@ -253,7 +253,6 @@ void RenderWork::process() { TIME_PROFILE(vulkan_render_work_process); - skippedGraphicsPipelines = 0; ExecutionContext executionContext(*this); executionContext.prepareFrameCore(); processCommands(executionContext); diff --git a/prog/engine/drv/drv3d_vulkan/render_work.h b/prog/engine/drv/drv3d_vulkan/render_work.h index 0619eeffb..a113da2d2 100644 --- a/prog/engine/drv/drv3d_vulkan/render_work.h +++ b/prog/engine/drv/drv3d_vulkan/render_work.h @@ -61,9 +61,6 @@ struct RenderWork size_t id = 0; - // this value is intact on cleanup for frontend readback - uint32_t skippedGraphicsPipelines = 0; - eastl::vector bufferUploads; eastl::vector bufferUploadCopies; diff --git a/prog/engine/drv/drv3d_vulkan/state_field_graphics.cpp b/prog/engine/drv/drv3d_vulkan/state_field_graphics.cpp index 217cfd5bf..6a09d8c10 100644 --- a/prog/engine/drv/drv3d_vulkan/state_field_graphics.cpp +++ b/prog/engine/drv/drv3d_vulkan/state_field_graphics.cpp @@ -413,10 +413,7 @@ void StateFieldGraphicsPipeline::applyTo(BackGraphicsStateStorage &state, Execut VariatedGraphicsPipeline::CompilationContext compCtx = { vkDev, drvDev.passMan, ctx.renderStateSystem, drvDev.getPipeCache(), 
state.nativeRenderPass.ptr}; - bool compilationTimeout = target.back.pipelineCompilationTime > target.back.pipelineCompilationTimeBudget; - ptr = state.basePipeline.ptr->getVariant(compCtx, varDsc, compilationTimeout, target.back.pipelineCompilationTime); - if (compilationTimeout) - ++target.data.skippedGraphicsPipelines; + ptr = state.basePipeline.ptr->getVariant(compCtx, varDsc); } if (oldPtr == ptr && ptr) diff --git a/prog/engine/drv/drv3d_vulkan/state_field_resource_binds.cpp b/prog/engine/drv/drv3d_vulkan/state_field_resource_binds.cpp index 423329cd0..f7cf5faf3 100644 --- a/prog/engine/drv/drv3d_vulkan/state_field_resource_binds.cpp +++ b/prog/engine/drv/drv3d_vulkan/state_field_resource_binds.cpp @@ -31,6 +31,9 @@ bool hasResourceConflictWithFramebuffer(Image *img, ImageViewState view, ShaderS FramebufferState &fbs = back.executionState.get().framebufferState; RenderPassClass::FramebufferDescription &fbi = fbs.frameBufferInfo; + const ValueRange mipRange(view.getMipBase(), view.getMipBase() + view.getMipCount()); + const ValueRange arrayRange(view.getArrayBase(), view.getArrayBase() + view.getArrayCount()); + if (img->getUsage() & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) { uint32_t i; @@ -47,10 +50,7 @@ bool hasResourceConflictWithFramebuffer(Image *img, ImageViewState view, ShaderS const ImageViewState &rtView = fbi.colorAttachments[i].view; const ValueRange rtMipRange(rtView.getMipBase(), rtView.getMipBase() + rtView.getMipCount()); - const ValueRange rtArrayRange(rtView.getArrayBase(), rtView.getArrayBase() + rtView.getMipCount()); - - const ValueRange mipRange(view.getMipBase(), view.getMipBase() + view.getMipCount()); - const ValueRange arrayRange(view.getArrayBase(), view.getArrayBase() + view.getMipCount()); + const ValueRange rtArrayRange(rtView.getArrayBase(), rtView.getArrayBase() + rtView.getArrayCount()); if (mipRange.overlaps(rtMipRange) && arrayRange.overlaps(rtArrayRange)) return true; @@ -67,9 +67,6 @@ bool 
hasResourceConflictWithFramebuffer(Image *img, ImageViewState view, ShaderS const ImageViewState &dsView = fbi.depthStencilAttachment.view; - const ValueRange mipRange(view.getMipBase(), view.getMipBase() + view.getMipCount()); - const ValueRange arrayRange(view.getArrayBase(), view.getArrayBase() + view.getMipCount()); - auto mipIndex = dsView.getMipBase(); auto arrayIndex = dsView.getArrayBase(); if (mipRange.isInside(mipIndex) && arrayRange.isInside(arrayIndex)) diff --git a/prog/engine/drv/drv3d_vulkan/texture.cpp b/prog/engine/drv/drv3d_vulkan/texture.cpp index 99cb4720c..4b8460c8a 100644 --- a/prog/engine/drv/drv3d_vulkan/texture.cpp +++ b/prog/engine/drv/drv3d_vulkan/texture.cpp @@ -123,7 +123,7 @@ bool create_tex2d(BaseTex::D3DTextures &tex, BaseTex *bt_in, uint32_t w, uint32_ BaseTex::ImageMem *initial_data, int array_size = 1, bool temp_alloc = false) { uint32_t &flg = bt_in->cflg; - G_ASSERT(!((flg & TEXCF_MULTISAMPLED) && initial_data != nullptr)); + G_ASSERT(!((flg & TEXCF_SAMPLECOUNT_MASK) && initial_data != nullptr)); G_ASSERT(!((flg & TEXCF_LOADONCE) && (flg & (TEXCF_DYNAMIC | TEXCF_RTARGET)))); ImageCreateInfo desc; @@ -136,20 +136,10 @@ bool create_tex2d(BaseTex::D3DTextures &tex, BaseTex *bt_in, uint32_t w, uint32_ desc.arrays = (cube ? 6 : 1) * array_size; desc.residencyFlags = Image::MEM_NOT_EVICTABLE; - bool multisample = flg & (TEXCF_MULTISAMPLED | TEXCF_MSAATARGET); + bool multisample = flg & (TEXCF_SAMPLECOUNT_MASK); bool transient = flg & TEXCF_TRANSIENT; - if ((flg & TEXFMT_MASK) == TEXFMT_MSAA_MAX_SAMPLES) - { - const PhysicalDeviceSet::MsaaMaxSamplesDesc &maxSamplesFormat = get_device().getDeviceProperties().maxSamplesFormat; - if (maxSamplesFormat.samples <= 1) - return false; - flg = (flg & ~TEXFMT_MASK) | FormatStore::fromVkFormat(maxSamplesFormat.format).asTexFlags() | - (TEXCF_MULTISAMPLED | TEXCF_MSAATARGET | TEXCF_RTARGET); - desc.samples = maxSamplesFormat.samples; - multisample = true; - } - else - desc.samples = multisample ? 
get_device().calcMSAAQuality() : VkSampleCountFlagBits::VK_SAMPLE_COUNT_1_BIT; + + desc.samples = VkSampleCountFlagBits(get_sample_count(flg)); desc.format = FormatStore::fromCreateFlags(flg); desc.usage = 0; @@ -287,7 +277,7 @@ bool create_tex2d(BaseTex::D3DTextures &tex, BaseTex *bt_in, uint32_t w, uint32_ bool create_tex3d(BaseTex::D3DTextures &tex, BaseTex *bt_in, uint32_t w, uint32_t h, uint32_t d, uint32_t flg, uint32_t levels, BaseTex::ImageMem *initial_data) { - G_ASSERT((flg & TEXCF_MULTISAMPLED) == 0); + G_ASSERT((flg & TEXCF_SAMPLECOUNT_MASK) == 0); G_ASSERT(!((flg & TEXCF_LOADONCE) && (flg & TEXCF_DYNAMIC))); ImageCreateInfo desc; @@ -656,7 +646,7 @@ int BaseTex::update(BaseTexture *src) BaseTex *btex = getbasetex(src); if (btex) { - if (btex->cflg & TEXCF_MULTISAMPLED) + if (btex->cflg & TEXCF_SAMPLECOUNT_MASK) btex->resolve(tex.image); else if (btex->tex.image && tex.image) { diff --git a/prog/engine/drv/drv3d_vulkan/texture.h b/prog/engine/drv/drv3d_vulkan/texture.h index ae4ed0ae8..e125c168d 100644 --- a/prog/engine/drv/drv3d_vulkan/texture.h +++ b/prog/engine/drv/drv3d_vulkan/texture.h @@ -170,12 +170,6 @@ struct BaseTex final : public BaseTexture return colorTable[int(as_float) | (int(isBlack) << 1) | (int(isTransparent) << 2)]; } - enum RebindFlags - { - NONE = 0, - FORCE_REBIND = 1 - }; - void rebindTRegs(SamplerState new_sampler) { samplerState = new_sampler; } inline void setUsedAsRenderTarget() { dirtyRt = 1; } diff --git a/prog/engine/drv/drv3d_vulkan/timeline.h b/prog/engine/drv/drv3d_vulkan/timeline.h index 2027399ec..065f06b6d 100644 --- a/prog/engine/drv/drv3d_vulkan/timeline.h +++ b/prog/engine/drv/drv3d_vulkan/timeline.h @@ -1,11 +1,12 @@ // classes for represenation & managment of various work blocks // with ability to move current execution point #pragma once -#include "driver.h" -#include "osApiWrappers/dag_events.h" -#include "osApiWrappers/dag_critSec.h" -#include "render_work.h" -#include "frame_info.h" +#include +#include 
+#include +#include +#include +#include namespace drv3d_vulkan { @@ -336,37 +337,6 @@ struct TimelineSyncPartNonWaitable }; }; -class TimelineManager -{ - - struct CpuReplaySync : public TimelineSyncPartLockFree, - public TimelineSyncPartSingleWriterSingleReader, - public TimelineSyncPartEventWaitable - {}; - - struct GpuExecuteSync : public TimelineSyncPartLockFree, public TimelineSyncPartNonConcurrent, public TimelineSyncPartNonWaitable - {}; - -public: - typedef Timeline CpuReplay; - typedef Timeline GpuExecute; - -private: - CpuReplay cpuReplay; - GpuExecute gpuExecute; - - TimelineManager(const TimelineManager &) = delete; - -public: - template - T &get(); - - void shutdown(); - void init(); - - TimelineManager() = default; -}; - template class TimelineSpan { @@ -376,7 +346,9 @@ class TimelineSpan TimelineSpan(const TimelineSpan &) = delete; public: - TimelineSpan(TimelineManager &tl_man) : timelineRef(tl_man.get()) {} + template + TimelineSpan(Manager &tl_man) : timelineRef(tl_man.template get()) + {} void start() { @@ -404,4 +376,4 @@ class TimelineSpan typename T::Element &get() { return timelineRef.getElement(idx); } }; -} // namespace drv3d_vulkan \ No newline at end of file +} // namespace drv3d_vulkan diff --git a/prog/engine/drv/drv3d_vulkan/timeline.cpp b/prog/engine/drv/drv3d_vulkan/timelines.cpp similarity index 67% rename from prog/engine/drv/drv3d_vulkan/timeline.cpp rename to prog/engine/drv/drv3d_vulkan/timelines.cpp index cb75d9030..e8d769b1f 100644 --- a/prog/engine/drv/drv3d_vulkan/timeline.cpp +++ b/prog/engine/drv/drv3d_vulkan/timelines.cpp @@ -1,4 +1,4 @@ -#include "timeline.h" +#include "timelines.h" namespace drv3d_vulkan { @@ -15,16 +15,24 @@ TimelineManager::GpuExecute &TimelineManager::get() return gpuExecute; } +template <> +PipelineCompileTimeline &TimelineManager::get() +{ + return pipelineCompileTimeline; +} + void TimelineManager::shutdown() { cpuReplay.shutdown(); gpuExecute.shutdown(); + pipelineCompileTimeline.shutdown(); } 
void TimelineManager::init() { cpuReplay.init(); gpuExecute.init(); + pipelineCompileTimeline.init(); } } // namespace drv3d_vulkan \ No newline at end of file diff --git a/prog/engine/drv/drv3d_vulkan/timelines.h b/prog/engine/drv/drv3d_vulkan/timelines.h new file mode 100644 index 000000000..09cee9245 --- /dev/null +++ b/prog/engine/drv/drv3d_vulkan/timelines.h @@ -0,0 +1,43 @@ +#pragma once +#include "timeline.h" +#include "driver.h" +#include "render_work.h" +#include "frame_info.h" +#include "pipeline/compiler.h" + +namespace drv3d_vulkan +{ + +class TimelineManager +{ + + struct CpuReplaySync : public TimelineSyncPartLockFree, + public TimelineSyncPartSingleWriterSingleReader, + public TimelineSyncPartEventWaitable + {}; + + struct GpuExecuteSync : public TimelineSyncPartLockFree, public TimelineSyncPartNonConcurrent, public TimelineSyncPartNonWaitable + {}; + +public: + typedef Timeline CpuReplay; + typedef Timeline GpuExecute; + +private: + CpuReplay cpuReplay; + GpuExecute gpuExecute; + PipelineCompileTimeline pipelineCompileTimeline; + + TimelineManager(const TimelineManager &) = delete; + +public: + template + T &get(); + + void shutdown(); + void init(); + + TimelineManager() = default; +}; + +} // namespace drv3d_vulkan diff --git a/prog/engine/drv/drv3d_vulkan/vulkan.cpp b/prog/engine/drv/drv3d_vulkan/vulkan.cpp index 9ed2a5eb7..95f681147 100644 --- a/prog/engine/drv/drv3d_vulkan/vulkan.cpp +++ b/prog/engine/drv/drv3d_vulkan/vulkan.cpp @@ -41,6 +41,7 @@ #include #include "../drv3d_commonCode/gpuConfig.h" +#include "../drv3d_commonCode/validate_sbuf_flags.h" #include "driver.h" #include "vulkan_loader.h" @@ -1480,7 +1481,23 @@ bool d3d::check_texformat(int cflg) { auto fmt = FormatStore::fromCreateFlags(cflg); return api_state.device.checkFormatSupport(fmt.asVkFormat(), VK_IMAGE_TYPE_2D, VK_IMAGE_TILING_OPTIMAL, - usage_flags_from_cfg(cflg, fmt), 0, (cflg & TEXCF_MULTISAMPLED) ? 
api_state.device.calcMSAAQuality() : VK_SAMPLE_COUNT_1_BIT); + usage_flags_from_cfg(cflg, fmt), 0, VkSampleCountFlagBits(get_sample_count(cflg))); +} + +int d3d::get_max_sample_count(int cflg) +{ + auto fmt = FormatStore::fromCreateFlags(cflg); + if (auto sampleFlags = api_state.device.getFormatSamples(fmt.asVkFormat(), VK_IMAGE_TYPE_2D, VK_IMAGE_TILING_OPTIMAL, + usage_flags_from_cfg(cflg, fmt))) + { + for (int samples = get_sample_count(TEXCF_SAMPLECOUNT_MAX); samples; samples >>= 1) + { + if (sampleFlags & samples) + return samples; + } + } + + return 1; } bool d3d::issame_texformat(int cflg1, int cflg2) @@ -1494,8 +1511,7 @@ bool d3d::check_cubetexformat(int cflg) { auto fmt = FormatStore::fromCreateFlags(cflg); return api_state.device.checkFormatSupport(fmt.asVkFormat(), VK_IMAGE_TYPE_2D, VK_IMAGE_TILING_OPTIMAL, - usage_flags_from_cfg(cflg, fmt), VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT, - (cflg & TEXCF_MULTISAMPLED) ? api_state.device.calcMSAAQuality() : VK_SAMPLE_COUNT_1_BIT); + usage_flags_from_cfg(cflg, fmt), VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT, VkSampleCountFlagBits(get_sample_count(cflg))); } bool d3d::issame_cubetexformat(int cflg1, int cflg2) { return issame_texformat(cflg1, cflg2); } @@ -1517,7 +1533,7 @@ bool d3d::check_voltexformat(int cflg) cflg &= ~TEXCF_RTARGET; } return api_state.device.checkFormatSupport(fmt.asVkFormat(), VK_IMAGE_TYPE_3D, VK_IMAGE_TILING_OPTIMAL, - usage_flags_from_cfg(cflg, fmt), flags, (cflg & TEXCF_MULTISAMPLED) ? 
api_state.device.calcMSAAQuality() : VK_SAMPLE_COUNT_1_BIT); + usage_flags_from_cfg(cflg, fmt), flags, VkSampleCountFlagBits(get_sample_count(cflg))); } bool d3d::issame_voltexformat(int cflg1, int cflg2) { return issame_texformat(cflg1, cflg2); } @@ -1755,7 +1771,7 @@ PROGRAM d3d::create_program(const uint32_t *vs, const uint32_t *ps, VDECL vdecl, return create_program(vprog, fshad, vdecl, strides, streams); } -PROGRAM d3d::create_program_cs(const uint32_t *cs_native) +PROGRAM d3d::create_program_cs(const uint32_t *cs_native, CSPreloaded) { Tab chunks; Tab chunkData; @@ -2460,18 +2476,21 @@ void d3d::get_video_modes_list(Tab &list) { clear_and_shrink(list); } Vbuffer *d3d::create_vb(int size, int flg, const char *name) { + validate_sbuffer_flags(flg, name); OSSpinlockScopedLock lock{api_state.bufferPoolGuard}; return api_state.bufferPool.allocate(1, size, flg | SBCF_BIND_VERTEX | SBCF_BIND_SHADER_RES, FormatStore(), name); } Ibuffer *d3d::create_ib(int size, int flg, const char *stat_name) { + validate_sbuffer_flags(flg, stat_name); OSSpinlockScopedLock lock{api_state.bufferPoolGuard}; return api_state.bufferPool.allocate(1, size, flg | SBCF_BIND_INDEX | SBCF_BIND_SHADER_RES, FormatStore(), stat_name); } Vbuffer *d3d::create_sbuffer(int struct_size, int elements, unsigned flags, unsigned format, const char *name) { + validate_sbuffer_flags(flags, name); OSSpinlockScopedLock lock{api_state.bufferPoolGuard}; return api_state.bufferPool.allocate(struct_size, elements, flags, FormatStore::fromCreateFlags(format), name); } diff --git a/prog/engine/drv/vr_device/vrDevice.cpp b/prog/engine/drv/vr_device/vrDevice.cpp index 9d292ca97..b5b19f054 100644 --- a/prog/engine/drv/vr_device/vrDevice.cpp +++ b/prog/engine/drv/vr_device/vrDevice.cpp @@ -351,7 +351,7 @@ void VRDevice::prepareScreenMask(const TMatrix4 &projection, int view_index) const char *vbName = view_index == 0 ? "OpenXRVisibilityVMask0" : "OpenXRVisibilityVMask1"; const char *ibName = view_index == 0 ? 
"OpenXRVisibilityIMask0" : "OpenXRVisibilityIMask1"; - int bufFlags = SBCF_MAYBELOST | SBCF_CPU_ACCESS_WRITE; + int bufFlags = SBCF_CPU_ACCESS_WRITE; int vbSize = sizeof(Point4) * visibilityMaskVertices.size(); int ibSize = sizeof(uint16_t) * visibilityMaskIndices.size(); diff --git a/prog/engine/gameRes/collisionGameRes.cpp b/prog/engine/gameRes/collisionGameRes.cpp index a3618cc19..0f5606710 100644 --- a/prog/engine/gameRes/collisionGameRes.cpp +++ b/prog/engine/gameRes/collisionGameRes.cpp @@ -2351,18 +2351,20 @@ bool CollisionResource::testInclusion(const CollisionNode &node_to_test, const T return testInclusion(node_to_test, tm_test, restraining_node.convexPlanes, restrainTm, test_node_tree); } -VECTORCALL bool CollisionResource::rayHit(const mat44f &tm, vec3f v_from, vec3f v_dir, float in_t, int &out_mat_id) const +VECTORCALL bool CollisionResource::rayHit(const mat44f &tm, const Point3 &from, const Point3 &dir, float in_t, int ray_mat_id, + int &out_mat_id) const { uint8_t behaviorFilter = CollisionNode::TRACEABLE; - auto nodeFilter = [&](const CollisionNode * /*node*/) -> bool { return true; }; - - auto callback = [&out_mat_id](int /*trace_id*/, const CollisionNode *node, float /*t*/, vec3f /*normal*/, vec3f /*pos*/) { + auto nodeFilter = [&](const CollisionNode *node) -> bool { + return ray_mat_id == PHYSMAT_INVALID || PhysMat::isMaterialsCollide(ray_mat_id, node->physMatId); + }; + auto callback = [&](int /*trace_id*/, const CollisionNode *node, float /*t*/, vec3f /*normal*/, vec3f /*pos*/) { out_mat_id = node->physMatId; }; - return forEachIntersectedNode(tm, nullptr /*geom_node_tree*/, v_from, v_dir, in_t, - false /*out_normal*/, 1.f /*bsphere_scale*/, behaviorFilter, nodeFilter, callback, nullptr /*stats*/); + return forEachIntersectedNode(tm, nullptr /*geom_node_tree*/, v_ldu(&from.x), + v_ldu(&dir.x), in_t, false /*out_normal*/, 1.f /*bsphere_scale*/, behaviorFilter, nodeFilter, callback, nullptr /*stats*/); } VECTORCALL bool 
CollisionResource::rayHit(const TMatrix &instance_tm, const GeomNodeTree *geom_node_tree, const Point3 &from, diff --git a/prog/engine/guiBase/guiRenderCache.cpp b/prog/engine/guiBase/guiRenderCache.cpp index addc9ba76..a2685d3ef 100644 --- a/prog/engine/guiBase/guiRenderCache.cpp +++ b/prog/engine/guiBase/guiRenderCache.cpp @@ -28,8 +28,8 @@ bool GuiVertexData::create(int num_vertices, int num_indices, const char *name) G_ASSERTF(!vb && !ib, "vb=%p ib=%p, destroy() not called?", ib, vb); int minIndices = max(num_indices, 6); // dummy minimal index buffer int flag32 = (indexSize() > 2) ? SBCF_INDEX32 : 0; - vb = d3d::create_vb(int(num_vertices * elemSize()), SBCF_MAYBELOST | SBCF_DYNAMIC, name); - ib = d3d::create_ib(int(minIndices * indexSize()), SBCF_MAYBELOST | SBCF_DYNAMIC | flag32); + vb = d3d::create_vb(int(num_vertices * elemSize()), SBCF_DYNAMIC, name); + ib = d3d::create_ib(int(minIndices * indexSize()), SBCF_DYNAMIC | flag32); if (vb && ib) { verticesTotal = num_vertices; diff --git a/prog/engine/heightMapLand/compressedHeightmap.cpp b/prog/engine/heightMapLand/compressedHeightmap.cpp index 6f3044154..89310901d 100644 --- a/prog/engine/heightMapLand/compressedHeightmap.cpp +++ b/prog/engine/heightMapLand/compressedHeightmap.cpp @@ -316,7 +316,7 @@ bool CompressedHeightmap::loadData(CompressedHeightmap &hmap, IGenLoad &crd, uns int chunk_cnt = chunk_sz ? 
(hmap.bw * hmap.bh + blocks_per_chunk - 1) / blocks_per_chunk : 0; uint32_t cOfs = 0; - if (chunk_cnt && threadpool::get_num_workers()) + if (chunk_cnt && threadpool::get_num_workers() >= 2) { struct UnpackChunkJob final : cpujobs::IJob { diff --git a/prog/engine/imgui/imguiRenderer.cpp b/prog/engine/imgui/imguiRenderer.cpp index fcfaf78e6..8b9f7c79d 100644 --- a/prog/engine/imgui/imguiRenderer.cpp +++ b/prog/engine/imgui/imguiRenderer.cpp @@ -71,15 +71,13 @@ void DagImGuiRenderer::render(ImDrawData *draw_data) // Create and grow vertex/index buffers if needed if (!vb.getBuf() || vbSize < draw_data->TotalVtxCount) { - vb = dag::create_vb(sizeof(ImDrawVert) * (draw_data->TotalVtxCount + 5000), SBCF_DYNAMIC | SBCF_MAYBELOST | SBCF_CPU_ACCESS_WRITE, - "imgui_vb"); + vb = dag::create_vb(sizeof(ImDrawVert) * (draw_data->TotalVtxCount + 5000), SBCF_DYNAMIC | SBCF_CPU_ACCESS_WRITE, "imgui_vb"); G_ASSERT(vb.getBuf()); vbSize = draw_data->TotalVtxCount + 5000; } if (!ib.getBuf() || ibSize < draw_data->TotalIdxCount) { - ib = dag::create_ib(sizeof(ImDrawIdx) * (draw_data->TotalIdxCount + 10000), SBCF_DYNAMIC | SBCF_MAYBELOST | SBCF_CPU_ACCESS_WRITE, - "imgui_ib"); + ib = dag::create_ib(sizeof(ImDrawIdx) * (draw_data->TotalIdxCount + 10000), SBCF_DYNAMIC | SBCF_CPU_ACCESS_WRITE, "imgui_ib"); G_ASSERT(ib.getBuf()); ibSize = draw_data->TotalIdxCount + 10000; } diff --git a/prog/engine/kernel/dagorHwExcept.cpp b/prog/engine/kernel/dagorHwExcept.cpp index ef57a9645..470dd6c99 100644 --- a/prog/engine/kernel/dagorHwExcept.cpp +++ b/prog/engine/kernel/dagorHwExcept.cpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include @@ -23,6 +25,147 @@ static constexpr int EXCEPT_BUF_SZ = 2048; static char common_buf[EXCEPT_BUF_SZ]; +struct MinidumpExceptionData +{ + PMINIDUMP_EXCEPTION_INFORMATION excInfo = nullptr; + uint64_t stackBase = 0; // crashed thread + uint64_t stackEnd = 0; + uint64_t baseOfImage = 0; // module containing this file + uint64_t 
endOfImage = 0; + int scannedRegs = 0; + + struct MemoryRange + { + uint64_t base; + uint32_t size; + }; + uint32_t memRemoveQueueSize = 0; + MemoryRange memRemoveQueue[32]{}; +}; + +static BOOL dump_memory(uintptr_t addr, uintptr_t bytes_fwd, uintptr_t bytes_back, MinidumpExceptionData *data, + PMINIDUMP_CALLBACK_OUTPUT callback_output) +{ + bool isStack = addr >= data->stackEnd && addr < data->stackBase; + bool isInGameModule = addr >= data->baseOfImage && addr < data->endOfImage; + if (addr > 0x100000 && !(uint64_t(addr) >> 48) && !isStack) + { + MEMORY_BASIC_INFORMATION mem; + if (VirtualQuery((void *)addr, &mem, sizeof(mem))) + { + uintptr_t start = max(addr - bytes_back, (uintptr_t)mem.BaseAddress); + uintptr_t end = min(addr + bytes_fwd, (uintptr_t)mem.BaseAddress + (uintptr_t)mem.RegionSize); + const int dataProt = PAGE_READONLY | PAGE_READWRITE; + const int codeProt = PAGE_EXECUTE | PAGE_EXECUTE_READ | PAGE_EXECUTE_READWRITE; + bool excludeAsKnownCode = (mem.Protect & codeProt) && isInGameModule; + if ((mem.State & MEM_COMMIT) && (mem.Protect & (dataProt | codeProt)) && !excludeAsKnownCode && end > start) + { + callback_output->MemoryBase = int64_t(intptr_t(start)); + callback_output->MemorySize = end - start; + return TRUE; + } + } + } + return FALSE; +} + +static BOOL minidump_memory_callback(MinidumpExceptionData *data, PMINIDUMP_CALLBACK_OUTPUT callback_output) +{ +#if _TARGET_64BIT + const int regsToScan = 16; + const uint64_t *regsBase = (uint64_t *)&data->excInfo->ExceptionPointers->ContextRecord->Rax; +#else + const int regsToScan = 7; + const uint32_t *regsBase = (uint32_t *)&data->excInfo->ExceptionPointers->ContextRecord->Edi; +#endif + while (data->scannedRegs < regsToScan) + { + uintptr_t regValue = regsBase[data->scannedRegs++]; + if (dump_memory(regValue, 128 /*fwd*/, 32 /*back*/, data, callback_output)) + return TRUE; + } + return FALSE; +} + +static BOOL CALLBACK minidump_callback(PVOID callback_param, const PMINIDUMP_CALLBACK_INPUT 
callback_input, + PMINIDUMP_CALLBACK_OUTPUT callback_output) +{ + MinidumpExceptionData *data = (MinidumpExceptionData *)callback_param; + switch (callback_input->CallbackType) //-V::1037 Two or more case-branches perform the same actions. + { + case ModuleCallback: + if (uint64_t(&minidump_callback) > callback_input->Module.BaseOfImage && + uint64_t(&minidump_callback) < callback_input->Module.BaseOfImage + callback_input->Module.SizeOfImage) + { + data->baseOfImage = callback_input->Module.BaseOfImage; + data->endOfImage = callback_input->Module.BaseOfImage + callback_input->Module.SizeOfImage; + } + // Here is common filter effective in most cases, but I prefer to have full memory map + // if (!(callback_output->ModuleWriteFlags & ModuleReferencedByMemory)) + // callback_output->ModuleWriteFlags &= ~ModuleWriteModule; + return TRUE; + + case IncludeThreadCallback: + { + bool saveStack = true; + if (callback_input->IncludeThread.ThreadId == data->excInfo->ThreadId || + callback_input->IncludeThread.ThreadId == get_main_thread_id() || + DaThread::isDaThreadWinUnsafe(callback_input->IncludeThread.ThreadId, saveStack)) + { + callback_output->ThreadWriteFlags = ThreadWriteThread | ThreadWriteContext; + if (saveStack) + callback_output->ThreadWriteFlags |= ThreadWriteStack; + if (callback_input->IncludeThread.ThreadId == data->excInfo->ThreadId) + callback_output->ThreadWriteFlags |= ThreadWriteInstructionWindow; + return TRUE; + } + return FALSE; + } + + case MemoryCallback: return minidump_memory_callback(data, callback_output); + + case RemoveMemoryCallback: + if (data->memRemoveQueueSize) + { + MinidumpExceptionData::MemoryRange range = data->memRemoveQueue[--data->memRemoveQueueSize]; + callback_output->MemoryBase = range.base; + callback_output->MemorySize = range.size; + return TRUE; + } + return FALSE; + + case ThreadCallback: + case ThreadExCallback: + if (callback_input->Thread.ThreadId == data->excInfo->ThreadId) + { + data->stackBase = 
callback_input->Thread.StackBase; + data->stackEnd = callback_input->Thread.StackEnd; + } + if (!(callback_output->ThreadWriteFlags & ThreadWriteStack)) + { + // ThreadWriteContext saves stack, so we need to strip it manually +#if _TARGET_64BIT + uint64_t sp = callback_input->Thread.Context.Rsp; +#else + uint64_t sp = int64_t(intptr_t(callback_input->Thread.Context.Esp)); +#endif + uint64_t base = sp + 128; + uint32_t size = uint32_t(uintptr_t(callback_input->Thread.StackBase) - uintptr_t(base)); + if (int(size) > 0) + { + if (data->memRemoveQueueSize < countof(data->memRemoveQueue)) + data->memRemoveQueue[data->memRemoveQueueSize++] = MinidumpExceptionData::MemoryRange{base, size}; + else + callback_output->ThreadWriteFlags = 0; + } + } + return TRUE; + + case IncludeModuleCallback: return TRUE; + + default: return FALSE; + } +} static void __cdecl hard_except_handler_named(EXCEPTION_POINTERS *eptr, char *buf, int buf_len) { @@ -61,10 +204,14 @@ static void __cdecl hard_except_handler_named(EXCEPTION_POINTERS *eptr, char *bu if (INVALID_HANDLE_VALUE != hDumpFile) { MINIDUMP_EXCEPTION_INFORMATION minidumpExcInfo = {::GetCurrentThreadId(), eptr, FALSE}; - MINIDUMP_TYPE minidump_type = (MINIDUMP_TYPE)(MiniDumpScanMemory | MiniDumpWithIndirectlyReferencedMemory); + MinidumpExceptionData param; + param.excInfo = &minidumpExcInfo; + MINIDUMP_CALLBACK_INFORMATION mci; + mci.CallbackRoutine = minidump_callback; + mci.CallbackParam = (void *)¶m; - if (MiniDumpWriteDump(::GetCurrentProcess(), ::GetCurrentProcessId(), hDumpFile, minidump_type, &minidumpExcInfo, NULL, NULL)) + if (MiniDumpWriteDump(::GetCurrentProcess(), ::GetCurrentProcessId(), hDumpFile, minidump_type, &minidumpExcInfo, NULL, &mci)) { debug_internal::dbgCrashDumpPath[0] = 0; // no more dumps } diff --git a/prog/engine/kernel/debugPrivate.h b/prog/engine/kernel/debugPrivate.h index 11f222ecd..e774183b8 100644 --- a/prog/engine/kernel/debugPrivate.h +++ b/prog/engine/kernel/debugPrivate.h @@ -6,6 +6,8 @@ 
#include #include "writeStream.h" +#define FORCE_THREAD_IDS _TARGET_XBOX || _TARGET_C1 || _TARGET_C2 + namespace debug_internal { extern bool flush_debug; @@ -18,7 +20,7 @@ extern debug_log_callback_t log_callback; struct Context { -#if DAGOR_FORCE_LOGS || DAGOR_DBGLEVEL > 0 +#if DAGOR_FORCE_LOGS || DAGOR_DBGLEVEL > 0 || FORCE_THREAD_IDS int threadId; const char *threadName; #endif diff --git a/prog/engine/kernel/logimpl.cpp b/prog/engine/kernel/logimpl.cpp index e8303dba4..3d8004bbb 100644 --- a/prog/engine/kernel/logimpl.cpp +++ b/prog/engine/kernel/logimpl.cpp @@ -134,7 +134,7 @@ static file_ptr_t xbox_debug_file = NULL; #define LOG_TAIL_BUF (_TARGET_XBOX || _TARGET_C1 || _TARGET_C2 || _TARGET_ANDROID || _TARGET_IOS) -#if DAGOR_DBGLEVEL > 0 +#if DAGOR_DBGLEVEL > 0 || FORCE_THREAD_IDS static int next_thread_id = 1; #endif @@ -143,7 +143,7 @@ static bool dbg_tid_enabled = false; void debug_enable_thread_ids(bool en_tid) { dbg_tid_enabled = en_tid; } void debug_set_thread_name(const char *persistent_thread_name_ptr) { -#if !DAGOR_FORCE_LOGS || DAGOR_DBGLEVEL > 0 +#if !DAGOR_FORCE_LOGS || DAGOR_DBGLEVEL > 0 || FORCE_THREAD_IDS (&debug_internal::dbg_ctx)->threadName = persistent_thread_name_ptr; #else (void)(persistent_thread_name_ptr); @@ -266,7 +266,7 @@ void debug_internal::vlog(int tag, const char *format, const void *arg, int anum char *buf = vlog_buf; int buf_used_len = i_strlen(buf); -#if !DAGOR_FORCE_LOGS || DAGOR_DBGLEVEL > 0 +#if !DAGOR_FORCE_LOGS || DAGOR_DBGLEVEL > 0 || FORCE_THREAD_IDS if (!dbg_tid_enabled || (&dbg_ctx)->threadId) ; // do nothing else if (is_main_thread()) @@ -282,7 +282,7 @@ void debug_internal::vlog(int tag, const char *format, const void *arg, int anum return; int t = debug_internal::timestampEnabled ? get_time_msec() : -1; -#if !DAGOR_FORCE_LOGS || DAGOR_DBGLEVEL > 0 +#if !DAGOR_FORCE_LOGS || DAGOR_DBGLEVEL > 0 || FORCE_THREAD_IDS int thread_id = dbg_tid_enabled ? 
(&dbg_ctx)->threadId - 1 : -1; #else int thread_id = -1; diff --git a/prog/engine/lib3d/ddsxDec.h b/prog/engine/lib3d/ddsxDec.h index 07bb27809..93fa7be00 100644 --- a/prog/engine/lib3d/ddsxDec.h +++ b/prog/engine/lib3d/ddsxDec.h @@ -136,7 +136,11 @@ struct DDSxDecodeCtx : DDSxDecodeCtxBase DDSxDecodeCtxBase *ctx; unsigned wIdx = 0; - DecThread() : DaThread("DDSX decoder", 128 << 10), ctx(NULL) { os_event_create(&event, NULL); } + DecThread() : DaThread("DDSX decoder", 128 << 10), ctx(NULL) + { + stripStackInMinidump(); + os_event_create(&event, NULL); + } ~DecThread() { os_event_destroy(&event); } void execute() override diff --git a/prog/engine/lib3d/debug3dBuffered.cpp b/prog/engine/lib3d/debug3dBuffered.cpp index 5f6f98aa9..ce1078a70 100644 --- a/prog/engine/lib3d/debug3dBuffered.cpp +++ b/prog/engine/lib3d/debug3dBuffered.cpp @@ -14,9 +14,11 @@ struct BufferedLine static Tab buffered_line_list(midmem_ptr()); // Note: sorted by deadline static size_t current_frame = 0; +static bool last_frame_game_was_paused = false; -void draw_debug_line_buffered(const Point3 &p0, const Point3 &p1, E3DCOLOR c, size_t frames) +void draw_debug_line_buffered(const Point3 &p0, const Point3 &p1, E3DCOLOR c, size_t requested_frames) { + size_t frames = last_frame_game_was_paused ? 
1 : requested_frames; size_t deadlineFrame = current_frame + frames; for (int last = buffered_line_list.size() - 1, i = last; i >= 0; --i) // lookup place to insert into (according to deadline) if (deadlineFrame >= buffered_line_list[i].deadlineFrame) @@ -305,12 +307,13 @@ static int draw_buffered_lines(dag::ConstSpan lines, size_t cur_fr return eraseNum; } -void flush_buffered_debug_lines(bool decriment_buffer_frames) +void flush_buffered_debug_lines(bool game_is_paused) { + last_frame_game_was_paused = game_is_paused; if (buffered_line_list.empty()) return; - if (!decriment_buffer_frames) + if (game_is_paused) for (auto &line : buffered_line_list) if (line.bufferFrames > 1) line.deadlineFrame++; diff --git a/prog/engine/lib3d/quadIndexBuffer.cpp b/prog/engine/lib3d/quadIndexBuffer.cpp index 3da033d05..9b25b4744 100644 --- a/prog/engine/lib3d/quadIndexBuffer.cpp +++ b/prog/engine/lib3d/quadIndexBuffer.cpp @@ -64,7 +64,7 @@ static const uint16_t box_indices[36] = { static void try_to_init_box_16bit(const bool modify_counter = true) { - box16bit = d3d::create_ib(36 * sizeof(uint16_t), SBCF_MAYBELOST, "ib_box_16bit"); + box16bit = d3d::create_ib(36 * sizeof(uint16_t), 0, "ib_box_16bit"); d3d_err(box16bit); uint16_t *indices = nullptr; @@ -89,7 +89,7 @@ static void try_to_init_quads_16bit(const bool modify_counter = true) const uint32_t POINTS_COUNT = 1 << 16; const uint32_t QUADS_COUNT = POINTS_COUNT / 4; const uint32_t INDICES_COUNT = QUADS_COUNT * 6; - quads16bit = d3d::create_ib(INDICES_COUNT * sizeof(uint16_t), SBCF_MAYBELOST, "ib_quads_16bit"); + quads16bit = d3d::create_ib(INDICES_COUNT * sizeof(uint16_t), 0, "ib_quads_16bit"); d3d_err(quads16bit); uint16_t *indices = nullptr; @@ -113,7 +113,7 @@ static void try_to_init_quads_32bit(const bool modify_counter = true) { if (quads32bitIndicesCount == 0) return; - Ibuffer *biggerBuffer = d3d::create_ib(quads32bitIndicesCount * sizeof(uint32_t), SBCF_INDEX32 | SBCF_MAYBELOST, "ib_quads_32bit"); + Ibuffer 
*biggerBuffer = d3d::create_ib(quads32bitIndicesCount * sizeof(uint32_t), SBCF_INDEX32, "ib_quads_32bit"); d3d_err(biggerBuffer); uint32_t *indices = nullptr; diff --git a/prog/engine/lib3d/serialIntBuffer.cpp b/prog/engine/lib3d/serialIntBuffer.cpp index a2e7ca005..b1ed3c5c9 100644 --- a/prog/engine/lib3d/serialIntBuffer.cpp +++ b/prog/engine/lib3d/serialIntBuffer.cpp @@ -59,7 +59,7 @@ static Vbuffer *try_to_init(uint32_t count) { if (count == 0) return nullptr; - Vbuffer *ints = d3d::create_vb(count * sizeof(uint32_t), SBCF_MAYBELOST, "serial_ints_vb"); + Vbuffer *ints = d3d::create_vb(count * sizeof(uint32_t), 0, "serial_ints_vb"); d3d_err(ints); if (fill(ints, count)) return ints; diff --git a/prog/engine/libFx/effectsInterface.cpp b/prog/engine/libFx/effectsInterface.cpp index 765d43465..c1082bab3 100644 --- a/prog/engine/libFx/effectsInterface.cpp +++ b/prog/engine/libFx/effectsInterface.cpp @@ -517,9 +517,7 @@ void EffectsInterface::startup() for (int i = 0; i < countof(shaderNames); ++i) { - // compatibility: new premultalpha shaders are optional - bool optional = (i % FX__NUM_STD_SHADERS) >= (FX__NUM_STD_SHADERS - 3); - registerStdParticleCustomShader(shaderNames[i], optional); + registerStdParticleCustomShader(shaderNames[i], true); } lighting_power_vid = get_shader_variable_id("lighting_power", true); diff --git a/prog/engine/libFx/leavesWind.cpp b/prog/engine/libFx/leavesWind.cpp index 6c7a87d9a..0d3586073 100644 --- a/prog/engine/libFx/leavesWind.cpp +++ b/prog/engine/libFx/leavesWind.cpp @@ -2,6 +2,7 @@ #include #include #include <3d/dag_render.h> +#include #include #include diff --git a/prog/engine/osApiWrappers/threads.cpp b/prog/engine/osApiWrappers/threads.cpp index aae3278a4..a2994f969 100644 --- a/prog/engine/osApiWrappers/threads.cpp +++ b/prog/engine/osApiWrappers/threads.cpp @@ -337,6 +337,13 @@ void DaThread::setThreadIdealProcessor(int ideal_processor_no) #endif } +void DaThread::stripStackInMinidump() +{ +#if _TARGET_PC_WIN | _TARGET_XBOX 
+ minidumpSaveStack = false; +#endif +} + #if defined(HAVE_PTHREAD) void *DaThread::threadEntry(void *arg) { @@ -532,5 +539,21 @@ void DaThread::terminate_all(bool wait, int timeout_ms) threads_list_head->terminate(false); } +#if _TARGET_PC_WIN | _TARGET_XBOX +bool DaThread::isDaThreadWinUnsafe(uintptr_t thread_id, bool &minidump_save_stack) +{ + minidump_save_stack = false; + for (DaThread *t = threads_list_head; t; t = t->nextThread) + { + if (GetThreadId((HANDLE)t->id) == thread_id) + { + minidump_save_stack = t->minidumpSaveStack; + return true; + } + } + return false; +} +#endif + #define EXPORT_PULL dll_pull_osapiwrappers_threads #include diff --git a/prog/engine/perfMon/daProfiler/stackUnwinder/unwindStackWin.cpp b/prog/engine/perfMon/daProfiler/stackUnwinder/unwindStackWin.cpp index 60836f6d3..35cebf7d6 100644 --- a/prog/engine/perfMon/daProfiler/stackUnwinder/unwindStackWin.cpp +++ b/prog/engine/perfMon/daProfiler/stackUnwinder/unwindStackWin.cpp @@ -394,6 +394,7 @@ ThreadStackUnwinder *start_unwind_thread_stack(ThreadStackUnwindProvider &, intp int unwind_thread_stack(ThreadStackUnwindProvider &, ThreadStackUnwinder &s, uint64_t *addresses, size_t max_size) { + return 0; // directly and immediately unroll whole stack. 
there is no risk of having deadlock HANDLE thread = (HANDLE)(&s); if (SuspendThread(thread) == ~((DWORD)0)) //-V720 diff --git a/prog/engine/perfMon/visClipMesh.cpp b/prog/engine/perfMon/visClipMesh.cpp index 019b95f2b..8d8b39873 100644 --- a/prog/engine/perfMon/visClipMesh.cpp +++ b/prog/engine/perfMon/visClipMesh.cpp @@ -74,9 +74,9 @@ bool create_visclipmesh(CfgReader &cfg, bool for_game) vcm_lines = false; } - vcm_vb = d3d::create_vb((MAX_VISCLIPMESH_FACETS * 6) * sizeof(VisClipMeshVertex), SBCF_DYNAMIC | SBCF_MAYBELOST, __FILE__); + vcm_vb = d3d::create_vb((MAX_VISCLIPMESH_FACETS * 6) * sizeof(VisClipMeshVertex), SBCF_DYNAMIC, __FILE__); d3d_err(vcm_vb); - vcm_ib = d3d::create_ib(MAX_VISCLIPMESH_FACETS * 3 * sizeof(uint16_t), SBCF_DYNAMIC | SBCF_MAYBELOST); + vcm_ib = d3d::create_ib(MAX_VISCLIPMESH_FACETS * 3 * sizeof(uint16_t), SBCF_DYNAMIC); d3d_err(vcm_ib); // vcm_consoleproc.demandInit(); diff --git a/prog/engine/phys/fastPhys/fastPhysDebug.cpp b/prog/engine/phys/fastPhys/fastPhysDebug.cpp index c9dcdaa1d..812ecedd7 100644 --- a/prog/engine/phys/fastPhys/fastPhysDebug.cpp +++ b/prog/engine/phys/fastPhys/fastPhysDebug.cpp @@ -1,20 +1,6 @@ #include #include #include -#include - -eastl::set debugAnimCharsSet; - -void FastPhys::toggleDebugAnimChar(eastl::string &str) -{ - auto it = debugAnimCharsSet.find(str); - if (it != debugAnimCharsSet.end()) - debugAnimCharsSet.erase(it); - else - debugAnimCharsSet.insert(str); -} -bool FastPhys::checkDebugAnimChar(eastl::string &str) { return debugAnimCharsSet.find(str) != debugAnimCharsSet.end(); } -void FastPhys::resetDebugAnimChars() { debugAnimCharsSet.clear(); } void FastPhysSystem::debugRender() diff --git a/prog/engine/phys/physJolt/joltPhysics.cpp b/prog/engine/phys/physJolt/joltPhysics.cpp index 7c31a2de7..2ad166d5b 100644 --- a/prog/engine/phys/physJolt/joltPhysics.cpp +++ b/prog/engine/phys/physJolt/joltPhysics.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include namespace layers @@ -128,6 +129,7 
@@ class JoltJobSystemImpl final : public JPH::JobSystem OSSpinlock fbaJobsSL, fbaBarriersSL; unsigned maxThreads; void *chunkHoldPtr1, *chunkHoldPtr2; //< to prevent chunk release when last work block is released + cpujobs::IJob *delayedFreeTail = nullptr; public: JoltJobSystemImpl(unsigned max_jobs, unsigned max_barriers, unsigned max_threads) : @@ -138,8 +140,29 @@ class JoltJobSystemImpl final : public JPH::JobSystem } ~JoltJobSystemImpl() { - fbaJobs.freeOneBlock(chunkHoldPtr1); fbaBarriers.freeOneBlock(chunkHoldPtr2); + + uint32_t _1, _2, usedMem = 0; + { + OSSpinlockScopedLock lock(fbaJobsSL); + flushDelayedFreeList(); + fbaJobs.freeOneBlock(chunkHoldPtr1); + fbaJobs.getMemoryStats(_1, _2, usedMem); + if (DAGOR_LIKELY(usedMem == 0)) + { + G_FAST_ASSERT(!delayedFreeTail); + return; + } + } + + // Wait for rare jobs that wasn't added/waited by barrier + spin_wait([&] { + fbaJobsSL.lock(); + flushDelayedFreeList(); + fbaJobs.getMemoryStats(_1, _2, usedMem); + fbaJobsSL.unlock(); + return usedMem != 0; + }); } struct JobImpl final : public cpujobs::IJob, public Job @@ -263,7 +286,15 @@ class JoltJobSystemImpl final : public JPH::JobSystem OSSpinlockScopedLock lock(fbaBarriersSL); return fbaBarriers.allocateOneBlock(); } - Barrier *CreateBarrier() override { return ::new (allocBarrierMem(), _NEW_INPLACE) BarrierImpl; } + Barrier *CreateBarrier() override + { + if (DAGOR_UNLIKELY(interlocked_relaxed_load_ptr(delayedFreeTail))) + { + OSSpinlockScopedLock lock(fbaJobsSL); + flushDelayedFreeList(); + } + return ::new (allocBarrierMem(), _NEW_INPLACE) BarrierImpl; + } /// Destroy a barrier when it is no longer used. The barrier should be empty at this point. 
void DestroyBarrier(Barrier *inBarrier) override @@ -301,14 +332,29 @@ class JoltJobSystemImpl final : public JPH::JobSystem { auto jptr = static_cast(inJob); G_FAST_ASSERT(inJob->IsDone()); - - // Addition to barrier can be skipped if job was already done (IsDone()=true) at Jolt, - // but we still have to wait for it to be done in threadpool - threadpool::wait(jptr, 0, jptr->tprio); - jptr->~JobImpl(); + // Addition to barrier can be skipped if job was already done (IsDone()=true) at Jolt. It might cause + // this method to be called from job itself causing it not to be `done' by threadpool side OSSpinlockScopedLock lock(fbaJobsSL); - fbaJobs.freeOneBlock(jptr); + if (interlocked_acquire_load(jptr->done)) + fbaJobs.freeOneBlock(jptr); + else // Add to list in order to free it later + { + jptr->next = delayedFreeTail; + delayedFreeTail = jptr; + } + } + + void flushDelayedFreeList() + { + for (auto *j2free = interlocked_acquire_load_ptr(delayedFreeTail); j2free;) + { + auto jprev = j2free->next; + threadpool::wait(j2free, 0, JobImpl::tprio); + fbaJobs.freeOneBlock(j2free); + j2free = jprev; + } + delayedFreeTail = nullptr; } }; @@ -583,8 +629,11 @@ void PhysWorld::init_engine(bool single_threaded) void PhysWorld::term_engine() { using namespace jolt_api; - del_it(physicsSystem); + + // Note: phys world which does `fetchSimRes` in its dtor is assumed to be destroyed at this point del_it(jobSystem); + + del_it(physicsSystem); del_it(tempAllocator); } @@ -612,7 +661,7 @@ PhysBody::PhysBody(PhysWorld *w, float mass, const PhysCollision *coll, const TM G_ASSERTF(objlayer_to_layer_mask(body.mObjectLayer) == layerMask, "layerMask=0x%x -> objLayer=0x%x", layerMask, (unsigned)body.mObjectLayer); - body.mMotionType = mass == 0.f ? (s.kinematic ? JPH::EMotionType::Kinematic : JPH::EMotionType::Static) : JPH::EMotionType::Dynamic; + body.mMotionType = isDynamic ? JPH::EMotionType::Dynamic : (s.kinematic ? 
JPH::EMotionType::Kinematic : JPH::EMotionType::Static); if (body.mMotionType != JPH::EMotionType::Static) { body.mOverrideMassProperties = diff --git a/prog/engine/shaders/debugPrimitivesVbuffer.cpp b/prog/engine/shaders/debugPrimitivesVbuffer.cpp index 2a4d55354..f46c800c7 100644 --- a/prog/engine/shaders/debugPrimitivesVbuffer.cpp +++ b/prog/engine/shaders/debugPrimitivesVbuffer.cpp @@ -215,12 +215,12 @@ void DebugPrimitivesVbuffer::endCache() } const size_t vbuffer_size = vertexList.size() * sizeof(Point3); - vbuffer = d3d::create_vb((int)(vbuffer_size), SBCF_MAYBELOST, name); + vbuffer = d3d::create_vb((int)(vbuffer_size), 0, name); vbuffer->updateData(0, (uint32_t)vbuffer_size, (void *)vertexList.data(), 0); String ibName = String(0, "%s_ib", name); const size_t ibuffer_size = indices.size() * sizeof(int); - ibuffer = d3d::create_ib((int)ibuffer_size, SBCF_INDEX32 | SBCF_MAYBELOST, ibName); + ibuffer = d3d::create_ib((int)ibuffer_size, SBCF_INDEX32, ibName); ibuffer->updateData(0, (uint32_t)ibuffer_size, (void *)indices.data(), 0); clear_and_shrink(linesCache); diff --git a/prog/engine/shaders/dynamicShadersBuffer.cpp b/prog/engine/shaders/dynamicShadersBuffer.cpp index b5698ac85..85b51b7b0 100644 --- a/prog/engine/shaders/dynamicShadersBuffer.cpp +++ b/prog/engine/shaders/dynamicShadersBuffer.cpp @@ -13,7 +13,7 @@ class DynamicShadersBuffer::VertexBuffer void create(int vertex_count) { close(); - d3d_err(vb = d3d::create_vb(vertex_count * stride, SBCF_MAYBELOST | SBCF_DYNAMIC, __FILE__)); + d3d_err(vb = d3d::create_vb(vertex_count * stride, SBCF_DYNAMIC, __FILE__)); curVert = 0; size = vertex_count; } @@ -114,7 +114,7 @@ class DynamicShadersBuffer::IndexBuffer void create(int ind_count) { close(); - d3d_err(ib = d3d::create_ib(ind_count * sizeof(uint32_t), SBCF_MAYBELOST | SBCF_INDEX32 | SBCF_DYNAMIC)); + d3d_err(ib = d3d::create_ib(ind_count * sizeof(uint32_t), SBCF_INDEX32 | SBCF_DYNAMIC)); curInd = 0; size = ind_count; } diff --git 
a/prog/engine/shaders/matVdataLoad.cpp b/prog/engine/shaders/matVdataLoad.cpp index c4f592f4b..b0cd8f482 100644 --- a/prog/engine/shaders/matVdataLoad.cpp +++ b/prog/engine/shaders/matVdataLoad.cpp @@ -302,14 +302,14 @@ void ShaderMatVdata::loadMatVdata(const char *name, IGenLoad &crd, unsigned flag Sbuffer::IReloadData *rld = this; unsigned used_d3d_buf_cnt = 0; - for (int i = 0; i < vdata.size(); i++) + for (int i = vdata.size() - 1; i >= 0; i--) { if ((!vdata[i].testFlags(VDATA_NO_VB) && vdata[i].getVB()) || (!vdata[i].testFlags(VDATA_NO_IB) && vdata[i].getIB())) used_d3d_buf_cnt++; - if (!vdata[i].testFlags(VDATA_NO_VB) && vdata[i].getVB() && vdata[i].getVB()->setReloadCallback(rld)) - rld = &stub; if (!vdata[i].testFlags(VDATA_NO_IB) && vdata[i].getIB() && vdata[i].getIB()->setReloadCallback(rld)) rld = &stub; + if (!vdata[i].testFlags(VDATA_NO_VB) && vdata[i].getVB() && vdata[i].getVB()->setReloadCallback(rld)) + rld = &stub; } if (rld != this) matVdataSrcRef.fname = dagor_fname_map_add_fn(crd.getTargetName()); diff --git a/prog/engine/shaders/postFxRenderer.cpp b/prog/engine/shaders/postFxRenderer.cpp index 136bf0027..9743c081f 100644 --- a/prog/engine/shaders/postFxRenderer.cpp +++ b/prog/engine/shaders/postFxRenderer.cpp @@ -22,13 +22,13 @@ void PostFxRenderer::clear() shElem = NULL; } -void PostFxRenderer::init(const char *shader_name) +void PostFxRenderer::init(const char *shader_name, bool is_optional) { shmat = new_shader_material_by_name_optional(shader_name, nullptr); if (shmat.get()) shElem = shmat->make_elem(); if (!shmat.get() || !shElem.get()) - logerr("PostFxRenderer: shader '%s' not found.", shader_name); + logmessage(is_optional ? 
LOGLEVEL_DEBUG : LOGLEVEL_ERR, "PostFxRenderer: shader '%s' not found.", shader_name); } void PostFxRenderer::drawInternal(int num_tiles) const diff --git a/prog/engine/shaders/rendInstRes.cpp b/prog/engine/shaders/rendInstRes.cpp index 9e2e979a1..82d7d05b5 100644 --- a/prog/engine/shaders/rendInstRes.cpp +++ b/prog/engine/shaders/rendInstRes.cpp @@ -33,6 +33,7 @@ RenderableInstanceLodsResource::ImpostorTextures RenderableInstanceLodsResource: VAR(impostor_albedo_alpha) \ VAR(impostor_normal_translucency) \ VAR(impostor_ao_smoothness) \ + VAR(impostor_preshadow) \ VAR(cross_dissolve_mul) \ VAR(cross_dissolve_add) @@ -273,11 +274,6 @@ RenderableInstanceLodsResource *RenderableInstanceLodsResource::makeStubRes(cons bool RenderableInstanceLodsResource::isBakedImpostor() const { return getImpostorParams().hasBakedTexture(); } -BaseTexture *RenderableInstanceLodsResource::getPreshadowTexture() const -{ - return isBakedImpostor() ? getImpostorTextures().shadowAtlasTex : nullptr; -} - bool RenderableInstanceLodsResource::setImpostorVars(ShaderMaterial *mat, int buffer_offset) const { auto &impostorTextures = getImpostorTextures(); @@ -291,6 +287,7 @@ bool RenderableInstanceLodsResource::setImpostorVars(ShaderMaterial *mat, int bu res = mat->set_texture_param(impostor_albedo_alphaVarId, impostorTextures.albedo_alpha) && res; res = mat->set_texture_param(impostor_normal_translucencyVarId, impostorTextures.normal_translucency) && res; res = mat->set_texture_param(impostor_ao_smoothnessVarId, impostorTextures.ao_smoothness) && res; + res = mat->set_texture_param(impostor_preshadowVarId, impostorTextures.shadowAtlas) && res; if (isBakedImpostor()) G_ASSERT(impostorTextures.albedo_alpha != BAD_TEXTUREID && impostorTextures.normal_translucency != BAD_TEXTUREID && @@ -479,9 +476,10 @@ void RenderableInstanceLodsResource::prepareTextures(const char *name, uint32_t { auto ¶ms = getImpostorParamsE(); auto &impostorTextures = getImpostorTexturesE(); - impostorTextures.close(); if 
(hasImpostorData()) { + if (impostorTextures.isInitialized()) + return; impostorTextures.albedo_alpha = ::get_tex_gameres(String(0, "%s_aa", name)); if (impostorTextures.albedo_alpha != BAD_TEXTUREID) { @@ -518,15 +516,14 @@ void RenderableInstanceLodsResource::prepareTextures(const char *name, uint32_t texture_format_flags | TEXCF_CLEAR_ON_CREATE, levelCount, textureName.c_str()); if (shadowTex) { - impostorTextures.shadowAtlas = register_managed_tex(name, shadowTex); + impostorTextures.shadowAtlas = register_managed_tex(textureName, shadowTex); shadowTex->texlod(0); shadowTex->texfilter(TEXFILTER_DEFAULT); shadowTex->setAnisotropy(1); add_anisotropy_exception(impostorTextures.shadowAtlas); impostorTextures.shadowAtlasTex = acquire_managed_tex(impostorTextures.shadowAtlas); - release_managed_tex(impostorTextures.shadowAtlas); // get_tex_gameres() or register_managed_tex() hold main ref to - // texture + release_managed_tex(impostorTextures.shadowAtlas); // get_tex_gameres() or register_managed_tex() hold main ref to tex } } } diff --git a/prog/engine/shaders/scriptSElem.cpp b/prog/engine/shaders/scriptSElem.cpp index 19c338fd6..e7d54f328 100644 --- a/prog/engine/shaders/scriptSElem.cpp +++ b/prog/engine/shaders/scriptSElem.cpp @@ -179,7 +179,7 @@ static PROGRAM get_compute_prg(int i) FSHADER sh = BAD_PROGRAM; d3d::driver_command(DRV3D_COMMAND_GET_SHADER, void_ptr_cast(i), void_ptr_cast(ShaderCodeType::COMPUTE), &sh); if (sh == BAD_PROGRAM) - sh = d3d::create_program_cs(shBinDumpOwner().getCode(i, ShaderCodeType::COMPUTE, tmpbuf).data()); + sh = d3d::create_program_cs(shBinDumpOwner().getCode(i, ShaderCodeType::COMPUTE, tmpbuf).data(), CSPreloaded::Yes); shBinDumpRW().fshId[i] = sh; restore_fp_exceptions_state(); if (shBinDump().fshId[i] == BAD_PROGRAM) @@ -221,7 +221,12 @@ __forceinline void ScriptedShaderElement::prepareShaderProgram(ID_T &pass_id, in if (debugInfoStr.empty()) { bool hvc = variant_code != ~0u; - debugInfoStr.printf(0, "%s%s", (const char 
*)shClass.name, hvc ? "\n" : ""); +#if _TARGET_C1 || _TARGET_C2 + +#else + const char *separator = "\n"; +#endif + debugInfoStr.printf(0, "%s%s", (const char *)shClass.name, hvc ? separator : ""); if (hvc) shaderbindump::decodeVariantStr(code.dynVariants.codePieces, variant_code, debugInfoStr); } @@ -958,6 +963,14 @@ void ScriptedShaderElement::exec_stcode(dag::ConstSpan cod, const shaderbin d3d::set_tex(stageDest, ind, tex); } break; + case SHCOD_SAMPLER: + { + const uint32_t slot = shaderopcode::getOp2p1(opc); + const uint32_t id = shaderopcode::getOp2p2(opc); + d3d::SamplerHandle smp = shBinDump().globVars.get(id); + d3d::set_sampler(stageDest, slot, smp); + } + break; case SHCOD_TEXTURE_VS: { TEXTUREID tid = tex_reg(regs, shaderopcode::getOp2p2(opc)); diff --git a/prog/engine/shaders/shStateBlockBindless.cpp b/prog/engine/shaders/shStateBlockBindless.cpp index 27dbb30e7..7c5d2aa31 100644 --- a/prog/engine/shaders/shStateBlockBindless.cpp +++ b/prog/engine/shaders/shStateBlockBindless.cpp @@ -43,7 +43,7 @@ static eastl::deque, EASTLAlloca enum PackedConstsStateBits { - STCODE_BITS = 12, + PACKED_STCODE_BITS = 12, BUFFER_BITS = 7, LOCAL_STATE_BITS = 13 }; @@ -76,30 +76,25 @@ static uint32_t allocate_packed_cell(int stcode_id) return stcodeIdToPacked[stcode_id]; } -static inline uint32_t remap_stcode_id(int stcode_id) -{ - if (EASTL_UNLIKELY((uint32_t)stcode_id >= stcodeIdToPacked.size() || stcodeIdToPacked[stcode_id] == INVALID_MAT_ID)) - fatal("Packed material stcode=%d/%d not created", stcode_id, stcodeIdToPacked.size()); - return stcodeIdToPacked[stcode_id]; -} - class PackedConstsState { - static_assert(STCODE_BITS + BUFFER_BITS + LOCAL_STATE_BITS == 32, "We want to use the whole uint32_t for state id."); + static_assert(PACKED_STCODE_BITS + BUFFER_BITS + LOCAL_STATE_BITS == 32, "We want to use the whole uint32_t for state id."); uint32_t stateId; public: - PackedConstsState(int stcode_id, uint32_t buffer_id, uint32_t local_state_id) + PackedConstsState(int 
packed_stcode_id, uint32_t buffer_id, uint32_t local_state_id) { - G_ASSERT(stcode_id < (1 << STCODE_BITS) && buffer_id < (1 << BUFFER_BITS) && local_state_id < (1 << LOCAL_STATE_BITS)); - stateId = (stcode_id < 0) ? local_state_id - : (((((uint32_t)(stcode_id + 1) << BUFFER_BITS) | buffer_id) << LOCAL_STATE_BITS) | local_state_id); + G_ASSERT( + packed_stcode_id < (1 << PACKED_STCODE_BITS) && buffer_id < (1 << BUFFER_BITS) && local_state_id < (1 << LOCAL_STATE_BITS)); + stateId = (packed_stcode_id < 0) + ? local_state_id + : (((((uint32_t)(packed_stcode_id + 1) << BUFFER_BITS) | buffer_id) << LOCAL_STATE_BITS) | local_state_id); } PackedConstsState(uint32_t state) : stateId(state) {} - int getStcodeId() const { return (int)(stateId >> (LOCAL_STATE_BITS + BUFFER_BITS)) - 1; } + int getPackedStcodeId() const { return (int)(stateId >> (LOCAL_STATE_BITS + BUFFER_BITS)) - 1; } uint32_t getGlobalStateId() const { return stateId & ((1 << LOCAL_STATE_BITS) - 1); } @@ -107,7 +102,7 @@ class PackedConstsState uint32_t getBufferId() const { return (stateId >> LOCAL_STATE_BITS) & ((1 << BUFFER_BITS) - 1); } - uint32_t getMaterialId() const { return (stateId >> LOCAL_STATE_BITS) & ((1 << (STCODE_BITS + BUFFER_BITS)) - 1); } + uint32_t getMaterialId() const { return (stateId >> LOCAL_STATE_BITS) & ((1 << (PACKED_STCODE_BITS + BUFFER_BITS)) - 1); } operator uint32_t() const { return stateId; } }; @@ -120,9 +115,8 @@ struct BindlessState void updateTexForPackedMaterial(uint32_t self_idx, int tex_level) { PackedConstsState stateId(self_idx); - const int stcode_id = stateId.getStcodeId(); - G_ASSERT(stcode_id != -1); - const uint32_t packedId = remap_stcode_id(stcode_id); + const int packedId = stateId.getPackedStcodeId(); + G_ASSERT(packedId != -1); Sbuffer *constBuf = packedConstBuf[packedId][stateId.getBufferId()]; if (!constBuf) @@ -178,9 +172,8 @@ struct BindlessState void applyPackedMaterial(uint32_t self_idx) { PackedConstsState stateId(self_idx); - const int stcode_id = 
stateId.getStcodeId(); - G_ASSERT(stcode_id != -1); - const uint32_t packedId = remap_stcode_id(stcode_id); + const int packedId = stateId.getPackedStcodeId(); + G_ASSERT(packedId != -1); Sbuffer *constBuf = packedConstBuf[packedId][stateId.getBufferId()]; if (!constBuf) @@ -205,7 +198,7 @@ struct BindlessState void apply(uint32_t self_idx, int tex_level) { PackedConstsState stateId(self_idx); - const int stcode_id = stateId.getStcodeId(); + const int stcode_id = stateId.getPackedStcodeId(); G_ASSERT(stcode_id == -1); Sbuffer *constBuf = allConstBuf[stateId.getGlobalStateId()]; @@ -310,7 +303,7 @@ struct BindlessState const uint32_t bufferId = consts_count == 0 ? 0 : (localStateId / (MAX_CONST_BUFFER_SIZE / consts_count)); if (consts_count != 0 && stcode_id != -1) bufferNeedsUpdate[packedId].set(bufferId); - return PackedConstsState(stcode_id, bufferId, localStateId); + return PackedConstsState(stcode_id == -1 ? -1 : packedId, bufferId, localStateId); } uint32_t bid = bindlessConstParams.size(); @@ -325,7 +318,7 @@ struct BindlessState uint32_t brange = d3d::allocate_bindless_resource_range(RES3D_TEX, addedBindlessTextures.size()); for (auto &tex : addedBindlessTextures) { - uniqBindlessTex.push_back(BindlessTexRecord{TEXTUREID(tex), brange++, 0}); + uniqBindlessTex.push_back(BindlessTexRecord{TEXTUREID(tex), brange++, 0, 0}); tex = uniqBindlessTex.size() - 1; } @@ -380,12 +373,13 @@ struct BindlessState } else { - if ((dataConstToAdd.size() + MAX_CONST_BUFFER_SIZE - 1) / MAX_CONST_BUFFER_SIZE > packedConstBuf[packedId].size()) + if ((dataConstToAdd.size() + MAX_CONST_BUFFER_SIZE - 1) / MAX_CONST_BUFFER_SIZE > packedConstBuf[packedId].size() || + packedConstBuf[packedId].empty()) packedConstBuf[packedId].push_back(nullptr); bufferId = packedConstBuf[packedId].size() - 1; bufferNeedsUpdate[packedId].set(bufferId); } - return PackedConstsState(stcode_id, bufferId, id); + return PackedConstsState(stcode_id == -1 ? 
-1 : packedId, bufferId, id); } static void clear() { @@ -429,15 +423,14 @@ struct BindlessState static void preparePackedConstBuf(uint32_t idx) { PackedConstsState stateId(idx); - const int stcode_id = stateId.getStcodeId(); - G_ASSERT(stcode_id != -1); - const uint32_t packedId = remap_stcode_id(stcode_id); + const int packedId = stateId.getPackedStcodeId(); + G_ASSERT(packedId != -1); const BindlessState &state = packedAll[packedId][stateId.getGlobalStateId()]; auto &constBuffer = packedConstBuf[packedId][stateId.getBufferId()]; if (!interlocked_acquire_load_ptr(constBuffer) && state.constsCount) { - String s(0, "staticCbuf%d_%d", stcode_id, stateId.getBufferId()); + String s(0, "staticCbuf%d_%d", packedId, stateId.getBufferId()); Sbuffer *constBuf = d3d::buffers::create_persistent_cb(MAX_CONST_BUFFER_SIZE, s.c_str()); if (!constBuf) { @@ -498,9 +491,9 @@ ShaderStateBlock create_bindless_state(const BindlessConstParams *bindless_data, void apply_bindless_state(uint32_t const_state_idx, int tex_level) { const PackedConstsState state(const_state_idx); - const int stcodeId = state.getStcodeId(); - if (stcodeId >= 0) - packedAll[remap_stcode_id(stcodeId)][state.getGlobalStateId()].applyPackedMaterial(const_state_idx); + const int packedStcodeId = state.getPackedStcodeId(); + if (packedStcodeId >= 0) + packedAll[packedStcodeId][state.getGlobalStateId()].applyPackedMaterial(const_state_idx); else all[state.getGlobalStateId()].apply(const_state_idx, tex_level); } @@ -510,8 +503,8 @@ void clear_bindless_states() { BindlessState::clear(); } void req_tex_level_bindless(uint32_t const_state_idx, int tex_level) { const PackedConstsState state(const_state_idx); - const int stcodeId = state.getStcodeId(); - if (stcodeId < 0) + const int packedStcodeId = state.getPackedStcodeId(); + if (packedStcodeId < 0) all[state.getGlobalStateId()].reqTexLevel(tex_level); } @@ -525,20 +518,22 @@ void dump_bindless_states_stat() { debug(" %d bindless states (%d total)", all.s void 
update_bindless_state(uint32_t const_state_idx, int tex_level) { const PackedConstsState state(const_state_idx); - G_ASSERT(state.getStcodeId() != -1); - packedAll[remap_stcode_id(state.getStcodeId())][state.getGlobalStateId()].updateTexForPackedMaterial(const_state_idx, tex_level); + G_ASSERT(state.getPackedStcodeId() != -1); + packedAll[state.getPackedStcodeId()][state.getGlobalStateId()].updateTexForPackedMaterial(const_state_idx, tex_level); } uint32_t PackedConstsState::getLocalStateId() const { - return getGlobalStateId() - - (MAX_CONST_BUFFER_SIZE / packedAll[remap_stcode_id(getStcodeId())][getGlobalStateId()].constsCount) * getBufferId(); + const uint32_t constsCount = packedAll[getPackedStcodeId()][getGlobalStateId()].constsCount; + if (!constsCount) + return 0; + return getGlobalStateId() - (MAX_CONST_BUFFER_SIZE / constsCount) * getBufferId(); } uint32_t get_material_offset(uint32_t const_state_idx) { const PackedConstsState state(const_state_idx); - G_ASSERT(state.getStcodeId() != -1); + G_ASSERT(state.getPackedStcodeId() != -1); return state.getLocalStateId(); } diff --git a/prog/engine/shaders/shUtils.cpp b/prog/engine/shaders/shUtils.cpp index d7c3973b9..2d1343e3e 100644 --- a/prog/engine/shaders/shUtils.cpp +++ b/prog/engine/shaders/shUtils.cpp @@ -106,6 +106,7 @@ const char *shcod_tokname(int t) case SHCOD_LVIEW: return "LVIEW"; case SHCOD_TMWORLD: return "WTM"; case SHCOD_MAKE_VEC: return "MAKE_VEC"; + case SHCOD_SAMPLER: return "SAMPLER"; case SHCOD_TEXTURE: return "TEXTURE"; case SHCOD_TEXTURE_VS: return "TEXTURE_VS"; case SHCOD_VPR_CONST: return "VPR_CONST"; @@ -313,6 +314,20 @@ void shcod_dump(dag::ConstSpan cod, const shaderbindump::VarList *globals, str.aprintf(128, "ind=%d ofs=%d", ind, ofs); } break; + case SHCOD_SAMPLER: + { + int ind = shaderopcode::getOp2p1(cod[i]); + int ofs = shaderopcode::getOp2p2(cod[i]); + if ((uint32_t)ofs < (uint32_t)globals->v.size()) + { + debug_("%sreg=%d var_ofs=%d |", str.str(), ind, ofs); +#if 
DAGOR_DBGLEVEL > 0 + shaderbindump::dumpVar(*globals, ofs); +#endif + continue; + } + } + break; case SHCOD_STATIC_BLOCK: str.aprintf(128, "texCnt=%u vsCnt=%u psCnt=%u texBase=%d vsBase=%d psBase=%d", shaderopcode::getOp3p1(cod[i]), shaderopcode::getOp3p2(cod[i]), shaderopcode::getOp3p3(cod[i]), (cod[i + 1] >> 16) & 0xFF, (cod[i + 1] >> 8) & 0xFF, diff --git a/prog/engine/shaders/shaderMesh.cpp b/prog/engine/shaders/shaderMesh.cpp index 7f32bd112..c36b6f87d 100644 --- a/prog/engine/shaders/shaderMesh.cpp +++ b/prog/engine/shaders/shaderMesh.cpp @@ -43,7 +43,7 @@ void GlobalVertexData::initGvd(const char *name, unsigned vNum, unsigned vStride } // create vertex buffer - unsigned vbFlags = (flags & VDATA_NO_VB) || !(flags & VDATA_D3D_RESET_READY) ? 0 : SBCF_MAYBELOST; + unsigned vbFlags = 0; if (flags & VDATA_BIND_SHADER_RES) vbFlags |= SBCF_BIND_SHADER_RES; if (flags & VDATA_NO_VB) @@ -60,7 +60,7 @@ void GlobalVertexData::initGvd(const char *name, unsigned vNum, unsigned vStride // create index buffer if (idxSize) { - unsigned ibFlags = (flags & VDATA_NO_IB) || !(flags & VDATA_D3D_RESET_READY) ? 0 : SBCF_MAYBELOST; + unsigned ibFlags = 0; if (flags & VDATA_I32) ibFlags |= SBCF_INDEX32; if (flags & VDATA_NO_IB) diff --git a/prog/engine/shaders/shaderResUnitedData.cpp b/prog/engine/shaders/shaderResUnitedData.cpp index 79cc8c8eb..a5661a952 100644 --- a/prog/engine/shaders/shaderResUnitedData.cpp +++ b/prog/engine/shaders/shaderResUnitedData.cpp @@ -451,7 +451,7 @@ bool unitedvdata::BufPool::allocateBuffer(int idx, size_t size, const char *name sbuf[idx], idx, freeChunks[idx].size(), allowRebuild); const bool isIb = idx == IDX_IB; - const int flags = (isIb ? SBCF_BIND_INDEX : SBCF_BIND_VERTEX) | SBCF_MAYBELOST | SBCF_BIND_SHADER_RES | get_optional_buffer_flags(); + const int flags = (isIb ? 
SBCF_BIND_INDEX : SBCF_BIND_VERTEX) | SBCF_BIND_SHADER_RES | get_optional_buffer_flags(); Sbuffer *candidate = d3d::create_sbuffer(4, round_up<4>(size) / 4, flags, 0, name); diff --git a/prog/engine/shaders/shadersBinaryDataDbg.cpp b/prog/engine/shaders/shadersBinaryDataDbg.cpp index 15eef0d9a..ac76fba69 100644 --- a/prog/engine/shaders/shadersBinaryDataDbg.cpp +++ b/prog/engine/shaders/shadersBinaryDataDbg.cpp @@ -158,6 +158,7 @@ void shaderbindump::dumpVar(const shaderbindump::VarList &vars, int i) debug_("int4(%d,%d,%d,%d)\n", i4.x, i4.y, i4.z, i4.w); break; } + case SHVT_SAMPLER: debug_("sampler\n"); break; default: debug_("unknown type: %d\n", vars.getType(i)); } } diff --git a/prog/engine/shaders/shadersBinaryDataVars.cpp b/prog/engine/shaders/shadersBinaryDataVars.cpp index e21daeb81..4ad36137f 100644 --- a/prog/engine/shaders/shadersBinaryDataVars.cpp +++ b/prog/engine/shaders/shadersBinaryDataVars.cpp @@ -532,6 +532,15 @@ bool ShaderGlobal::set_texture(int variable_id, const ManagedTex &texture) return ShaderGlobal::set_texture(variable_id, texture.getTexId()); } +bool ShaderGlobal::set_sampler(int var_id, d3d::SamplerHandle handle) +{ + CHECK_VAR_ID(SHVT_SAMPLER); + + auto &smp = dump.globVars.get(id); + smp = handle; + return true; +} + bool ShaderGlobal::set_buffer(int var_id, D3DRESID buf_id) { #if DAGOR_DBGLEVEL > 0 diff --git a/prog/engine/sharedInclude/shaders/shOpcode.h b/prog/engine/sharedInclude/shaders/shOpcode.h index f167b29f0..42f4c110b 100644 --- a/prog/engine/sharedInclude/shaders/shOpcode.h +++ b/prog/engine/sharedInclude/shaders/shOpcode.h @@ -26,7 +26,7 @@ enum SHCOD_FSH_CONST, // 2p | set PS const[ind] from VEC4 reg | p1=ind p2=reg# SHCOD_TEXTURE, // 2p | set texture | p1=ind p2=reg# SHCOD_G_TM, // 2p_8_16 | set 4xVEC4 const for GM/PM/VPM | p1=type (8 bits) p2=ind - SHCOD_NOP, + SHCOD_SAMPLER, // 2p | set sampler | p1=ind p2=varId SHCOD_MUL_REAL, // 3p | REAL: dest# = left# * right# | p1=dest# p2=left# p3=right# SHCOD_DIV_REAL, // 3p | 
REAL: dest# = left# / right# | p1=dest# p2=left# p3=right# diff --git a/prog/engine/startup/globals3d.cpp b/prog/engine/startup/globals3d.cpp index daa0d2c75..5d2072240 100644 --- a/prog/engine/startup/globals3d.cpp +++ b/prog/engine/startup/globals3d.cpp @@ -2,7 +2,5 @@ bool grs_draw_wire = false; -DagorCurView grs_cur_view; - int dgs_tex_quality = 0; int dgs_tex_anisotropy = 1; diff --git a/prog/engine/startup/initCrt.cpp b/prog/engine/startup/initCrt.cpp index 8a31a8a71..dfde8fbf5 100644 --- a/prog/engine/startup/initCrt.cpp +++ b/prog/engine/startup/initCrt.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +26,7 @@ void default_crt_init_kernel_lib() #if !_TARGET_XBOX init_main_thread_id(); #endif + DaThread::setCurrentThreadName("Main Thread"); } void default_crt_init_core_lib() diff --git a/prog/engine/workCycle/initVideo.cpp b/prog/engine/workCycle/initVideo.cpp index c27aff09c..5d7f2da45 100644 --- a/prog/engine/workCycle/initVideo.cpp +++ b/prog/engine/workCycle/initVideo.cpp @@ -149,7 +149,7 @@ class VideoRestartProc : public SRestartProc workcycle_internal::curFrameActs = 0; } - workcycle_internal::is_window_in_thread = pblk_video->getBool("threaded_window", false); + workcycle_internal::is_window_in_thread = pblk_video->getBool("threadedWindow", false); ::dgs_limit_fps = pblk_gr->getBool("limitfps", false); diff --git a/prog/gameLibs/asyncHTTPClient/curl.cpp b/prog/gameLibs/asyncHTTPClient/curl.cpp index 69eaf6f0b..14af4cc3f 100644 --- a/prog/gameLibs/asyncHTTPClient/curl.cpp +++ b/prog/gameLibs/asyncHTTPClient/curl.cpp @@ -129,11 +129,19 @@ static void logerr_http_in_retail(const eastl::string &url) } static CURL *make_curl_handle(const char *url, const char *user_agent, bool verify_peer, bool verify_host, void *user_data, - const char *chunk_range) + const char *chunk_range, bool need_resp_headers) { + G_UNUSED(need_resp_headers); CURL *curl = curl_easy_init(); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, 
write_callback); - curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, header_callback); + // To consider: always pass header callback for pre-reserve `response` using `Content-Length` header +#ifndef USE_XCURL // For custom decompressor (e.g. brotli) setup + if (need_resp_headers || verbose_debug) +#endif + { + curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, header_callback); + curl_easy_setopt(curl, CURLOPT_HEADERDATA, user_data); + } curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_callback); curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0); curl_easy_setopt(curl, CURLOPT_SHARE, curlsh); @@ -146,7 +154,6 @@ static CURL *make_curl_handle(const char *url, const char *user_agent, bool veri curl_easy_setopt(curl, CURLOPT_URL, url); curl_easy_setopt(curl, CURLOPT_USERAGENT, user_agent); curl_easy_setopt(curl, CURLOPT_WRITEDATA, user_data); - curl_easy_setopt(curl, CURLOPT_HEADERDATA, user_data); curl_easy_setopt(curl, CURLOPT_XFERINFODATA, user_data); curl_easy_setopt(curl, CURLOPT_PRIVATE, user_data); curl_easy_setopt(curl, CURLOPT_VERBOSE, verbose_debug ? 1L : 0L); @@ -192,9 +199,9 @@ class RequestState urlStr = params.url; logerr_http_in_retail(urlStr); curlHandle = make_curl_handle(urlStr.c_str(), params.userAgent ? 
params.userAgent : default_user_agent.c_str(), params.verifyCert, - params.verifyHost, this, params.chunkRange); + params.verifyHost, this, params.chunkRange, params.needResponseHeaders); callback = params.callback; - needHeaders = params.needHeaders; + needResponseHeaders = params.needResponseHeaders; maxDownloadSize = params.maxDownloadSize; sendResponseInMainThreadOnly = params.sendResponseInMainThreadOnly; headersList = nullptr; @@ -322,15 +329,15 @@ class RequestState if (!sendResponseInMainThreadOnly || is_main_thread()) { - callback->onRequestDone(result, httpCode, response, headersMap); + callback->onRequestDone(result, httpCode, response, respHeadersMap); callback->release(); } else { delayed_call([callback = this->callback, result, httpCode = this->httpCode, response = eastl::move(response), - headers = copy_string_map_view(headersMap)] { - StringMap headersView(headers.begin(), headers.end()); - callback->onRequestDone(result, httpCode, response, headersView); + resp_headers = needResponseHeaders ? 
copy_string_map_view(respHeadersMap) : HeadersList{}] { + StringMap respHeaders(resp_headers.begin(), resp_headers.end()); + callback->onRequestDone(result, httpCode, response, respHeaders); callback->release(); }); } @@ -383,12 +390,11 @@ class RequestState void trySetDecompressorByHeader(dag::Span header) { eastl::string_view hv(header.data(), header.size()); - eastl::string_view::size_type pos = hv.find("Content-Encoding:"); - if (pos != eastl::string_view::npos) + if (eastl::string_view::size_type pos = hv.find("Content-Encoding:"); pos != hv.npos) { decompressor.reset(); pos = hv.find("br", pos); - if (pos != eastl::string_view::npos) + if (pos != hv.npos) decompressor.reset(new BrotliStreamDecompress()); } } @@ -400,7 +406,11 @@ class RequestState #if defined(USE_XCURL) trySetDecompressorByHeader(header); #endif - responseHeaders.emplace_back(header.begin(), header.end()); + if (needResponseHeaders) + { + responseHeaders.emplace_back(header.begin(), header.end()); + responseHeaders.back().reserve(sizeof(void *) * 3); // Disable SSO for string_view in `respHeadersMap` + } } void finishResponseHeader() @@ -414,35 +424,26 @@ class RequestState return; } - for (eastl::string const &header : responseHeaders) + for (const auto &header : responseHeaders) { - DEBUG_VERBOSE("finishResponseHeader header: %.*s", header.size(), header.data()); + DEBUG_VERBOSE("finishResponseHeader header: %s", header.c_str()); eastl::string_view hv(header.data(), header.size()); - auto delimPos = hv.find(":"); - if (delimPos != hv.npos) + if (auto delimPos = hv.find(":"); delimPos != hv.npos) { auto key = hv.substr(0, delimPos); auto value = hv.substr(delimPos + 1); - - auto crlfPos = value.find("\r\n"); - if (crlfPos != value.npos) - { + if (auto crlfPos = value.find("\r\n"); crlfPos != value.npos) value.remove_suffix(value.size() - crlfPos); - } - - auto nonSpacePos = value.find_first_not_of(' '); - if (nonSpacePos != value.npos) + if (auto nonSpacePos = value.find_first_not_of(' '); 
nonSpacePos != value.npos) { value.remove_prefix(nonSpacePos); - headersMap[key] = value; + respHeadersMap[key] = value; } } } - if (!needHeaders) - return; - - callback->onHttpHeaderResponse(headersMap); + if (needResponseHeaders) + callback->onHttpHeadersResponse(respHeadersMap); } void onDownloadProgress(size_t dltotal, size_t dlnow) @@ -465,21 +466,23 @@ class RequestState CURL *curlHandle; IAsyncHTTPCallback *callback; size_t maxDownloadSize; - bool needHeaders; - bool sendResponseInMainThreadOnly; curl_slist *headersList; eastl::string urlStr; - eastl::vector reqData; + dag::Vector reqData; + bool needResponseHeaders; + bool sendResponseInMainThreadOnly; bool abortFlag = false; bool shutdownFlag = false; int httpCode = 0; CURLcode curlResult = CURLE_OK; dag::Vector response; - eastl::list responseHeaders; - httprequests::StringMap headersMap; + dag::Vector responseHeaders; // Note: string_view in `respHeadersMap` are pointing within + httprequests::StringMap respHeadersMap; +#ifdef USE_XCURL eastl::unique_ptr decompressor; +#endif }; using RequestStatePtr = eastl::unique_ptr; diff --git a/prog/gameLibs/consoleKeyBindings/consoleKeyBindings.cpp b/prog/gameLibs/consoleKeyBindings/consoleKeyBindings.cpp index 113457550..3b53901d2 100644 --- a/prog/gameLibs/consoleKeyBindings/consoleKeyBindings.cpp +++ b/prog/gameLibs/consoleKeyBindings/consoleKeyBindings.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -12,14 +13,21 @@ OSSpinlock console_keybindings_mutex; Tab console_keybindings; SimpleString keybindings_file_path; -__forceinline bool is_mods_valid(unsigned checked_mods, unsigned mods) + +__forceinline bool check_modifier(unsigned a, unsigned b, unsigned mask) +{ + return ((a & mask) == (b & mask) || ((b & mask) == mask) && (a & mask) || ((a & mask) == mask) && (b & mask)); +} + + +__forceinline bool check_modifiers(unsigned a, unsigned b) { - unsigned u = checked_mods & mods; - return ((mods & 
HumanInput::KeyboardRawState::CTRL_BITS) != 0 ? u & HumanInput::KeyboardRawState::CTRL_BITS : 1) && - ((mods & HumanInput::KeyboardRawState::ALT_BITS) != 0 ? u & HumanInput::KeyboardRawState::ALT_BITS : 1) && - ((mods & HumanInput::KeyboardRawState::SHIFT_BITS) != 0 ? u & HumanInput::KeyboardRawState::SHIFT_BITS : 1); + return check_modifier(a, b, HumanInput::KeyboardRawState::CTRL_BITS) && + check_modifier(a, b, HumanInput::KeyboardRawState::ALT_BITS) && + check_modifier(a, b, HumanInput::KeyboardRawState::SHIFT_BITS); } + // unsafe, because not thread safe __forceinline ConsoleKeybinding &insert_in_best_place_unsafe(unsigned modifiers) { @@ -29,102 +37,162 @@ __forceinline ConsoleKeybinding &insert_in_best_place_unsafe(unsigned modifiers) return console_keybindings.push_back(); } -bool bind(const char *key_comb, const char *command) + +struct ModifierToken +{ + const char *modName; + unsigned modMask; +}; + + +static const ModifierToken modifier_tokens[] = { + {"lctrl", HumanInput::KeyboardRawState::LCTRL_BIT}, + {"rctrl", HumanInput::KeyboardRawState::RCTRL_BIT}, + {"ctrl", HumanInput::KeyboardRawState::CTRL_BITS}, + {"lalt", HumanInput::KeyboardRawState::LALT_BIT}, + {"ralt", HumanInput::KeyboardRawState::RALT_BIT}, + {"alt", HumanInput::KeyboardRawState::ALT_BITS}, + {"lshift", HumanInput::KeyboardRawState::LSHIFT_BIT}, + {"rshift", HumanInput::KeyboardRawState::RSHIFT_BIT}, + {"shift", HumanInput::KeyboardRawState::SHIFT_BITS}, +}; + + +static const char *find_next_delimiter(const char *str) +{ + for (; *str; str++) + if (strchr("+-_ ", *str)) + return str; + return nullptr; +} + + +static const int get_key_code(HumanInput::IGenKeyboard *kbd, const char *key_name) { - G_ASSERT(key_comb); + for (int i = 0; i < kbd->getKeyCount(); i++) + { + String underscoredKeyName(kbd->getKeyName(i)); + for (int j = 0; j < underscoredKeyName.length(); j++) + if (underscoredKeyName[j] == ' ') + underscoredKeyName[j] = '_'; + + underscoredKeyName.toLower(); + if 
(!strcmp(key_name, underscoredKeyName.str())) + { + return i; + } + } + + return -1; +} + + +static bool parse_shortcut(const char *key_comb, const char *error_prefix, unsigned &result_modifiers, int &result_key_code) +{ + result_modifiers = 0; + result_key_code = -1; + String keyComb(key_comb); - if (!global_cls_drv_kbd || !global_cls_drv_kbd->getDeviceCount()) - return false; keyComb.toLower(); HumanInput::IGenKeyboard *kbd = global_cls_drv_kbd->getDevice(0); + if (!kbd) + return false; - unsigned modifiers = 0; - int shift = 0; - if (strstr(keyComb.str(), "lctrl")) - { - modifiers |= HumanInput::KeyboardRawState::LCTRL_BIT; - shift += 6; - } - else if (strstr(keyComb.str(), "rctrl")) - { - modifiers |= HumanInput::KeyboardRawState::RCTRL_BIT; - shift += 6; - } - else if (strstr(keyComb.str(), "ctrl")) - { - modifiers |= HumanInput::KeyboardRawState::CTRL_BITS; - shift += 5; - } - if (strstr(keyComb.str(), "lalt")) - { - modifiers |= HumanInput::KeyboardRawState::LALT_BIT; - shift += 5; - } - else if (strstr(keyComb.str(), "ralt")) - { - modifiers |= HumanInput::KeyboardRawState::RALT_BIT; - shift += 5; - } - else if (strstr(keyComb.str(), "alt")) - { - modifiers |= HumanInput::KeyboardRawState::ALT_BITS; - shift += 4; - } - if (strstr(keyComb.str(), "lshift")) - { - modifiers |= HumanInput::KeyboardRawState::LSHIFT_BIT; - shift += 7; - } - else if (strstr(keyComb.str(), "rshift")) - { - modifiers |= HumanInput::KeyboardRawState::RSHIFT_BIT; - shift += 7; - } - else if (strstr(keyComb.str(), "shift")) - { - modifiers |= HumanInput::KeyboardRawState::SHIFT_BITS; - shift += 6; - } + // just check if we have return key, if not, then we don't have a keyboard, or this is null device driver + if (kbd->getKeyName(HumanInput::DKEY_RETURN) == nullptr) + return false; - const char *keyName = keyComb.str(); - if (modifiers) + const char *cur = keyComb.str(); + const char *next = find_next_delimiter(cur); + while (next) { - for (int i = shift - 1, len = keyComb.length(); i < 
len; ++i) - if (strchr("+-_ ", keyComb[i])) + if (next == cur) + break; + + bool found = false; + for (const ModifierToken &token : modifier_tokens) + if (!strncmp(cur, token.modName, next - cur) && strlen(token.modName) == next - cur) { - keyName += (i + 1); + if (result_modifiers & token.modMask) + { + console::error("%s %s - duplicate modifier '%s'", error_prefix, key_comb, token.modName); + return false; + } + result_modifiers |= token.modMask; + found = true; break; } + + if (!found) + { + // if we didn't find a modifier, maybe we hit some strange key which we wanted to treat as a modifier (e.g. num_1) + result_key_code = get_key_code(kbd, cur); + if (result_key_code != -1) + break; + + console::error("%s %s - invalid modifier '%.*s'", error_prefix, key_comb, next - cur, cur); + return false; + } + + cur = next + 1; + next = find_next_delimiter(cur); } + + + const char *keyName = cur; if (!keyName || !strlen(keyName)) + { + console::error("%s %s - key name is empty", error_prefix, key_comb); return false; + } - int keyCode = -1; - for (int i = 0; i < kbd->getKeyCount(); i++) + if (result_key_code == -1) // we might have found the key while checking for modifiers, so let's not do the search twice + result_key_code = get_key_code(kbd, keyName); + + if (result_key_code == -1) { - String underscoredKeyName(kbd->getKeyName(i)); - for (int j = 0; j < underscoredKeyName.length(); j++) - if (underscoredKeyName[j] == ' ') - underscoredKeyName[j] = '_'; + console::error("%s %s - invalid key name", error_prefix, key_comb); + return false; + } - underscoredKeyName.toLower(); - if (!strcmp(keyName, underscoredKeyName.str())) - { - keyCode = i; - break; - } + return true; +} + + +bool bind(const char *key_comb, const char *command) +{ + if (!global_cls_drv_kbd || !global_cls_drv_kbd->getDeviceCount()) + return false; + + if (!key_comb || !command || !*key_comb || !*command) + { + console::error("console.bind - invalid arguments, shortcut or command is empty"); + return 
false; } + + int keyCode = -1; + unsigned modifiers = 0; + if (!parse_shortcut(key_comb, "console.bind", modifiers, keyCode)) + return false; + + String keyComb(key_comb); + keyComb.toLower(); + { OSSpinlockScopedLock lock(&console_keybindings_mutex); int index = -1; for (int i = console_keybindings.size() - 1; i >= 0; i--) - if (console_keybindings[i].keyCode == keyCode && is_mods_valid(modifiers, console_keybindings[i].modifiers)) + if (console_keybindings[i].keyCode == keyCode && check_modifiers(modifiers, console_keybindings[i].modifiers)) { index = i; break; } + if (index != -1 && console_keybindings[index].command != command) + console::warning("console.bind %s - shortcut already bound to '%s' and will be replaced with '%s'", key_comb, + console_keybindings[index].command.str(), command); + ConsoleKeybinding &binding = (index == -1) ? insert_in_best_place_unsafe(modifiers) : console_keybindings[index]; binding.command = command; binding.keyCode = keyCode; @@ -134,24 +202,63 @@ bool bind(const char *key_comb, const char *command) return true; } + +bool unbind(const char *key_comb) +{ + if (!global_cls_drv_kbd || !global_cls_drv_kbd->getDeviceCount()) + return false; + + if (!key_comb || !*key_comb) + { + console::error("console.unbind - invalid arguments, shortcut is empty"); + return false; + } + + int keyCode = -1; + unsigned modifiers = 0; + if (!parse_shortcut(key_comb, "console.unbind", modifiers, keyCode)) + return false; + + { + bool found = false; + OSSpinlockScopedLock lock(&console_keybindings_mutex); + for (int i = console_keybindings.size() - 1; i >= 0; i--) + if (console_keybindings[i].keyCode == keyCode && check_modifiers(modifiers, console_keybindings[i].modifiers)) + { + console_keybindings.erase(console_keybindings.begin() + i); + found = true; + } + + if (found) + return true; + } + + console::warning("console.unbind %s - shotrcut not bound", key_comb); + return false; +} + + const char *get_command_by_key_code(int key_code, unsigned 
modifiers) { OSSpinlockScopedLock lock(&console_keybindings_mutex); for (const auto &keybinding : console_keybindings) - if (keybinding.keyCode == key_code && is_mods_valid(modifiers, keybinding.modifiers)) + if (keybinding.keyCode == key_code && check_modifiers(modifiers, keybinding.modifiers)) return keybinding.command.str(); return nullptr; } + void clear() { OSSpinlockScopedLock lock(&console_keybindings_mutex); clear_and_shrink(console_keybindings); } + void set_binds_file_path(const String &path) { keybindings_file_path = path; } + void load_binds_from_file() { DataBlock consoleBinds; @@ -165,11 +272,12 @@ void load_binds_from_file() const char *command = bind->getStr("command", nullptr); if (key && command) if (!console_keybindings::bind(key, command)) - logerr("Binding <%s> error. Check key name.", key); + console::error("Failed to load console binding '%s' from file '%s'", key, keybindings_file_path.str()); } } } + void save_binds_to_file() { OSSpinlockScopedLock lock(&console_keybindings_mutex); @@ -184,19 +292,21 @@ void save_binds_to_file() } } // namespace console_keybindings + static bool consoleKeybindings_console_handler(const char *argv[], int argc) { int found = 0; - found = console::collector_cmp(argv[0], argc, "consoleKeybindings.bind", 3, 3, "", " \"\""); + found = console::collector_cmp(argv[0], argc, "consoleKeybindings.bind", 3, 3, "", " \"\""); if (found == 0) - found = console::collector_cmp(argv[0], argc, "console.bind", 3, 3, "", " \"\""); + found = console::collector_cmp(argv[0], argc, "console.bind", 3, 3, "", " \"\""); if (found != 0) { if (found == -1) { - console::print("Usage: %s \"\" to call a console command on key press", argv[0]); - console::print("Usage: use '+' to add moddifiers like 'rshift+lctrl+k' for the key. 
Possible moddifiers: ctrl, shift, alt (and " - "l, r versions)"); + console::print("Usage: %s \"\" to call a console command on key/shortcut press", argv[0]); + console::print( + "Usage: use '+' to add modifiers like 'rshift+lctrl+k' for the shortcut. Possible moddifiers: ctrl, shift, alt (and " + "l, r versions)"); return true; } @@ -206,9 +316,7 @@ static bool consoleKeybindings_console_handler(const char *argv[], int argc) commandLen > 2 && command[0] == '"' && command[commandLen - 1] == '"' ? String(&command[1], commandLen - 2) : String(command); if (console_keybindings::bind(argv[1], clearedCommand.c_str())) - console::print_d("Key '%s' bound for '%s'", argv[1], clearedCommand.c_str()); - else - logerr("Binding <%s> error. Check key name.", argv[1]); + console::print_d("Shortcut '%s' bound for '%s'", argv[1], clearedCommand.c_str()); return true; } @@ -217,11 +325,17 @@ static bool consoleKeybindings_console_handler(const char *argv[], int argc) console_keybindings::save_binds_to_file(); return true; } + CONSOLE_CHECK_NAME_EX("console", "unbind", 2, 2, "Remove binding", "") + { + if (console_keybindings::unbind(argv[1])) + console::print_d("Shortcut '%s' unbound", argv[1]); + return true; + } CONSOLE_CHECK_NAME_EX("console", "binds_list", 1, 1, "List all current binds", "") { OSSpinlockScopedLock lock(&console_keybindings::console_keybindings_mutex); for (const auto &bind : console_keybindings::console_keybindings) - console::print_d("Key '%s' bound for '%s'", bind.keyCombo, bind.command); + console::print_d("Shortcut '%s' bound for '%s'", bind.keyCombo, bind.command); return true; } diff --git a/prog/gameLibs/daEditorE/daEditorE.cpp b/prog/gameLibs/daEditorE/daEditorE.cpp index 9464802b3..00ab7b268 100644 --- a/prog/gameLibs/daEditorE/daEditorE.cpp +++ b/prog/gameLibs/daEditorE/daEditorE.cpp @@ -13,6 +13,7 @@ #include #include #include <3d/dag_render.h> +#include #include #include #include diff --git a/prog/gameLibs/daFx/buffers.cpp 
b/prog/gameLibs/daFx/buffers.cpp index 0ec1d64ff..523f53665 100644 --- a/prog/gameLibs/daFx/buffers.cpp +++ b/prog/gameLibs/daFx/buffers.cpp @@ -208,7 +208,7 @@ unsigned char *start_updating_render_buffer(Context &ctx, int tag) if (recreate) { - uint flags = SBCF_DYNAMIC | SBCF_MAYBELOST | SBCF_CPU_ACCESS_WRITE | SBCF_BIND_SHADER_RES; + uint flags = SBCF_DYNAMIC | SBCF_CPU_ACCESS_WRITE | SBCF_BIND_SHADER_RES; uint fmt = TEXFMT_R32UI; if (ctx.cfg.use_render_sbuffer) @@ -257,7 +257,7 @@ unsigned char *start_updating_staging(Context &ctx, int size) ctx.staging.ring.reset(ctx.staging.size); bool r = create_gpu_res(ctx.staging.buffer, DAFX_ELEM_STRIDE, ctx.staging.size / DAFX_ELEM_STRIDE, - SBCF_DYNAMIC | SBCF_BIND_SHADER_RES | SBCF_CPU_ACCESS_WRITE | SBCF_MAYBELOST, 0, "dafx_staging"); + SBCF_DYNAMIC | SBCF_BIND_SHADER_RES | SBCF_CPU_ACCESS_WRITE, 0, "dafx_staging"); debug("dafx staging reset: %d | %d", ctx.staging.size, r); if (!r) diff --git a/prog/gameLibs/daFx/context.h b/prog/gameLibs/daFx/context.h index 322b00124..98fee12f8 100644 --- a/prog/gameLibs/daFx/context.h +++ b/prog/gameLibs/daFx/context.h @@ -148,4 +148,9 @@ extern GenerationReferencedData g_ctx_list; Context *ctxp = g_ctx_list.get(cid); \ G_ASSERTF_RETURN(ctxp, v, "dafx: GET_CTX_RET failed, rid: %d", (uint32_t)cid); \ Context &ctx = *ctxp; -} // namespace dafx \ No newline at end of file +} // namespace dafx + +DAG_DECLARE_RELOCATABLE(dafx::AsyncPrepareJob); +DAG_DECLARE_RELOCATABLE(dafx::AsyncCpuComputeJob); +DAG_DECLARE_RELOCATABLE(dafx::AsyncStartNextComputeBatchJob); +DAG_DECLARE_RELOCATABLE(dafx::AsyncCpuCullJob); diff --git a/prog/gameLibs/daGI/gi3d.cpp b/prog/gameLibs/daGI/gi3d.cpp index 80c65fece..e46361d96 100644 --- a/prog/gameLibs/daGI/gi3d.cpp +++ b/prog/gameLibs/daGI/gi3d.cpp @@ -114,6 +114,22 @@ bool GI3D::ensureSampledTarget(int w, int h, uint32_t fmt) return true; } +static int find_max_msaa_format() +{ + using namespace eastl; + int maxMsaaFormats[] = { + TEXFMT_R8, // sort by bit 
count + TEXFMT_L16, + TEXFMT_R8G8, + TEXFMT_R8G8B8A8, + }; + int maxMsaaSamples[size(maxMsaaFormats)] = {}; + transform(begin(maxMsaaFormats), end(maxMsaaFormats), begin(maxMsaaSamples), + [](auto fmt) { return d3d::get_max_sample_count(fmt); }); + auto maxIdx = distance(begin(maxMsaaSamples), max_element(begin(maxMsaaSamples), end(maxMsaaSamples))); + return make_sample_count_flag(maxMsaaSamples[maxIdx]) | maxMsaaFormats[maxIdx]; +} + ConVarT gi_use_forced_sample_count("render.gi_use_forced_sample_count", 1, 0, 2, "0 - OFF, 1 - ON, 2 - UAV only"); bool GI3D::setSubsampledTargetAndOverride(int w, int h) { @@ -121,7 +137,7 @@ bool GI3D::setSubsampledTargetAndOverride(int w, int h) { BaseTexture *backBuf = d3d::get_backbuffer_tex(); TextureInfo tinfo; - if (!backBuf->getinfo(tinfo) || (tinfo.cflg & TEXCF_MULTISAMPLED) || tinfo.w < w || tinfo.h < h) + if (!backBuf->getinfo(tinfo) || (tinfo.cflg & TEXCF_SAMPLECOUNT_MASK) || tinfo.w < w || tinfo.h < h) { if (!ensureSampledTarget(w, h, TEXFMT_R8)) return false; @@ -135,7 +151,7 @@ bool GI3D::setSubsampledTargetAndOverride(int w, int h) } else { - if (!ensureSampledTarget(w, h, TEXFMT_MSAA_MAX_SAMPLES)) + if (!ensureSampledTarget(w, h, find_max_msaa_format())) { //? 
w *= 2; h *= 2;//supersampling should be better, but probably videocard is really slow if (!ensureSampledTarget(w, h, TEXFMT_R8)) @@ -331,8 +347,8 @@ void GI3D::VolmapCommonData::initCommon() { typedef Point3_vec4 float3; - poissonBuf = dag::create_sbuffer(16, (sizeof(POISSON_SAMPLES) + 15) / 16, - SBCF_MAYBELOST | SBCF_CPU_ACCESS_WRITE | SBCF_BIND_SHADER_RES, TEXFMT_A32B32G32R32F, "poissonSamples"); + poissonBuf = dag::create_sbuffer(16, (sizeof(POISSON_SAMPLES) + 15) / 16, SBCF_CPU_ACCESS_WRITE | SBCF_BIND_SHADER_RES, + TEXFMT_A32B32G32R32F, "poissonSamples"); poissonBuf->updateDataWithLock(0, sizeof(POISSON_SAMPLES), POISSON_SAMPLES, 0); } ssgi_clear_volmap_cs.reset(new_compute_shader("ssgi_clear_volmap_cs")); diff --git a/prog/gameLibs/daGI/shaders/dagi_debug_scene.sh b/prog/gameLibs/daGI/shaders/dagi_debug_scene.sh index 6daa7e4cb..5f4a4a934 100644 --- a/prog/gameLibs/daGI/shaders/dagi_debug_scene.sh +++ b/prog/gameLibs/daGI/shaders/dagi_debug_scene.sh @@ -179,7 +179,7 @@ shader ssgi_debug_rasterize_voxels float t = 0; pt = 0; - int cnt = n; + dist = MAX_REAL; voxel_normal = 0; result = half4(0,0,0,1); diff --git a/prog/gameLibs/daGI/shaders/dagi_volmap_culling.sh b/prog/gameLibs/daGI/shaders/dagi_volmap_culling.sh index e75ec641a..16f8e3b97 100644 --- a/prog/gameLibs/daGI/shaders/dagi_volmap_culling.sh +++ b/prog/gameLibs/daGI/shaders/dagi_volmap_culling.sh @@ -86,10 +86,10 @@ shader cull_ambient_voxels_cs, cull_ambient_voxels_cs_warp_64 //check occlusion (voxel totally occluded) bool intersects = false; - float2 minMaxBoxRawDepth; float level; + float2 minMaxBoxRawDepth; float4 sbox; BRANCH - if (!check_box_occl_visible_base(centerb - extent, centerb + extent, sbox, level, minMaxBoxRawDepth)) + if (!check_box_occl_visible_base(centerb - extent, centerb + extent, sbox, minMaxBoxRawDepth)) return; /*##if downsampled_close_depth_tex != NULL //using downsampled_close_depth_tex is also an option, but provides more false positive (intersected) diff --git 
a/prog/gameLibs/daPhys/particlePhys.cpp b/prog/gameLibs/daPhys/particlePhys.cpp index 16ec5596b..9f5e1ca26 100644 --- a/prog/gameLibs/daPhys/particlePhys.cpp +++ b/prog/gameLibs/daPhys/particlePhys.cpp @@ -108,7 +108,7 @@ struct EdgeConstraint : public Constraint { if (auto nodeB = ptB->gnNodeId) { - tree->setNodeWtmRelScalar(nodeB, render_space_tm * (ptB->tm * ptB->helperTm)); + tree->setNodeWtmScalar(nodeB, render_space_tm * (ptB->tm * ptB->helperTm)); tree->markNodeTmInvalid(nodeB); } } @@ -318,7 +318,7 @@ struct ProjectionConstraint : public SliderConstraint Quat rot = quat_rotation_arc(posInitial, posNow); TMatrix rotTm = makeTM(rot); TMatrix finalTm = render_space_tm * ((pt->tm * rotTm) * pt->helperTm); - tree->setNodeWtmRelScalar(node, finalTm); + tree->setNodeWtmScalar(node, finalTm); tree->markNodeTmInvalid(node); } } @@ -381,7 +381,7 @@ struct RevoluteConstraint : public Constraint TMatrix tm = makeBase(edge, axis) % inv_base_tm % pt_a->tm; tm.setcol(3, pt_a->tm.getcol(3)); TMatrix finalTm = render_space_tm * (tm * pt_a->helperTm); - tree->setNodeWtmRelScalar(nodeA, finalTm); + tree->setNodeWtmScalar(nodeA, finalTm); tree->markNodeTmInvalid(nodeA); } } diff --git a/prog/gameLibs/daRg/canvasDraw.cpp b/prog/gameLibs/daRg/canvasDraw.cpp index 8bee4657c..52a55a9d7 100644 --- a/prog/gameLibs/daRg/canvasDraw.cpp +++ b/prog/gameLibs/daRg/canvasDraw.cpp @@ -248,28 +248,101 @@ void RenderCanvasContext::renderRectangle(const Sqrat::Array &cmd) const void RenderCanvasContext::renderQuads(const Sqrat::Array &cmd) const { - // VECTOR_QUADS, (x, y, color) * 4 times, ... 
- bool isValidParams = cmd.Length() > 1 && (cmd.Length() - 1) % 12 == 0; - - if (!isValidParams) + if (cmd.Length() == 3) { - darg_assert_trace_var("invalid number of parameters for VECTOR_QUADS", cmd, 0); - return; - } + // VECTOR_QUADS, [vertices], [quad indices] + + if (cmd[1].GetType() != OT_ARRAY || cmd[2].GetType() != OT_ARRAY) + { + darg_assert_trace_var("invalid parameters for VECTOR_QUADS, expected array of points and array of indices", cmd, 0); + return; + } + + const Sqrat::Array &sqPoints = cmd[1]; + const Sqrat::Array &sqIndices = cmd[2]; + + if (sqPoints.Length() % 3 != 0) + { + darg_assert_trace_var("invalid size of points array for VECTOR_QUADS", cmd[1], 0); + return; + } + + if (sqIndices.Length() % 4 != 0) + { + darg_assert_trace_var("invalid size of indices array for VECTOR_QUADS", cmd[2], 0); + return; + } + + if (!fillColor) + return; + + Tab points(framemem_ptr()); + Tab colors(framemem_ptr()); + points.reserve(sqPoints.Length() / 3); + colors.reserve(sqPoints.Length() / 3); - Point2 p[4]; - E3DCOLOR colors[4]; + for (int i = 0, len = sqPoints.Length(); i < len; i += 3) + points.push_back(offset + Point2(sqPoints[i].Cast() * scale.x, sqPoints[i + 1].Cast() * scale.y)); - for (int i = 1; i < cmd.Length(); i += 12) + if (fillColor == 0xFFFFFFFFu) + { + for (int i = 0, len = sqPoints.Length(); i < len; i += 3) + colors.push_back(script_decode_e3dcolor(sqPoints[i + 2].Cast())); + } + else + { + for (int i = 0, len = sqPoints.Length(); i < len; i += 3) + colors.push_back(e3dcolor_mul(fillColor, script_decode_e3dcolor(sqPoints[i + 2].Cast()))); + } + + + for (int i = 0, len = sqIndices.Length(); i < len; i += 4) + { + int idx[4]; + + for (int k = 0; k < 4; k++) + { + idx[k] = sqIndices[i + k].Cast(); + if (idx[k] < 0 || idx[k] >= points.size()) + { + debug("ERROR: VECTOR_QUADS: point index = %d, points.size() = %d", idx[k], int(points.size())); + darg_assert_trace_var("VECTOR_QUADS: point index is out of range", sqIndices, 0); + return; + } + } + + 
ctx->render_quad_color(points[idx[0]], points[idx[1]], points[idx[2]], points[idx[3]], Point2(0, 0), Point2(0, 0), Point2(0, 0), + Point2(0, 0), colors[idx[0]], colors[idx[1]], colors[idx[2]], colors[idx[3]]); + } + } + else { - for (int k = 0, index = 0; k < 12; k += 3, index++) // 3 numbers per vertex - x, y, color + // VECTOR_QUADS, (x, y, color) * 4 times, ... + bool isValidParams = cmd.Length() > 1 && (cmd.Length() - 1) % 12 == 0; + + if (!isValidParams) { - p[index] = offset + Point2(cmd[i + k].Cast() * scale.x, cmd[i + k + 1].Cast() * scale.y); - colors[index] = e3dcolor_mul(fillColor, script_decode_e3dcolor(cmd[i + k + 2].Cast())); + darg_assert_trace_var("invalid number of parameters for VECTOR_QUADS", cmd, 0); + return; } - ctx->render_quad_color(p[0], p[1], p[2], p[3], Point2(0, 0), Point2(0, 0), Point2(0, 0), Point2(0, 0), colors[0], colors[1], - colors[2], colors[3]); + if (!fillColor) + return; + + Point2 p[4]; + E3DCOLOR colors[4]; + + for (int i = 1, len = cmd.Length(); i < len; i += 12) + { + for (int k = 0, index = 0; k < 12; k += 3, index++) // 3 numbers per vertex - x, y, color + { + p[index] = offset + Point2(cmd[i + k].Cast() * scale.x, cmd[i + k + 1].Cast() * scale.y); + colors[index] = e3dcolor_mul(fillColor, script_decode_e3dcolor(cmd[i + k + 2].Cast())); + } + + ctx->render_quad_color(p[0], p[1], p[2], p[3], Point2(0, 0), Point2(0, 0), Point2(0, 0), Point2(0, 0), colors[0], colors[1], + colors[2], colors[3]); + } } } diff --git a/prog/gameLibs/daRg/panelRenderer.cpp b/prog/gameLibs/daRg/panelRenderer.cpp index e69174f25..2e5ee379e 100644 --- a/prog/gameLibs/daRg/panelRenderer.cpp +++ b/prog/gameLibs/daRg/panelRenderer.cpp @@ -91,7 +91,7 @@ static const PanelVertex rectVertices[4] = { template static bool buildBuffer(Func createFunc, UniqueBuf &buffer, const T (&data)[count], const char *name) { - buffer = createFunc(sizeof(data), SBCF_CPU_ACCESS_WRITE | SBCF_MAYBELOST, name); + buffer = createFunc(sizeof(data), SBCF_CPU_ACCESS_WRITE, 
name); G_ASSERT_RETURN(buffer.getBuf(), false); G_ASSERT_RETURN(buffer.getBuf()->updateData(0, sizeof(data), data, VBLOCK_WRITEONLY), false); return true; diff --git a/prog/gameLibs/daRg/stdRendObj.cpp b/prog/gameLibs/daRg/stdRendObj.cpp index d9050bf8e..4c4df7293 100644 --- a/prog/gameLibs/daRg/stdRendObj.cpp +++ b/prog/gameLibs/daRg/stdRendObj.cpp @@ -104,6 +104,27 @@ void RenderObjectSolid::renderCustom(StdGuiRender::GuiContext &ctx, const Elemen } +void RenderObjectDebug::renderCustom(StdGuiRender::GuiContext &ctx, const Element *, const ElemRenderData *rdata, + const RenderState &render_state) +{ + RobjParamsColorOnly *params = static_cast(rdata->params); + G_ASSERT(params); + if (!params) + return; + + E3DCOLOR color = color_apply_mods(params->color, render_state.opacity, params->brightness); + + ctx.set_color(color); + ctx.set_texture(BAD_TEXTUREID); + + Point2 lt = rdata->pos; + Point2 rb = lt + rdata->size; + ctx.render_frame(lt.x, lt.y, rb.x, rb.y, 1); + ctx.draw_line(lt.x, lt.y, rb.x, rb.y); + ctx.draw_line(lt.x, rb.y, rb.x, lt.y); +} + + bool RobjParamsText::load(const Element *elem) { const Properties &props = elem->props; @@ -2057,6 +2078,7 @@ void RenderObjectMovie::renderCustom(StdGuiRender::GuiContext &ctx, const Elemen ROBJ_FACTORY_IMPL(RenderObjectSolid, RobjParamsColorOnly) +ROBJ_FACTORY_IMPL(RenderObjectDebug, RobjParamsColorOnly) ROBJ_FACTORY_IMPL(RenderObjectText, RobjParamsText) ROBJ_FACTORY_IMPL(RenderObjectInscription, RobjParamsInscription) ROBJ_FACTORY_IMPL(RenderObjectImage, RobjParamsImage) @@ -2105,6 +2127,7 @@ void register_std_rendobj_factories() #define RF(name, cls) add_rendobj_factory(#name, ROBJ_FACTORY_PTR(cls)) RF(ROBJ_SOLID, RenderObjectSolid); + RF(ROBJ_DEBUG, RenderObjectDebug); rendobj_text_id = RF(ROBJ_TEXT, RenderObjectText); rendobj_inscription_id = RF(ROBJ_INSCRIPTION, RenderObjectInscription); rendobj_image_id = RF(ROBJ_IMAGE, RenderObjectImage); diff --git a/prog/gameLibs/daRg/stdRendObj.h 
b/prog/gameLibs/daRg/stdRendObj.h index 5b15135df..903fc17c5 100644 --- a/prog/gameLibs/daRg/stdRendObj.h +++ b/prog/gameLibs/daRg/stdRendObj.h @@ -17,6 +17,12 @@ class RenderObjectSolid : public RenderObject const RenderState &render_state); }; +class RenderObjectDebug : public RenderObject +{ + virtual void renderCustom(StdGuiRender::GuiContext &ctx, const Element *elem, const ElemRenderData *, + const RenderState &render_state); +}; + struct GuiTextCache { SmallTab v; diff --git a/prog/gameLibs/daSkies2/clouds2.h b/prog/gameLibs/daSkies2/clouds2.h index 40a499566..21bee85b0 100644 --- a/prog/gameLibs/daSkies2/clouds2.h +++ b/prog/gameLibs/daSkies2/clouds2.h @@ -1842,7 +1842,7 @@ struct Clouds2 setCloudRenderingVars(); } - void init(bool useHole = true) + void init(bool use_hole = true) { #define VAR(a, opt) a##VarId = ::get_shader_variable_id(#a, opt); CLOUDS_VARS_LIST @@ -1856,7 +1856,7 @@ struct Clouds2 light.init(); cloudsForm.init(); calcCloudsAlt(); - if (useHole) + if (use_hole) initHole(); invalidateWeather(); @@ -1907,6 +1907,9 @@ struct Clouds2 void processHole() { + if (!useHole) + return; + if (needHoleCPUUpdate == HOLE_UPDATED) return; @@ -2050,7 +2053,7 @@ struct Clouds2 bool findHole(const Point3 &main_light_dir) { - if (!((holeBuf || holeTex) && cloudShadows.cloudsShadowsVol)) + if (!useHole || (!((holeBuf || holeTex) && cloudShadows.cloudsShadowsVol))) return false; ShaderGlobal::set_real(clouds_hole_densityVarId, holeDensity); @@ -2094,6 +2097,7 @@ struct Clouds2 } return true; } + void setUseHole(bool set) { useHole = set; } void resetHole(const Point3 &hole_target, const float &hole_density) { holeTarget = hole_target; @@ -2168,6 +2172,7 @@ struct Clouds2 Point3 holeTarget = {0, 0, 0}; float holeDensity = 0; bool holeFound = true; + bool useHole = true; UniqueBufHolder holeBuf; UniqueTexHolder holeTex; UniqueTexHolder holePosTex; diff --git a/prog/gameLibs/daSkies2/daSkies.cpp b/prog/gameLibs/daSkies2/daSkies.cpp index ed9d310fe..8c4e4c53e 
100644 --- a/prog/gameLibs/daSkies2/daSkies.cpp +++ b/prog/gameLibs/daSkies2/daSkies.cpp @@ -167,7 +167,7 @@ bool DaSkies::currentGroundSunSkyColor(float &sun_cos, float &moon_cos, Color3 & static inline void set_resolution_var(int, int) {} -bool DaSkies::isPrepareRequired() const { return clouds->isPrepareRequired(); } +bool DaSkies::isPrepareRequired() const { return clouds && clouds->isPrepareRequired(); } void DaSkies::prepare(const Point3 &dir_to_sun, bool force_update, float dt) { @@ -879,6 +879,12 @@ void DaSkies::resetCloudsHole() return clouds->resetHole(); } +void DaSkies::setUseCloudsHole(bool set) +{ + if (clouds) + clouds->setUseHole(set); +} + Point2 DaSkies::getCloudsHolePosition() const { if (!clouds) diff --git a/prog/gameLibs/daSkies2/daStars.cpp b/prog/gameLibs/daSkies2/daStars.cpp index 13c1550cb..dbc31948c 100644 --- a/prog/gameLibs/daSkies2/daStars.cpp +++ b/prog/gameLibs/daSkies2/daStars.cpp @@ -75,10 +75,10 @@ void DaStars::init(const char *stars, const char *moon) starsRendElem.numVert = star_catalog::g_star_count * 4; starsRendElem.startIndex = 0; starsRendElem.numPrim = star_catalog::g_star_count * 2; - starsVb = dag::create_vb(starsRendElem.numVert * starsRendElem.stride, SBCF_MAYBELOST, "starsVb"); + starsVb = dag::create_vb(starsRendElem.numVert * starsRendElem.stride, 0, "starsVb"); G_ASSERT(starsVb); - starsIb = dag::create_ib(starsRendElem.numPrim * 3 * sizeof(uint16_t), SBCF_MAYBELOST, + starsIb = dag::create_ib(starsRendElem.numPrim * 3 * sizeof(uint16_t), 0, "starsIb"); // To be filled on scene change. 
G_ASSERT(starsIb); diff --git a/prog/gameLibs/daSkies2/shaders/clouds2/daCloudsApply.sh b/prog/gameLibs/daSkies2/shaders/clouds2/daCloudsApply.sh index f3b4a27d2..be07a9b3f 100644 --- a/prog/gameLibs/daSkies2/shaders/clouds2/daCloudsApply.sh +++ b/prog/gameLibs/daSkies2/shaders/clouds2/daCloudsApply.sh @@ -197,7 +197,9 @@ shader clouds2_apply, clouds2_apply_has_empty, clouds2_apply_no_empty half4 apply_clouds_ps_main(VsOutput input, float4 screenpos, out float raw_depth) { - float2 depth_texcoord = input.tc; + float2 linearTc = getLinearTc(screenpos.xy, input.tc); + float2 depth_texcoord = linearTc; + #ifdef FSR_DISTORTION raw_depth = texelFetchOffset(fullres_depth_gbuf, screenpos.xy, 0, 0).x; #else @@ -205,16 +207,17 @@ shader clouds2_apply, clouds2_apply_has_empty, clouds2_apply_no_empty #endif // On DX10 we cannot use depth as a target and as a shader resouce at the same time. Do the depth test in the shader - ##if !hardware.fsh_5_0 - if (screenpos.z < raw_depth) - return 0.0f; - ##endif + ##if !hardware.fsh_5_0 + if (screenpos.z < raw_depth) + return 0.0f; + ##endif float2 texcoord = input.tc; -##if use_bounding_vr_reprojection == on - texcoord = vr_bounding_view_reproject_tc(texcoord,0); - depth_texcoord = texcoord; -##endif + ##if use_bounding_vr_reprojection == on + texcoord = vr_bounding_view_reproject_tc(texcoord,0); + depth_texcoord = texcoord; + ##endif + #if SIMPLE_APPLY half4 distPlane = 0; if (HAS_EMPTY_TILES==0 || !tile_is_empty(uint2(texcoord.xy*clouds2_far_res))) @@ -230,15 +233,17 @@ shader clouds2_apply, clouds2_apply_has_empty, clouds2_apply_no_empty //todo: check tile and exit immediately if close_layer_should_early_exit(), otherwise just apply close layer. 
float3 view = normalize(input.viewVect); + #ifndef CHECK_DIST_TO_CLOUDS - #define CHECK_DIST_TO_CLOUDS 0 + #define CHECK_DIST_TO_CLOUDS 0 #endif + #if CHECK_DIST_TO_CLOUDS - //can happen only when we are above/below clouds layer - // tht is so rare, that doesn't make sense to optimize - float distToClouds = 0; - float dist1; distance_to_clouds(-view, distToClouds, dist1); - distToClouds *= 1000; + //can happen only when we are above/below clouds layer + // tht is so rare, that doesn't make sense to optimize + float distToClouds = 0; + float dist1; distance_to_clouds(-view, distToClouds, dist1); + distToClouds *= 1000; #endif float linearDepth = linearize_z(raw_depth, zn_zfar.zw); @@ -257,9 +262,8 @@ shader clouds2_apply, clouds2_apply_has_empty, clouds2_apply_no_empty } else #endif { - float3 viewVec = getViewVecOptimized(input.tc); if (linearDist > closeSequenceEndDist && (HAS_EMPTY_TILES==0 || !tile_is_empty(uint2(texcoord.xy*clouds2_far_res)))) - distPlane = bilateral_get(viewVec, texcoord, depth_texcoord, linearDepth, raw_depth); + distPlane = bilateral_get(input.viewVect, texcoord, depth_texcoord, linearDepth, raw_depth); } return half4(TAA_BRIGHTNESS_SCALE*(distPlane.rgb*(1-closePlane.a) + closePlane.rgb), 1-(1-closePlane.a)*(1-distPlane.a)); #endif diff --git a/prog/gameLibs/daSkies2/shaders/clouds2/daCloudsTonemap.hlsl b/prog/gameLibs/daSkies2/shaders/clouds2/daCloudsTonemap.hlsl index 1291c6228..f814883dc 100644 --- a/prog/gameLibs/daSkies2/shaders/clouds2/daCloudsTonemap.hlsl +++ b/prog/gameLibs/daSkies2/shaders/clouds2/daCloudsTonemap.hlsl @@ -3,7 +3,7 @@ #define TAA_IN_HDR_SPACE 0 #define TAA_BRIGHTNESS_SCALE 1. 
-#define TAA_CLOUDS_FRAMES 8 +#define TAA_CLOUDS_FRAMES 16 float simple_luma_tonemap(float luma, float exposure) { return rcp(luma * exposure + 1.0); } float simple_luma_tonemap_inv(float luma, float exposure) { return rcp(max(1.0 - luma * exposure, 0.001)); } diff --git a/prog/gameLibs/daSkies2/shaders/clouds2/noise_functions.hlsl b/prog/gameLibs/daSkies2/shaders/clouds2/noise_functions.hlsl index 124ebfe9d..4469085a8 100644 --- a/prog/gameLibs/daSkies2/shaders/clouds2/noise_functions.hlsl +++ b/prog/gameLibs/daSkies2/shaders/clouds2/noise_functions.hlsl @@ -19,7 +19,7 @@ void perlin_hash(float3 gridcell, float s, bool3 tile, float d = DOMAIN - 1.5; float3 gridcell_inc1 = step( gridcell, float3( d,d,d ) ) * ( gridcell + 1.0 ); - #if SHADER_COMPILER_HLSL2021 + #if __HLSL_VERSION >= 2021 gridcell_inc1 = select(tile, gridcell_inc1 % s, gridcell_inc1); #else gridcell_inc1 = tile ? gridcell_inc1 % s : gridcell_inc1; diff --git a/prog/gameLibs/dasModules/common/dagorMath.cpp b/prog/gameLibs/dasModules/common/dagorMath.cpp index 82809f302..a296c14a4 100644 --- a/prog/gameLibs/dasModules/common/dagorMath.cpp +++ b/prog/gameLibs/dasModules/common/dagorMath.cpp @@ -364,8 +364,7 @@ class DagorMath final : public das::Module das::addExtern(*this, lib, "safe_acos", das::SideEffects::none, "safe_acos"); das::addExtern(*this, lib, "safe_asin", das::SideEffects::none, "safe_asin"); das::addExtern(*this, lib, "safe_asin", das::SideEffects::none, "safe_asin"); - das::addExtern(*this, lib, "safe_atan2", das::SideEffects::none, "safe_atan2"); - das::addExtern(*this, lib, "safe_atan2", das::SideEffects::none, "safe_atan2"); + // don't add safe_atan2. Use atan2 instead. 
It's safe // todo: add TMatrix, Capsule functions das::addCtorAndUsing(*this, lib, "Capsule", " ::Capsule"); diff --git a/prog/gameLibs/dasModules/daInput/dagorInput.cpp b/prog/gameLibs/dasModules/daInput/dagorInput.cpp index 0cfb74213..b5c2bf98a 100644 --- a/prog/gameLibs/dasModules/daInput/dagorInput.cpp +++ b/prog/gameLibs/dasModules/daInput/dagorInput.cpp @@ -109,6 +109,8 @@ class DagorInputModule final : public das::Module pType->alias = "action_handle_t"; addAlias(pType); + das::addExtern(*this, lib, "get_double_click_time", das::SideEffects::accessExternal, + "dainput::get_double_click_time"); das::addExtern(*this, lib, "get_actions_binding_column_active", das::SideEffects::accessExternal, "dainput::get_actions_binding_column_active"); das::addExtern(*this, lib, "get_action_handle", das::SideEffects::accessExternal, diff --git a/prog/gameLibs/dasModules/pathFinder/pathFinder.cpp b/prog/gameLibs/dasModules/pathFinder/pathFinder.cpp index eb17bb248..2a98283f2 100644 --- a/prog/gameLibs/dasModules/pathFinder/pathFinder.cpp +++ b/prog/gameLibs/dasModules/pathFinder/pathFinder.cpp @@ -394,6 +394,10 @@ class PathfinderModule final : public das::Module das::addConstant(*this, "POLYAREA_JUMP", (int)pathfinder::POLYAREA_JUMP); das::addConstant(*this, "POLYAREA_WALKABLE", 63); // @see DT_TILECACHE_WALKABLE_AREA + das::addConstant(*this, "NM_MAIN", (int)pathfinder::NM_MAIN); + das::addConstant(*this, "NM_EXT_1", (int)pathfinder::NM_EXT_1); + das::addConstant(*this, "NMS_COUNT", (int)pathfinder::NMS_COUNT); + compileBuiltinModule("pathFinder.das", (unsigned char *)pathFinder_das, sizeof(pathFinder_das)); verifyAotReady(); } diff --git a/prog/gameLibs/dasModules/render/dagorDriver3dConsts.cpp b/prog/gameLibs/dasModules/render/dagorDriver3dConsts.cpp index ccea32b10..4623dd15e 100644 --- a/prog/gameLibs/dasModules/render/dagorDriver3dConsts.cpp +++ b/prog/gameLibs/dasModules/render/dagorDriver3dConsts.cpp @@ -76,7 +76,6 @@ void bind_driver_consts(das::Module &module) // 
bitfield BufferFlag BIND_UINT_CONST(SBCF_DYNAMIC) - BIND_UINT_CONST(SBCF_MAYBELOST) BIND_UINT_CONST(SBCF_ZEROMEM) BIND_UINT_CONST(SBCF_INDEX32) BIND_UINT_CONST(SBCF_FRAMEMEM) @@ -248,8 +247,9 @@ void bind_driver_consts(das::Module &module) BIND_UINT_CONST(TEXCF_MAYBELOST) BIND_UINT_CONST(TEXCF_STREAMING) BIND_UINT_CONST(TEXCF_SYSMEM) - BIND_UINT_CONST(TEXCF_MULTISAMPLED) - BIND_UINT_CONST(TEXCF_MSAATARGET) + BIND_UINT_CONST(TEXCF_SAMPLECOUNT_2) + BIND_UINT_CONST(TEXCF_SAMPLECOUNT_4) + BIND_UINT_CONST(TEXCF_SAMPLECOUNT_8) BIND_UINT_CONST(TEXCF_CPU_CACHED_MEMORY) BIND_UINT_CONST(TEXCF_LINEAR_LAYOUT) BIND_UINT_CONST(TEXCF_ESRAM_ONLY) diff --git a/prog/gameLibs/dasModules/sound/common/aotSoundSystem.cpp b/prog/gameLibs/dasModules/sound/common/aotSoundSystem.cpp index 4b49caa19..364dc9e88 100644 --- a/prog/gameLibs/dasModules/sound/common/aotSoundSystem.cpp +++ b/prog/gameLibs/dasModules/sound/common/aotSoundSystem.cpp @@ -16,11 +16,10 @@ class SoundSystemModule final : public das::Module // functions SND_BIND_FUN(have_sound, das::SideEffects::accessExternal); SND_BIND_FUN(get_listener_pos, das::SideEffects::accessExternal); - SND_BIND_FUN(update_listener, das::SideEffects::modifyExternal); - SND_BIND_FUN(reset_3d_listener, das::SideEffects::modifyExternal); + SND_BIND_FUN(sound_update_listener, das::SideEffects::modifyExternal); + SND_BIND_FUN(sound_reset_3d_listener, das::SideEffects::modifyExternal); SND_BIND_FUN(sound_banks_is_preset_loaded, das::SideEffects::accessExternal); SND_BIND_FUN(sound_debug, das::SideEffects::modifyExternal); - SND_BIND_FUN(get_enable_debug_draw, das::SideEffects::accessExternal); SND_BIND_FUN(sound_enable_distant_delay, das::SideEffects::modifyExternal); SND_BIND_FUN(sound_release_delayed_events, das::SideEffects::modifyExternal); SND_BIND_FUN(sound_override_time_speed, das::SideEffects::modifyExternal); @@ -28,6 +27,7 @@ class SoundSystemModule final : public das::Module SND_BIND_FUN(sound_banks_enable_preset_starting_with, 
das::SideEffects::modifyExternal); SND_BIND_FUN(sound_banks_is_preset_enabled, das::SideEffects::accessExternal); SND_BIND_FUN(sound_debug_enum_events, das::SideEffects::accessExternal); + SND_BIND_FUN(sound_update, das::SideEffects::modifyExternal); verifyAotReady(); } diff --git a/prog/gameLibs/dataBlockUtils/blkUtils.cpp b/prog/gameLibs/dataBlockUtils/blkUtils.cpp index e4c67d09b..019659c13 100644 --- a/prog/gameLibs/dataBlockUtils/blkUtils.cpp +++ b/prog/gameLibs/dataBlockUtils/blkUtils.cpp @@ -3,18 +3,18 @@ #include #include -bool check_param_exist(const DataBlock *blk, std::initializer_list param_names) +bool check_param_exist(const DataBlock &blk, std::initializer_list param_names) { G_ASSERT(param_names.size() > 0); for (auto param_name : param_names) - if (blk->paramExists(param_name)) + if (blk.paramExists(param_name)) return true; return false; } -bool check_all_params_exist(const DataBlock *blk, const char *prop_name, std::initializer_list param_names) +bool check_all_params_exist(const DataBlock &blk, const char *prop_name, std::initializer_list param_names) { G_ASSERT(param_names.size() > 0); @@ -23,7 +23,7 @@ bool check_all_params_exist(const DataBlock *blk, const char *prop_name, std::in for (auto param_name : param_names) { - if (blk->paramExists(param_name)) + if (blk.paramExists(param_name)) existSome = true; else existAll = false; @@ -32,25 +32,27 @@ bool check_all_params_exist(const DataBlock *blk, const char *prop_name, std::in if (existSome && !existAll) { for (auto param_name : param_names) - if (!blk->paramExists(param_name)) + if (!blk.paramExists(param_name)) debug("parameter %s in %s not defined!", param_name, prop_name); } return existAll; } -bool check_all_params_exist_in_subblocks(const DataBlock *blk, const char *subblock_name, const char *prop_name, +bool check_all_params_exist_in_subblocks(const DataBlock &blk, const char *subblock_name, const char *prop_name, std::initializer_list param_names) { - int subblockNameId = 
blk->getNameId(subblock_name); - if (subblockNameId < 0 || !blk->getBlockByName(subblockNameId)) + G_ASSERT(param_names.size() > 0); + + int subblockNameId = blk.getNameId(subblock_name); + if (subblockNameId < 0 || !blk.getBlockByName(subblockNameId)) return false; bool res = true; - for (int i = 0; i < blk->blockCount(); ++i) - if (blk->getBlock(i)->getBlockNameId() == subblockNameId) - res &= check_all_params_exist(blk->getBlock(i), prop_name, param_names); + for (int i = 0; i < blk.blockCount(); ++i) + if (blk.getBlock(i)->getBlockNameId() == subblockNameId) + res &= check_all_params_exist(*blk.getBlock(i), prop_name, param_names); return res; } diff --git a/prog/gameLibs/dataBlockUtils/interpolateBlk.cpp b/prog/gameLibs/dataBlockUtils/interpolateBlk.cpp index e4ebd85c4..610ef81d9 100644 --- a/prog/gameLibs/dataBlockUtils/interpolateBlk.cpp +++ b/prog/gameLibs/dataBlockUtils/interpolateBlk.cpp @@ -1,3 +1,5 @@ +#include + #include #include #include diff --git a/prog/gameLibs/datacache/file/filebackend.h b/prog/gameLibs/datacache/file/filebackend.h index a56bb04a9..afca185a0 100644 --- a/prog/gameLibs/datacache/file/filebackend.h +++ b/prog/gameLibs/datacache/file/filebackend.h @@ -12,7 +12,7 @@ namespace datacache class FileEntry; struct FindFilesAsyncJob; -class FileBackend : public Backend +class FileBackend final : public Backend { public: typedef eastl::hash_map EntriesMap; @@ -21,25 +21,25 @@ class FileBackend : public Backend ~FileBackend(); static Backend *create(const FileBackendConfig &config); - virtual Entry *get(const char *key, ErrorCode *error, completion_cb_t, void *); - virtual Entry *set(const char *key, int64_t modtime); - virtual bool del(const char *key); - virtual void delAll(); + Entry *get(const char *key, ErrorCode *error, completion_cb_t, void *) override; + Entry *set(const char *key, int64_t modtime) override; + bool del(const char *key) override; + void delAll() override; - virtual void control(int opcode, void *p1, void *) + void 
control(int opcode, void *p1, void *) override { if (opcode == _MAKE4C('CS')) csMgr = (WinCritSec *)p1; } - virtual Entry *nextEntry(void **iter); - virtual void endEnumeration(void **iter); + Entry *nextEntry(void **iter) override; + void endEnumeration(void **iter) override; - virtual int getEntriesCount(); + int getEntriesCount() override; - virtual void poll() {} + void poll() override {} - virtual bool hasFreeSpace() const override { return !manualEviction || curSize < maxSize; } + bool hasFreeSpace() const override { return !manualEviction || curSize < maxSize; } public: void doPopulate(); diff --git a/prog/gameLibs/datacache/web/webbackend.cpp b/prog/gameLibs/datacache/web/webbackend.cpp index 78deb3d1c..8093c14f0 100644 --- a/prog/gameLibs/datacache/web/webbackend.cpp +++ b/prog/gameLibs/datacache/web/webbackend.cpp @@ -301,11 +301,11 @@ struct AsyncJob : public cpujobs::IJob userCb[i].cb(key, ERR_UNKNOWN, NULL, userCb[i].arg); } - void callHeaderCb(streamio::StringMap const &headers) + void callRespHeadersCb(streamio::StringMap const &resp_headers) { for (int i = 0; i < userCb.size(); ++i) - if (userCb[i].header_cb) - userCb[i].header_cb(key, headers, userCb[i].arg); + if (userCb[i].resp_headers_cb) + userCb[i].resp_headers_cb(key, resp_headers, userCb[i].arg); } }; @@ -321,8 +321,6 @@ struct DownloadRequest : public AsyncJob ~DownloadRequest() { delete stream; } virtual streamio::ProcessResult onStreamData(dag::ConstSpan) { return streamio::ProcessResult::Discarded; } - - virtual void onHttpHeader() {} }; struct IndexDownloadRequest : public DownloadRequest @@ -582,30 +580,30 @@ streamio::ProcessResult WebBackend::onHttpData(DownloadRequest *req, dag::ConstS static eastl::string to_string(eastl::string_view const &sv) { return eastl::string(sv.begin(), sv.end()); } -struct PassHeaderToCallback final : public cpujobs::IJob +struct PassRespHeadersToCallback final : public cpujobs::IJob { DownloadRequest *req; - eastl::vector> headersList; - explicit 
PassHeaderToCallback(DownloadRequest *_req, streamio::StringMap const &_headers) : req(_req) + eastl::vector> respHeadersList; + explicit PassRespHeadersToCallback(DownloadRequest *_req, streamio::StringMap const &resp_headers) : req(_req) { - headersList.reserve(_headers.size()); - for (auto const &kv : _headers) - headersList.push_back(eastl::make_pair(to_string(kv.first), to_string(kv.second))); + respHeadersList.reserve(resp_headers.size()); + for (auto const &kv : resp_headers) + respHeadersList.push_back(eastl::make_pair(to_string(kv.first), to_string(kv.second))); } virtual void doJob() override{}; void releaseJob() override { WinAutoLockOpt lock(req->backend->csMgr); - req->callHeaderCb(streamio::StringMap(headersList.begin(), headersList.end())); + req->callRespHeadersCb(streamio::StringMap(respHeadersList.begin(), respHeadersList.end())); delete this; } }; -void WebBackend::onHttpHeader(streamio::StringMap const &headers, DownloadRequest *req) +void WebBackend::onHttpRespHeaders(streamio::StringMap const &resp_headers, DownloadRequest *req) { WinAutoLockOpt lock(csMgr); - auto job = new PassHeaderToCallback(req, headers); + auto job = new PassRespHeadersToCallback(req, resp_headers); if (req->syncJob) { job->doJob(); @@ -668,7 +666,7 @@ void WebBackend::onHttpReqComplete(DownloadRequest *req, const char *url, int er { DOTRACE1("re-request '%s' from new base %s", url, baseUrl.data()); delete stream; - getOrCreateStreamCtx().createStream(url, onHttpReqCompleteCb, onHttpDataCb, onHttpHeaderCb, nullptr, req, req->lastModified); + getOrCreateStreamCtx().createStream(url, onHttpReqCompleteCb, onHttpDataCb, onHttpRespHeadersCb, nullptr, req, req->lastModified); } } @@ -692,11 +690,11 @@ streamio::ProcessResult WebBackend::onHttpDataCb(dag::ConstSpan data, void return streamio::ProcessResult::Discarded; } -void WebBackend::onHttpHeaderCb(streamio::StringMap const &headers, void *arg) +void WebBackend::onHttpRespHeadersCb(streamio::StringMap const &resp_headers, 
void *arg) { auto req = reinterpret_cast(arg); if (req->backend->UBMagic == INIT_UB_MAGIC) - req->backend->onHttpHeader(headers, req); + req->backend->onHttpRespHeaders(resp_headers, req); else fatal("attempt to access to deleted instance!"); } @@ -808,15 +806,16 @@ DownloadRequest *WebBackend::downloadFile(const char *url, const char *key, User streamio::Context &ctx = getOrCreateStreamCtx(); lock.unlockFinal(); req->syncJob = true; - ctx.createStream(url, onHttpReqCompleteCb, onHttpDataCb, onHttpHeaderCb, nullptr, req, modified_since, true); + auto hdrCb = user_cb.resp_headers_cb ? onHttpRespHeadersCb : nullptr; + ctx.createStream(url, onHttpReqCompleteCb, onHttpDataCb, hdrCb, nullptr, req, modified_since, true); return NULL; } addAsyncJob(key, req, user_cb); req->lastModified = modified_since; // save for possible re-request - intptr_t reqId = - getOrCreateStreamCtx().createStream(url, onHttpReqCompleteCb, onHttpDataCb, onHttpHeaderCb, nullptr, req, req->lastModified); + auto hdrCb = user_cb.resp_headers_cb ? 
onHttpRespHeadersCb : nullptr; + intptr_t reqId = getOrCreateStreamCtx().createStream(url, onHttpReqCompleteCb, onHttpDataCb, hdrCb, nullptr, req, req->lastModified); if (reqId != 0) activeRequests.push_back(reqId); return req; @@ -934,16 +933,17 @@ Entry *WebBackend::getEntryNoIndex(const char *key, const char *url, ErrorCode * Entry *WebBackend::get(const char *key, ErrorCode *error, completion_cb_t cb, void *cb_arg) { - return getWithHeaders(key, error, cb, NULL, cb_arg); + return getWithRespHeaders(key, error, cb, NULL, cb_arg); } -Entry *WebBackend::getWithHeaders(const char *key, ErrorCode *error, completion_cb_t cb, header_cb_t headers_cb, void *cb_arg) +Entry *WebBackend::getWithRespHeaders(const char *key, ErrorCode *error, completion_cb_t cb, resp_headers_cb_t resp_headers_cb, + void *cb_arg) { DOTRACE3("%s '%s'", __FUNCTION__, key); if (shutdowning) RETURN_ENTRY(NULL, ERR_UNKNOWN); - UserCb userCb{cb, headers_cb, cb_arg}; + UserCb userCb{cb, resp_headers_cb, cb_arg}; char urlBuf[URL_BUF_SIZE], keyBuf[DAGOR_MAX_PATH]; if (is_url(key)) return getEntryNoIndex(NULL, key, error, userCb); diff --git a/prog/gameLibs/datacache/web/webbackend.h b/prog/gameLibs/datacache/web/webbackend.h index 31f2c7cb1..2d92deb90 100644 --- a/prog/gameLibs/datacache/web/webbackend.h +++ b/prog/gameLibs/datacache/web/webbackend.h @@ -21,7 +21,7 @@ struct IndexDownloadRequest; typedef eastl::vector> WeightedUrlsType; -class WebBackend : public Backend +class WebBackend final : public Backend { public: WebBackend(); @@ -32,7 +32,7 @@ class WebBackend : public Backend struct UserCb { completion_cb_t cb; - header_cb_t header_cb; + resp_headers_cb_t resp_headers_cb; void *arg; }; @@ -45,25 +45,25 @@ class WebBackend : public Backend }; Entry *getEntryNoIndex(const char *key, const char *url, ErrorCode *error, UserCb const &user_cb); - virtual Entry *get(const char *key, ErrorCode *error, completion_cb_t cb, void *cb_arg); - virtual Entry *getWithHeaders(const char *key, ErrorCode 
*error, completion_cb_t cb = NULL, header_cb_t headers_cb = NULL, - void *cb_arg = NULL); - virtual Entry *set(const char *key, int64_t modTime); - virtual bool del(const char *key); - virtual void delAll(); - virtual int getEntriesCount(); - virtual Entry *nextEntry(void **iter); - virtual void endEnumeration(void **iter); - virtual void control(int opc, void *p0, void *p1); - virtual void poll(); + Entry *get(const char *key, ErrorCode *error, completion_cb_t cb, void *cb_arg) override; + Entry *getWithRespHeaders(const char *key, ErrorCode *error, completion_cb_t cb = NULL, resp_headers_cb_t resp_headers_cb = NULL, + void *cb_arg = NULL) override; + Entry *set(const char *key, int64_t modTime) override; + bool del(const char *key) override; + void delAll() override; + int getEntriesCount() override; + Entry *nextEntry(void **iter) override; + void endEnumeration(void **iter) override; + void control(int opc, void *p0, void *p1) override; + void poll() override; void onHttpReqComplete(DownloadRequest *req, const char *url, int error, IGenLoad *stream, int64_t last_modified, intptr_t req_id); streamio::ProcessResult onHttpData(DownloadRequest *req, dag::ConstSpan data, intptr_t req_id); - void onHttpHeader(streamio::StringMap const &headers, DownloadRequest *req); + void onHttpRespHeaders(streamio::StringMap const &resp_headers, DownloadRequest *req); static void onHttpReqCompleteCb(const char *, int error, IGenLoad *stream, void *arg, int64_t last_modified, intptr_t req_id); static streamio::ProcessResult onHttpDataCb(dag::ConstSpan data, void *arg, intptr_t req_id); - static void onHttpHeaderCb(streamio::StringMap const &headers, void *arg); + static void onHttpRespHeadersCb(streamio::StringMap const &headers, void *arg); enum RequestType { @@ -76,7 +76,7 @@ class WebBackend : public Backend bool do_sync = false); IndexDownloadRequest *downloadIndex(const char *key = "", UserCb const &user_cb = UserCb{}); - virtual void abortActiveRequests(); + void 
abortActiveRequests() override; const char *getUrl(const char *url, char *buf, int buf_size); void collectGarbage(); diff --git a/prog/gameLibs/dxil/compiler_dxc.cpp b/prog/gameLibs/dxil/compiler_dxc.cpp index 574137718..43b127812 100644 --- a/prog/gameLibs/dxil/compiler_dxc.cpp +++ b/prog/gameLibs/dxil/compiler_dxc.cpp @@ -173,6 +173,10 @@ CompileResult compile(IDxcCompiler3 *compiler, UINT32 major, UINT32 minor, Wrapp { compilerParams[compilerParamsCount++] = L"-HV 2021"; } + else + { + compilerParams[compilerParamsCount++] = L"-HV 2018"; + } wchar_t spaceName[] = L"AUTO_DX12_REGISTER_SPACE=space?"; spaceName[30] = autoSpace[0]; diff --git a/prog/gameLibs/ecs/camera/cameraView.cpp.inl b/prog/gameLibs/ecs/camera/cameraView.cpp.inl index 46bb3b026..537012879 100644 --- a/prog/gameLibs/ecs/camera/cameraView.cpp.inl +++ b/prog/gameLibs/ecs/camera/cameraView.cpp.inl @@ -2,6 +2,7 @@ #include #include <3d/dag_drv3d.h> #include <3d/dag_render.h> +#include #include #include #include @@ -118,25 +119,26 @@ TMatrix4 calc_active_camera_globtm() return TMatrix4(viewTm) * projTm; } -void calc_camera_values(const CameraSetup &camera_setup, TMatrix &view_tm, Driver3dPerspective &persp, int &view_w, int &view_h) +TMatrix calc_camera_view_tm(const TMatrix &view_itm) { - G_ASSERT(!check_nan(camera_setup.transform)); + G_ASSERT(!check_nan(view_itm)); // we assume it is orthonormalized already #if DAGOR_DBGLEVEL > 0 - if (fabsf(lengthSq(camera_setup.transform.getcol(0)) - 1) > 1e-5f || fabsf(lengthSq(camera_setup.transform.getcol(1)) - 1) > 1e-5f || - fabsf(lengthSq(camera_setup.transform.getcol(2)) - 1) > 1e-5f || - fabsf(dot(camera_setup.transform.getcol(0), camera_setup.transform.getcol(1))) > 1e-6f || - fabsf(dot(camera_setup.transform.getcol(0), camera_setup.transform.getcol(2))) > 1e-6f || - fabsf(dot(camera_setup.transform.getcol(1), camera_setup.transform.getcol(2))) > 1e-6f) + if (fabsf(lengthSq(view_itm.getcol(0)) - 1) > 1e-5f || fabsf(lengthSq(view_itm.getcol(1)) - 1) > 
1e-5f || + fabsf(lengthSq(view_itm.getcol(2)) - 1) > 1e-5f || fabsf(dot(view_itm.getcol(0), view_itm.getcol(1))) > 1e-6f || + fabsf(dot(view_itm.getcol(0), view_itm.getcol(2))) > 1e-6f || fabsf(dot(view_itm.getcol(1), view_itm.getcol(2))) > 1e-6f) { - logerr("view matrix should be orthonormalized %@ %@ %@", camera_setup.transform.getcol(0), camera_setup.transform.getcol(1), - camera_setup.transform.getcol(2)); + logerr("view matrix should be orthonormalized %@ %@ %@", view_itm.getcol(0), view_itm.getcol(1), view_itm.getcol(2)); } #endif - d3d::get_screen_size(view_w, view_h); - view_tm = orthonormalized_inverse(camera_setup.transform); + return orthonormalized_inverse(view_itm); +} + +Driver3dPerspective calc_camera_perspective(const CameraSetup &camera_setup, int view_w, int view_h) +{ + Driver3dPerspective persp; persp.zn = camera_setup.znear; persp.zf = camera_setup.zfar; persp.ox = 0; @@ -162,20 +164,33 @@ void calc_camera_values(const CameraSetup &camera_setup, TMatrix &view_tm, Drive persp.wk = horFov; persp.hk = verFov; + return persp; } -void apply_camera_setup(const CameraSetup &camera_setup) +void calc_camera_values(const CameraSetup &camera_setup, TMatrix &view_tm, Driver3dPerspective &persp, int &view_w, int &view_h) { - TMatrix viewTm; - Driver3dPerspective persp; - int view_w, view_h; - calc_camera_values(camera_setup, viewTm, persp, view_w, view_h); + d3d::get_screen_size(view_w, view_h); + persp = calc_camera_perspective(camera_setup, view_w, view_h); + view_tm = calc_camera_view_tm(camera_setup.transform); +} - ::grs_cur_view.itm = camera_setup.transform; - ::grs_cur_view.tm = viewTm; +void apply_camera_setup(const TMatrix &view_itm, const TMatrix &view_tm, const Driver3dPerspective &persp, int view_w, int view_h) +{ + ::grs_cur_view.itm = view_itm; + ::grs_cur_view.tm = view_tm; ::grs_cur_view.pos = ::grs_cur_view.itm.getcol(3); d3d::settm(TM_VIEW, ::grs_cur_view.tm); d3d::setpersp(persp); d3d::setview(0, 0, view_w, view_h, 0, 1); } + +void 
apply_camera_setup(const CameraSetup &camera_setup) +{ + TMatrix viewTm; + Driver3dPerspective persp; + int view_w, view_h; + calc_camera_values(camera_setup, viewTm, persp, view_w, view_h); + + apply_camera_setup(camera_setup.transform, viewTm, persp, view_w, view_h); +} diff --git a/prog/gameLibs/ecs/input/input.cpp b/prog/gameLibs/ecs/input/input.cpp index eb3ec8817..8c7a0e533 100644 --- a/prog/gameLibs/ecs/input/input.cpp +++ b/prog/gameLibs/ecs/input/input.cpp @@ -171,7 +171,7 @@ class DaInputGamepadPollThread final : public DaThread } public: - DaInputGamepadPollThread(int step) : DaThread("dainput::poll"), stepMsec(step) {} + DaInputGamepadPollThread(int step) : DaThread("dainput::poll"), stepMsec(step) { stripStackInMinidump(); } }; static eastl::unique_ptr poll_thread = nullptr; diff --git a/prog/gameLibs/ecs/phys/animCharFastPhysES.cpp.gen.es.cpp b/prog/gameLibs/ecs/phys/animCharFastPhysES.cpp.gen.es.cpp index a9577ac2c..6e5889d70 100644 --- a/prog/gameLibs/ecs/phys/animCharFastPhysES.cpp.gen.es.cpp +++ b/prog/gameLibs/ecs/phys/animCharFastPhysES.cpp.gen.es.cpp @@ -6,10 +6,9 @@ static constexpr ecs::ComponentDesc debug_draw_fast_phys_es_comps[] = { //start of 1 rw components at [0] {ECS_HASH("animchar"), ecs::ComponentTypeInfo()}, -//start of 1 ro components at [1] - {ECS_HASH("animchar__res"), ecs::ComponentTypeInfo()}, -//start of 1 rq components at [2] - {ECS_HASH("animchar_fast_phys"), ecs::ComponentTypeInfo()} +//start of 2 rq components at [1] + {ECS_HASH("animchar_fast_phys"), ecs::ComponentTypeInfo()}, + {ECS_HASH("animchar_fast_phys_debug_render"), ecs::ComponentTypeInfo()} }; static void debug_draw_fast_phys_es_all(const ecs::UpdateStageInfo &__restrict info, const ecs::QueryView & __restrict components) { @@ -17,7 +16,6 @@ static void debug_draw_fast_phys_es_all(const ecs::UpdateStageInfo &__restrict i do debug_draw_fast_phys_es(*info.cast() , ECS_RW_COMP(debug_draw_fast_phys_es_comps, "animchar", AnimV20::AnimcharBaseComponent) - , 
ECS_RO_COMP(debug_draw_fast_phys_es_comps, "animchar__res", ecs::string) ); while (++comp != compE); } @@ -27,12 +25,12 @@ static ecs::EntitySystemDesc debug_draw_fast_phys_es_es_desc "prog/gameLibs/ecs/phys/animCharFastPhysES.cpp.inl", ecs::EntitySystemOps(debug_draw_fast_phys_es_all), make_span(debug_draw_fast_phys_es_comps+0, 1)/*rw*/, - make_span(debug_draw_fast_phys_es_comps+1, 1)/*ro*/, - make_span(debug_draw_fast_phys_es_comps+2, 1)/*rq*/, + empty_span(), + make_span(debug_draw_fast_phys_es_comps+1, 2)/*rq*/, empty_span(), ecs::EventSetBuilder<>::build(), (1<::build(), 0 ); +static constexpr ecs::ComponentDesc get_animchar_by_name_ecs_query_comps[] = +{ +//start of 1 rw components at [0] + {ECS_HASH("animchar__res"), ecs::ComponentTypeInfo()}, +//start of 1 ro components at [1] + {ECS_HASH("eid"), ecs::ComponentTypeInfo()} +}; +static ecs::CompileTimeQueryDesc get_animchar_by_name_ecs_query_desc +( + "get_animchar_by_name_ecs_query", + make_span(get_animchar_by_name_ecs_query_comps+0, 1)/*rw*/, + make_span(get_animchar_by_name_ecs_query_comps+1, 1)/*ro*/, + empty_span(), + empty_span()); +template +inline void get_animchar_by_name_ecs_query(Callable function) +{ + perform_query(g_entity_mgr, get_animchar_by_name_ecs_query_desc.getHandle(), + [&function](const ecs::QueryView& __restrict components) + { + auto comp = components.begin(), compE = components.end(); G_ASSERT(comp != compE); do + { + function( + ECS_RO_COMP(get_animchar_by_name_ecs_query_comps, "eid", ecs::EntityId) + , ECS_RW_COMP(get_animchar_by_name_ecs_query_comps, "animchar__res", ecs::string) + ); + + }while (++comp != compE); + } + ); +} diff --git a/prog/gameLibs/ecs/phys/animCharFastPhysES.cpp.inl b/prog/gameLibs/ecs/phys/animCharFastPhysES.cpp.inl index 9920e48d6..598ba81dc 100644 --- a/prog/gameLibs/ecs/phys/animCharFastPhysES.cpp.inl +++ b/prog/gameLibs/ecs/phys/animCharFastPhysES.cpp.inl @@ -10,6 +10,8 @@ #include #include #include +#include +#include using namespace AnimV20; @@ -92,17 
+94,87 @@ static void animchar_fast_phys_destroy_es_event_handler(const ecs::Event &, Anim } } +eastl::vector_set debugAnimCharsSet; +#define TEMPLATE_NAME "animchar_fast_phys_debug_render" +const char *template_name = TEMPLATE_NAME; +void createTemplate() +{ + ecs::ComponentsMap map; + map[ECS_HASH(TEMPLATE_NAME)] = ecs::Tag(); + eastl::string name = template_name; + g_entity_mgr->addTemplate(ecs::Template(template_name, eastl::move(map), ecs::Template::component_set(), + ecs::Template::component_set(), ecs::Template::component_set(), false)); + g_entity_mgr->instantiateTemplate(g_entity_mgr->buildTemplateIdByName(template_name)); +} + +void removeSubTemplateAsync(ecs::EntityId eid) +{ + if (const char *fromTemplate = g_entity_mgr->getEntityTemplateName(eid)) + { + if (!g_entity_mgr->getTemplateDB().getTemplateByName(template_name)) + createTemplate(); + const auto newTemplate = remove_sub_template_name(fromTemplate, template_name ? template_name : ""); + g_entity_mgr->reCreateEntityFromAsync(eid, newTemplate.c_str()); + } +} + +void addSubTemplateAsync(ecs::EntityId eid) +{ + if (const char *fromTemplate = g_entity_mgr->getEntityTemplateName(eid)) + { + if (!g_entity_mgr->getTemplateDB().getTemplateByName(template_name)) + createTemplate(); + const auto newTemplate = add_sub_template_name(fromTemplate, template_name ? 
template_name : ""); + g_entity_mgr->reCreateEntityFromAsync(eid, newTemplate.c_str()); + } +} + +template +static void get_animchar_by_name_ecs_query(Callable c); + + +void toggleDebugAnimChar(eastl::string &str) +{ + auto it = debugAnimCharsSet.find(str); + if (it != debugAnimCharsSet.end()) + { + get_animchar_by_name_ecs_query([&](ecs::EntityId eid, ecs::string &animchar__res) { + if (animchar__res == str) + removeSubTemplateAsync(eid); + }); + debugAnimCharsSet.erase(it); + } + else + { + get_animchar_by_name_ecs_query([&](ecs::EntityId eid, ecs::string &animchar__res) { + if (animchar__res == str) + addSubTemplateAsync(eid); + }); + debugAnimCharsSet.insert(str); + } +} + +void resetDebugAnimChars() +{ + for (auto it = debugAnimCharsSet.begin(); it != debugAnimCharsSet.end(); ++it) + { + eastl::string str = *it; + get_animchar_by_name_ecs_query([&](ecs::EntityId eid, ecs::string &animchar__res) { + if (animchar__res == str) + removeSubTemplateAsync(eid); + }); + } + debugAnimCharsSet.clear(); +} + ECS_NO_ORDER -ECS_REQUIRE(FastPhysTag animchar_fast_phys) -ECS_TAG(render) -static void debug_draw_fast_phys_es(const UpdateStageInfoRenderDebug &, AnimcharBaseComponent &animchar, ecs::string animchar__res) +ECS_REQUIRE(FastPhysTag animchar_fast_phys, ecs::Tag animchar_fast_phys_debug_render) +ECS_TAG(dev, render) +static void debug_draw_fast_phys_es(const UpdateStageInfoRenderDebug &, AnimcharBaseComponent &animchar) { FastPhysSystem *fastPhys = animchar.getFastPhysSystem(); if (!fastPhys) return; - if (!FastPhys::checkDebugAnimChar(animchar__res)) - return; - begin_draw_cached_debug_lines(); for (auto action : fastPhys->updateActions) @@ -121,10 +193,10 @@ static bool fastphys_console_handler(const char *argv[], int argc) if (argc > 1) { eastl::string resName(argv[1]); - FastPhys::toggleDebugAnimChar(resName); + toggleDebugAnimChar(resName); } else - FastPhys::resetDebugAnimChars(); + resetDebugAnimChars(); } return found; } diff --git 
a/prog/gameLibs/ecs/phys/particlePhysSys.cpp b/prog/gameLibs/ecs/phys/particlePhysSys.cpp index eff0ebc7e..d03ef6417 100644 --- a/prog/gameLibs/ecs/phys/particlePhysSys.cpp +++ b/prog/gameLibs/ecs/phys/particlePhysSys.cpp @@ -15,8 +15,18 @@ ECS_TAG(render) static __forceinline void particle_phys_es_event_handler(const ecs::EventEntityCreated &, const ecs::string &particle_phys__blk, const AnimV20::AnimcharBaseComponent &animchar, daphys::ParticlePhysSystem &particle_phys) { - const DataBlock blk(particle_phys__blk.c_str(), framemem_ptr()); - particle_phys.loadFromBlk(&blk, animchar.getOriginalNodeTree()); + char blkName[DAGOR_MAX_PATH]; + strncpy(blkName, particle_phys__blk.c_str(), sizeof(blkName) - 1); + blkName[sizeof(blkName) - 1] = '\0'; + char *pColon = strchr(blkName, ':'); + const char *blockName = NULL; + if (pColon) + { + *pColon = '\0'; // cut block name off + blockName = pColon + 1; + } + const DataBlock blk(blkName, framemem_ptr()); + particle_phys.loadFromBlk(blockName ? blk.getBlockByName(blockName) : &blk, animchar.getOriginalNodeTree()); } ECS_NO_ORDER diff --git a/prog/gameLibs/ecs/scripts/das/das_es.cpp b/prog/gameLibs/ecs/scripts/das/das_es.cpp index c65e331d4..171c5771e 100644 --- a/prog/gameLibs/ecs/scripts/das/das_es.cpp +++ b/prog/gameLibs/ecs/scripts/das/das_es.cpp @@ -1637,7 +1637,7 @@ bool load_das_script(const char *name, const char *program_text) return scripts.loadScript(fname, access, globally_aot_mode, globally_resolve_ecs_on_load, globally_log_aot_errors); } -inline bool internal_load_das_script_sync(const char *fname, ResolveECS resolve_ecs) +static inline bool internal_load_das_script_sync(const char *fname, ResolveECS resolve_ecs) { String tmpPath; return scripts.loadScript(dd_resolve_named_mount(tmpPath, fname) ? 
tmpPath.c_str() : fname, @@ -1645,16 +1645,24 @@ inline bool internal_load_das_script_sync(const char *fname, ResolveECS resolve_ globally_log_aot_errors); } -inline bool internal_load_das_script(const char *fname, ResolveECS resolve_ecs) +bool load_das_script(const char *fname) { if (globally_loading_in_queue) - return bind_dascript::enqueue_das_script(fname); + { + loadingQueue.push_back(fname); +#if DAGOR_DBGLEVEL > 0 + const uint32_t pathHash = ecs_str_hash(fname); + if (loadingQueueHash.find(pathHash) != loadingQueueHash.end()) + logerr("das: file '%s' was loaded multiple times, probably load_folder() was called multiple times with same arguments", fname); + else + loadingQueueHash.insert(pathHash); +#endif + return true; + } - return internal_load_das_script_sync(fname, resolve_ecs); + return internal_load_das_script_sync(fname, globally_resolve_ecs_on_load); } -bool load_das_script(const char *fname) { return internal_load_das_script(fname, globally_resolve_ecs_on_load); } - bool load_das_script_debugger(const char *fname) { #if DAGOR_DBGLEVEL > 0 && _TARGET_PC @@ -1805,7 +1813,7 @@ static bool load_scripts_from_serialized_data() return ok; } -static bool stop_das_loading_queue(TInitDas init) +bool stop_loading_queue(TInitDas init) { globally_loading_in_queue = false; @@ -1899,50 +1907,42 @@ static bool stop_das_loading_queue(TInitDas init) return ok; } -bool internal_load_entry_script(const char *fname, TInitDas init) +static bool internal_load_entry_script(const char *fname) { - G_ASSERT(!globally_loading_in_queue && loadingQueue.empty()); - globally_loading_in_queue = true; const bool res = scripts.loadScript(fname, das::make_smart(scripts.getFileAccess(), globally_hot_reload), globally_aot_mode, ResolveECS::NO, globally_log_aot_errors); - return stop_das_loading_queue(init) && res; + return res; } -bool enqueue_das_script(const char *fname) +static uint64_t entryScriptStartTime; +static size_t entryScriptMemUsed; + +void begin_loading_queue() { - 
loadingQueue.push_back(fname); -#if DAGOR_DBGLEVEL > 0 - const uint32_t pathHash = ecs_str_hash(fname); - if (loadingQueueHash.find(pathHash) != loadingQueueHash.end()) - logerr("das: file '%s' was loaded multiple times, probably load_folder() was called multiple times with same arguments", fname); - else - loadingQueueHash.insert(pathHash); -#endif - return true; + entryScriptStartTime = profile_ref_ticks(); + scripts.statistics = {}; // reset das load stats from previous load + entryScriptMemUsed = dagor_memory_stat::get_memory_allocated(true); + G_ASSERT(!globally_loading_in_queue && loadingQueue.empty()); + globally_loading_in_queue = true; } bool load_entry_script(const char *entry_point_name, TInitDas init, LoadEntryScriptCtx ctx) { - const uint64_t startTime = profile_ref_ticks(); - scripts.statistics = {}; // reset das load stats from previous load - const size_t memUsed = dagor_memory_stat::get_memory_allocated(true); - bool res = false; - if (globally_load_threads_num > 1) - { - das::ReuseCacheGuard guard; - res = internal_load_entry_script(entry_point_name, init); - } - else - { - res = internal_load_entry_script(entry_point_name, init); - } + begin_loading_queue(); + bool res = internal_load_entry_script(entry_point_name) && stop_loading_queue(init); + end_loading_queue(ctx); + return res; +} + +void end_loading_queue(LoadEntryScriptCtx ctx) +{ scripts.done(); scripts.cleanupMemoryUsage(); initDeserializer.moduleLibrary = nullptr; // Memory has already been reset - scripts.statistics.loadTimeMs = profile_time_usec(startTime) / 1000; - scripts.statistics.memoryUsage = int64_t(dagor_memory_stat::get_memory_allocated(true) - memUsed) + ctx.additionalMemoryUsage; + scripts.statistics.loadTimeMs = profile_time_usec(entryScriptStartTime) / 1000; + scripts.statistics.memoryUsage = + int64_t(dagor_memory_stat::get_memory_allocated(true) - entryScriptMemUsed) + ctx.additionalMemoryUsage; dump_statistics(); - return res; } bool main_thread_post_load() @@ -3065,6 
+3065,7 @@ void das_load_ecs_templates() scripts.postProcessModuleGroups(); // currently module groups contains only templates } } // namespace bind_dascript +DAG_DECLARE_RELOCATABLE(bind_dascript::DascriptLoadJob); static void pull() { diff --git a/prog/gameLibs/eventLog/httpRequest.cpp b/prog/gameLibs/eventLog/httpRequest.cpp index 82f6891e7..490fc111d 100644 --- a/prog/gameLibs/eventLog/httpRequest.cpp +++ b/prog/gameLibs/eventLog/httpRequest.cpp @@ -20,6 +20,7 @@ static httprequests::RequestId make_request(const char *url, const void *data, u reqParams.postData = dag::ConstSpan((const char *)data, size); reqParams.reqTimeoutMs = timeout * 1000; // S -> MS reqParams.connectTimeoutMs = timeout * 1000; // S -> MS + reqParams.needResponseHeaders = false; reqParams.callback = httprequests::make_http_callback(callback); return httprequests::async_request(reqParams); } diff --git a/prog/gameLibs/fftWater/flowMap.cpp b/prog/gameLibs/fftWater/flowMap.cpp index b5ce9f498..a9926a856 100644 --- a/prog/gameLibs/fftWater/flowMap.cpp +++ b/prog/gameLibs/fftWater/flowMap.cpp @@ -70,8 +70,7 @@ void build_flowmap(FFTWater *handle, FlowmapParams &flowmap_params, int flowmap_ frame = 1; init_shader_vars(); - set_flowmap_params(flowmap_params); - set_flowmap_foam_params(flowmap_params); + set_flowmap_tex(flowmap_params); texA.close(); texB.close(); @@ -88,13 +87,16 @@ void build_flowmap(FFTWater *handle, FlowmapParams &flowmap_params, int flowmap_ builder.init("water_flowmap"); - windsBuf = dag::create_sbuffer(sizeof(FlowmapWind), MAX_FLOWMAP_WINDS, - SBCF_BIND_CONSTANT | SBCF_CPU_ACCESS_WRITE | SBCF_MAYBELOST | SBCF_DYNAMIC, 0, "water_flowmap_winds"); + windsBuf = dag::create_sbuffer(sizeof(FlowmapWind), MAX_FLOWMAP_WINDS, SBCF_BIND_CONSTANT | SBCF_CPU_ACCESS_WRITE | SBCF_DYNAMIC, + 0, "water_flowmap_winds"); } if (!texA || !texB) return; + set_flowmap_params(flowmap_params); + set_flowmap_foam_params(flowmap_params); + float cameraSnap = float(flowmap_texture_size) / (range * 2); 
Point3 cameraPos = camera_pos; if (cameraSnap != 0) @@ -165,20 +167,11 @@ void build_flowmap(FFTWater *handle, FlowmapParams &flowmap_params, int flowmap_ d3d::driver_command(DRV3D_COMMAND_RELEASE_OWNERSHIP, NULL, NULL, NULL); } -void set_flowmap_params(FlowmapParams &flowmap_params) +void set_flowmap_tex(FlowmapParams &flowmap_params) { if (flowmap_params.frame == 0) return; - Point4 flowmapStrength = flowmap_params.flowmapStrength; - if (!flowmap_params.usingFoamFx) - flowmapStrength.w = 0; - - ShaderGlobal::set_real(water_wind_strengthVarId, flowmap_params.windStrength); - ShaderGlobal::set_real(water_flowmap_fadingVarId, flowmap_params.flowmapFading); - ShaderGlobal::set_color4(water_flowmap_strengthVarId, flowmapStrength); - ShaderGlobal::set_color4(water_flowmap_strength_addVarId, flowmap_params.flowmapStrengthAdd); - SharedTexHolder &tex = flowmap_params.tex; String &texName = flowmap_params.texName; Point4 &texArea = flowmap_params.texArea; @@ -192,6 +185,21 @@ void set_flowmap_params(FlowmapParams &flowmap_params) } } +void set_flowmap_params(FlowmapParams &flowmap_params) +{ + if (flowmap_params.frame == 0) + return; + + Point4 flowmapStrength = flowmap_params.flowmapStrength; + if (!flowmap_params.usingFoamFx) + flowmapStrength.w = 0; + + ShaderGlobal::set_real(water_wind_strengthVarId, flowmap_params.windStrength); + ShaderGlobal::set_real(water_flowmap_fadingVarId, flowmap_params.flowmapFading); + ShaderGlobal::set_color4(water_flowmap_strengthVarId, flowmapStrength); + ShaderGlobal::set_color4(water_flowmap_strength_addVarId, flowmap_params.flowmapStrengthAdd); +} + void set_flowmap_foam_params(FlowmapParams &flowmap_params) { if (flowmap_params.frame == 0) @@ -270,6 +278,9 @@ void flowmap_floodfill(int texSize, Texture *heightmapTex, Texture *floodfillTex int fx = 0; int fy = 0; + int nx = 0; + int ny = 0; + if ((u - 1 >= 0) && (height[-1] < heightmapLevel)) { if (flood[-1] == 0) @@ -281,7 +292,11 @@ void flowmap_floodfill(int texSize, Texture 
*heightmapTex, Texture *floodfillTex queueEnd = 0; } else if (flood[-1] > 1) + { fx++; + nx += ((flood[-1] >> 0) & 0xff) - 0x80; + ny += ((flood[-1] >> 8) & 0xff) - 0x80; + } } if ((u + 1 < texSize) && (height[1] < heightmapLevel)) { @@ -294,7 +309,11 @@ void flowmap_floodfill(int texSize, Texture *heightmapTex, Texture *floodfillTex queueEnd = 0; } else if (flood[1] > 1) + { fx--; + nx += ((flood[1] >> 0) & 0xff) - 0x80; + ny += ((flood[1] >> 8) & 0xff) - 0x80; + } } if ((v - 1 >= 0) && (height[-texSize] < heightmapLevel)) { @@ -307,7 +326,11 @@ void flowmap_floodfill(int texSize, Texture *heightmapTex, Texture *floodfillTex queueEnd = 0; } else if (flood[-texSize] > 1) + { fy++; + nx += ((flood[-texSize] >> 0) & 0xff) - 0x80; + ny += ((flood[-texSize] >> 8) & 0xff) - 0x80; + } } if ((v + 1 < texSize) && (height[texSize] < heightmapLevel)) { @@ -320,10 +343,25 @@ void flowmap_floodfill(int texSize, Texture *heightmapTex, Texture *floodfillTex queueEnd = 0; } else if (flood[texSize] > 1) + { fy--; + nx += ((flood[texSize] >> 0) & 0xff) - 0x80; + ny += ((flood[texSize] >> 8) & 0xff) - 0x80; + } + } + + fx = fx * 6 + nx; + fy = fy * 6 + ny; + + int i = fx * fx + fy * fy; + if (i) + { + float f = 127.0f / sqrtf(float(i)); + fx = int(float(fx) * f); + fy = int(float(fy) * f); } - flood[0] = uint16_t(((fy * 0x7f + 0x80) << 8) | (fx * 0x7f + 0x80)); + flood[0] = uint16_t(((fy + 0x80) << 8) | (fx + 0x80)); } } } diff --git a/prog/gameLibs/fftWater/gpuFetch.cpp b/prog/gameLibs/fftWater/gpuFetch.cpp index d07392631..27b50df25 100644 --- a/prog/gameLibs/fftWater/gpuFetch.cpp +++ b/prog/gameLibs/fftWater/gpuFetch.cpp @@ -108,7 +108,7 @@ class GpuFetchQuery destroy(); maxNumber = clamp(maxNumber, (int)0, (int)4096); gpuTexture = d3d::create_tex(NULL, maxNumber, 1, TEXCF_RTARGET | TEXCF_LINEAR_LAYOUT | TEXFMT_R32F, 1, "water_fetch_gpu"); - vb = d3d::create_vb(maxNumber * sizeof(float) * 4, SBCF_DYNAMIC | SBCF_MAYBELOST, "histogram"); + vb = d3d::create_vb(maxNumber * 
sizeof(float) * 4, SBCF_DYNAMIC, "histogram"); event = d3d::create_event_query(); clear_and_resize(results, maxNumber); for (int i = 0; i < maxNumber; ++i) diff --git a/prog/gameLibs/fftWater/waterRenderGpGPU.cpp b/prog/gameLibs/fftWater/waterRenderGpGPU.cpp index 33b0a46c5..40e7f51d7 100644 --- a/prog/gameLibs/fftWater/waterRenderGpGPU.cpp +++ b/prog/gameLibs/fftWater/waterRenderGpGPU.cpp @@ -283,11 +283,11 @@ bool GPGPUData::init(const NVWaveWorks_FFT_CPU_Simulation *fft, int numCascades) G_ASSERT(h0Mat); h0Element = h0Mat->make_elem(); int num_quads = (numCascades + 1) / 2; - ht0Ibuf = d3d::create_ib(sizeof(uint16_t) * 6 * num_quads, SBCF_MAYBELOST, "ht0Ibuf"); + ht0Ibuf = d3d::create_ib(sizeof(uint16_t) * 6 * num_quads, 0, "ht0Ibuf"); d3d_err(ht0Ibuf); // create vbuffer for fft/butterflies - fftVbuf = d3d::create_vb(sizeof(Point3) * 3 * miNumButterflies, SBCF_MAYBELOST, "fftVbuf"); + fftVbuf = d3d::create_vb(sizeof(Point3) * 3 * miNumButterflies, 0, "fftVbuf"); d3d_err(fftVbuf); buffersReady = false; fftVMat = new_shader_material_by_name("fftV"); @@ -319,7 +319,7 @@ void GPGPUData::updateHt0WindowsVB(const NVWaveWorks_FFT_CPU_Simulation *fft, in int num_quads = (numCascades + 1) / 2; del_d3dres(ht0Vbuf); - ht0Vbuf = d3d::create_vb(sizeof(Ht0Vertex) * 4 * num_quads, SBCF_MAYBELOST, "ht0Vbuf"); + ht0Vbuf = d3d::create_vb(sizeof(Ht0Vertex) * 4 * num_quads, 0, "ht0Vbuf"); d3d_err(ht0Vbuf); G_ASSERT(ht0Vbuf); h0GPUUpdateRequired = true; diff --git a/prog/gameLibs/gamePhys/phys/destrRender/destructablesRender.cpp b/prog/gameLibs/gamePhys/phys/destrRender/destructablesRender.cpp index f8074af4a..9b38e9b22 100644 --- a/prog/gameLibs/gamePhys/phys/destrRender/destructablesRender.cpp +++ b/prog/gameLibs/gamePhys/phys/destrRender/destructablesRender.cpp @@ -8,7 +8,6 @@ #include #include - void destructables::before_render(const Point3 &view_pos) { for (const auto destr : destructables::getDestructableObjects()) @@ -18,10 +17,13 @@ void destructables::before_render(const 
Point3 &view_pos) } } -void destructables::render(dynrend::ContextId inst_ctx, const Frustum &frustum) +void destructables::render(dynrend::ContextId inst_ctx, const Frustum &frustum, float min_bbox_radius) { static int instanceInitPosVarId = get_shader_variable_id("instance_init_pos_const_no", true); static int instanceInitPosConstNo = instanceInitPosVarId >= 0 ? ShaderGlobal::get_int_fast(instanceInitPosVarId) : -1; + + float min_bbox_r2 = min_bbox_radius * min_bbox_radius; + for (const auto destr : destructables::getDestructableObjects()) { if (!destr->isAlive()) @@ -31,12 +33,16 @@ void destructables::render(dynrend::ContextId inst_ctx, const Frustum &frustum) if (dynrend::is_initialized()) { if (destr->rendData) + { for (const DestrRendData::RendData &rdata : destr->rendData->rendData) { if (!rdata.inst) continue; - if (!frustum.testSphere(rdata.inst->getBoundingSphere())) + const BSphere3 &boundingSphere = rdata.inst->getBoundingSphere(); + + if ( + !frustum.testSphere(boundingSphere) || (min_bbox_r2 > 0 && rdata.inst->getBoundingBox().width().lengthSq() < min_bbox_r2)) continue; dynrend::PerInstanceRenderData renderData; @@ -44,6 +50,7 @@ void destructables::render(dynrend::ContextId inst_ctx, const Frustum &frustum) renderData.params.push_back(initialPos); dynrend::add(inst_ctx, rdata.inst, rdata.initialNodes, &renderData); } + } } else { diff --git a/prog/gameLibs/gamePhys/phys/destrRenderStub/destructablesStub.cpp b/prog/gameLibs/gamePhys/phys/destrRenderStub/destructablesStub.cpp index 685b85142..514fb3cf4 100644 --- a/prog/gameLibs/gamePhys/phys/destrRenderStub/destructablesStub.cpp +++ b/prog/gameLibs/gamePhys/phys/destrRenderStub/destructablesStub.cpp @@ -6,4 +6,4 @@ destructables::DestrRendData *destructables::init_rend_data(DynamicPhysObjectClass *) { return nullptr; } void destructables::clear_rend_data(destructables::DestrRendData *) {} void destructables::before_render(const Point3 &) {} -void destructables::render(dynrend::ContextId, const 
Frustum &) {} +void destructables::render(dynrend::ContextId, const Frustum &, float) {} diff --git a/prog/gameLibs/gamePhys/phys/rendinstDestr.cpp b/prog/gameLibs/gamePhys/phys/rendinstDestr.cpp index dd7e3d25a..c41e5e610 100644 --- a/prog/gameLibs/gamePhys/phys/rendinstDestr.cpp +++ b/prog/gameLibs/gamePhys/phys/rendinstDestr.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -39,6 +40,7 @@ static const Point3 RI_DESTROY_BBOX_MARGIN_UP = Point3(0, 0.05, 0); static bool apply_reflection = false; static bool apply_cached_destr = false; +static bool apply_delayed_extra_destr = false; static rendinstdestr::create_tree_rend_inst_destr_cb create_tree_cb = NULL; static rendinstdestr::remove_tree_rendinst_destr_cb rem_tree_cb = NULL; static rendinstdestr::remove_physx_collision_object_callback rem_physx_collision_obj_cb = NULL; @@ -53,9 +55,12 @@ static rendinstdestr::DestrSettings destrSettings; static rendinst::ri_damage_effect_cb ri_effect_cb = nullptr; static PhysWorld *phys_world = NULL; static danet::BitStream cachedDestr; +static WinCritSec delayed_ri_extra_destr_mutex; +static dag::Vector delayed_ri_extra_destruction; static Tab riPhys(midmem); static Tab cachedCollisionObjects(midmem); static CollisionObject tree_collision; +static void do_delayed_ri_extra_destruction_impl(); #if ENABLE_APEX template <> struct eastl::hash @@ -175,6 +180,8 @@ void rendinstdestr::init_ex(mpi::ObjectID oid, rendinstdestr::on_destr_changed_c { on_changed_destr_cb = on_destr_cb; init_refl_object(oid); + apply_cached_destr = true; + apply_delayed_extra_destr = true; rendinst::enable_apex = false; create_tree_cb = create_tree_destr_cb; rem_tree_cb = rem_tree_destr_cb; @@ -184,6 +191,7 @@ void rendinstdestr::init_ex(mpi::ObjectID oid, rendinstdestr::on_destr_changed_c apex_force_remove_actor_cb = apex_remove_actor_cb; get_current_camera_pos = get_current_camera_pos_; rendinst::registerRIGenExtraInvalidateHandleCb(invalidate_handle_cb); + 
rendinst::do_delayed_ri_extra_destruction = do_delayed_ri_extra_destruction_impl; } void rendinstdestr::init(rendinstdestr::on_destr_changed_callback on_destr_cb, bool apply_cache, @@ -240,6 +248,9 @@ void rendinstdestr::endSession() enable_reflection_refl_object(false); apply_reflection = false; cachedDestr.Reset(); + + WinAutoLock lock(delayed_ri_extra_destr_mutex); + clear_and_shrink(delayed_ri_extra_destruction); } mpi::IObject *rendinstdestr::getReflectionObject() { return get_refl_object(); } @@ -841,6 +852,30 @@ bool rendinstdestr::serialize_destr_data(danet::BitStream &bs) return cellCount > 0; } +static void do_delayed_ri_extra_destruction_impl() +{ + using namespace rendinstdestr; + if (!apply_delayed_extra_destr) + return; + + WinAutoLock lock(delayed_ri_extra_destr_mutex); + for (auto it = delayed_ri_extra_destruction.begin(); it != delayed_ri_extra_destruction.end();) + { + auto desc = *it; + if (const auto idx = rendinst::find_restorable_data_index(desc); idx >= 0) + { + desc.idx = idx; + destroyRendinst(desc, false, ZERO(), ZERO(), 0.f, NULL, get_ri_damage_effect_cb() != nullptr, + get_ri_damage_effect_cb(), get_destr_settings().isClient); + it = delayed_ri_extra_destruction.erase_unsorted(it); + } + else + { + ++it; + } + } +} + void rendinstdestr::deserialize_destr_data(const danet::BitStream &bs, int apply_flags, int max_simultaneous_destrs) { // Temporary structure to keep it @@ -851,30 +886,30 @@ void rendinstdestr::deserialize_destr_data(const danet::BitStream &bs, int apply uint16_t cellCount = 0; bs.ReadCompressed(cellCount); - rendinst::getDestrCellData(0 /*primary layer*/, [&](const Tab &destrCellData) { - if (!apply_reflection) + if (!apply_reflection) + { + cachedDestr = bs; + for (int i = 0; i < cellCount; ++i) { - cachedDestr = bs; - for (int i = 0; i < cellCount; ++i) - { - uint16_t poolCount = 0; + uint16_t poolCount = 0; - bs.IgnoreBytes(sizeof(int16_t)); - bs.ReadCompressed(poolCount); - for (int j = 0; j < poolCount; ++j) - { - 
uint16_t rangeCount = 0; - uint16_t poolIdx = 0; + bs.IgnoreBytes(sizeof(int16_t)); + bs.ReadCompressed(poolCount); + for (int j = 0; j < poolCount; ++j) + { + uint16_t rangeCount = 0; + uint16_t poolIdx = 0; - bs.ReadCompressed(poolIdx); - bs.ReadCompressed(rangeCount); + bs.ReadCompressed(poolIdx); + bs.ReadCompressed(rangeCount); - bs.IgnoreBytes(2 * sizeof(uint16_t) * rangeCount); - } + bs.IgnoreBytes(2 * sizeof(uint16_t) * rangeCount); } - return true; } + return; + } + rendinst::getDestrCellData(0 /*primary layer*/, [&](const Tab &destrCellData) { cellsNewDestrInfo.resize(cellCount); for (int i = 0; i < cellCount; ++i) @@ -1043,6 +1078,7 @@ void rendinstdestr::deserialize_destr_data(const danet::BitStream &bs, int apply }); newDestrs = ((apply_flags & INITIAL_REPLICATION) != 0) ? 0 : min(newDestrs, max_simultaneous_destrs); + dag::Vector delayedRiExtraDestruction; for (int i = 0; i < cellsNewDestrInfo.size(); ++i) { @@ -1078,6 +1114,12 @@ void rendinstdestr::deserialize_destr_data(const danet::BitStream &bs, int apply desc.idx = idx; ok = true; } + else if (apply_delayed_extra_destr) + { + // if ri extra was not found, add it to delayed destruction list to destroy it + // in do_delayed_ri_extra_destruction, if it will appear later + delayedRiExtraDestruction.push_back(desc); + } } // Only call destroyRendinst if a handle (i.e. idx exists) is found, if it's not found, // then desc.idx will be 0 and will trigger all kinds of side effects with make_handle(pool, 0). 
@@ -1097,6 +1139,17 @@ void rendinstdestr::deserialize_destr_data(const danet::BitStream &bs, int apply if (shouldUpdateVb && cell.cellId >= 0) rendinst::updateRiGenVbCell(0, cell.cellId); } + + if (!delayedRiExtraDestruction.empty()) + { + WinAutoLock lock(delayed_ri_extra_destr_mutex); + for (const auto &desc : delayedRiExtraDestruction) + { + if (eastl::find(delayed_ri_extra_destruction.begin(), delayed_ri_extra_destruction.end(), desc) == + delayed_ri_extra_destruction.end()) + delayed_ri_extra_destruction.push_back(desc); + } + } } bool rendinstdestr::apply_damage_to_riextra(rendinst::riex_handle_t handle, float dmg, const Point3 &pos, const Point3 &impulse, @@ -1616,6 +1669,22 @@ void rendinstdestr::doRIExtraDamageInBox(const BBox3 &box, rendinst::ri_damage_e rendinst::CollisionInfo collInfo(riDesc); collInfo = rendinst::getRiGenDestrInfo(riDesc); + Point3 riPos = collInfo.tm.getcol(3); + if (check_sphere) + { + const BBox3 riBBox = collInfo.tm * collInfo.localBBox; + if (!(*check_sphere & riBBox)) + continue; + } + if (check_itm) + { + BBox3 checkBox(Point3(-0.5f, 0.f, -0.5f), Point3(0.5f, 1.f, 0.5f)); + Point3 p = *check_itm * riPos; + p.y = 0.5f; + if (!(checkBox & p)) + continue; + } + if (calc_expl_dmg_cb) { TIME_PROFILE(rendinstdestr__doRIExtraDamageInBox_calc_expl_dmg); @@ -1631,17 +1700,6 @@ void rendinstdestr::doRIExtraDamageInBox(const BBox3 &box, rendinst::ri_damage_e continue; } bool local_create_destr = false; - Point3 riPos = collInfo.tm.getcol(3); - if (check_sphere && !(*check_sphere & riPos)) - continue; - if (check_itm) - { - BBox3 checkBox(Point3(-0.5f, 0.f, -0.5f), Point3(0.5f, 1.f, 0.5f)); - Point3 p = *check_itm * riPos; - p.y = 0.5f; - if (!(checkBox & p)) - continue; - } if (create_destr && (!tooManyDestructables || ((view_pos - riPos).lengthSq() < destructables::minDestrRadiusSq))) local_create_destr = true; if (collInfo.isDestr) diff --git a/prog/gameLibs/gamePhys/phys/rendinstDestrRefl/stub/rdrstub.cpp 
b/prog/gameLibs/gamePhys/phys/rendinstDestrRefl/stub/rdrstub.cpp index e9690349d..610521816 100644 --- a/prog/gameLibs/gamePhys/phys/rendinstDestrRefl/stub/rdrstub.cpp +++ b/prog/gameLibs/gamePhys/phys/rendinstDestrRefl/stub/rdrstub.cpp @@ -8,4 +8,4 @@ void destroy_refl_object() {} void enable_reflection_refl_object(bool) {} mpi::IObject *get_refl_object() { return nullptr; } -} // namespace rendinstdestr +} // namespace rendinstdestr \ No newline at end of file diff --git a/prog/gameLibs/gamePhys/phys/walker/humanPhys.cpp b/prog/gameLibs/gamePhys/phys/walker/humanPhys.cpp index b38f00c56..c333cb98f 100644 --- a/prog/gameLibs/gamePhys/phys/walker/humanPhys.cpp +++ b/prog/gameLibs/gamePhys/phys/walker/humanPhys.cpp @@ -272,6 +272,7 @@ void HumanPhys::loadFromBlk(const DataBlock *blk, const CollisionResource * /*co maxWalkSpeedLimitRestoreSpeed = blk->getReal("maxWalkSpeedLimitRestoreSpeed", maxWalkSpeedLimitRestoreSpeed); maxObstacleHeight = blk->getReal("maxObstacleHeight", maxObstacleHeight) * scale; + maxStepOverHeight = blk->getReal("maxStepOverHeight", maxStepOverHeight) * scale; maxCrawlObstacleHeight = blk->getReal("maxCrawlObstacleHeight", maxObstacleHeight * invScale) * scale; maxObstacleDownReach = blk->getReal("maxObstacleDownReach", maxObstacleDownReach); @@ -1609,6 +1610,10 @@ HumanPhys::TorsoCollisionResults HumanPhys::processTorsoCollision(TMatrix &tm, i if (!overObstacle) contact.wnormB = normalize(Point3::x0z(contact.wnormB)); // when climbing, reduce normal to horizontal offsets only } + bool canStepOver = + maxStepOverHeight > 0.f && dot(contact.wpos - tm.getcol(3), currentState.vertDirection) < maxStepOverHeight; + if (canStepOver) + speedCollHardness = 0.f; applyPushingImpulse(contact, torsoPosOffs, 1.f, speedCollHardness); } } diff --git a/prog/gameLibs/gpuObjects/gpuObjects.cpp b/prog/gameLibs/gpuObjects/gpuObjects.cpp index b04ec88aa..cea8a3f15 100644 --- a/prog/gameLibs/gpuObjects/gpuObjects.cpp +++ b/prog/gameLibs/gpuObjects/gpuObjects.cpp 
@@ -194,8 +194,7 @@ void ObjectManager::recreateGrid(int cell_tile, int cells_size_count, float cell if (maxObjectsCountInCell > 0) { gatheredBuffer = dag::create_sbuffer(sizeof(Point4), maxObjectsCountInCell * cellsCount * ROWS_IN_MATRIX, - SBCF_BIND_SHADER_RES | SBCF_BIND_UNORDERED | SBCF_MAYBELOST, TEXFMT_A32B32G32R32F, - String(0, "gathered_gpu_objects_%s", assetName)); + SBCF_BIND_SHADER_RES | SBCF_BIND_UNORDERED, TEXFMT_A32B32G32R32F, String(0, "gathered_gpu_objects_%s", assetName)); countersBuffer = dag::buffers::create_ua_byte_address_readback(cellsCount, String(0, "ObjectManagerCounts_%s", assetName), dag::buffers::Init::Zero); bboxesBuffer = dag::buffers::create_ua_structured_readback(sizeof(int32_t), BBOX3F_SIZE_IN_INT * cellsCount, @@ -229,7 +228,7 @@ void ObjectManager::makeMatricesOffsetsBuffer() { matricesOffsetsBuffer.close(); uint32_t size = maxObjectsCountInCell > 0 ? cellsCount : 0; - matricesOffsetsBuffer = dag::create_sbuffer(sizeof(uint32_t) * 2, size + 1, SBCF_BIND_SHADER_RES | SBCF_MAYBELOST, TEXFMT_R32G32UI, + matricesOffsetsBuffer = dag::create_sbuffer(sizeof(uint32_t) * 2, size + 1, SBCF_BIND_SHADER_RES, TEXFMT_R32G32UI, String(0, "ObjectManager_MatricesOffsets_%s", assetName)); } @@ -646,8 +645,8 @@ void GpuObjects::setGpuInstancingRelemParams(int cascade_no) String bufferName; bufferName.printf(0, "generateIndirectParamsBuffer%d", cascade_no); - cascades[cascade_no].generateIndirectParamsBuffer = dag::create_sbuffer(sizeof(vec4f), GPUOBJDATA_SIZE * MAX_LODS * objCount, - SBCF_BIND_SHADER_RES | SBCF_MAYBELOST, TEXFMT_A32B32G32R32F, bufferName); + cascades[cascade_no].generateIndirectParamsBuffer = + dag::create_sbuffer(sizeof(vec4f), GPUOBJDATA_SIZE * MAX_LODS * objCount, SBCF_BIND_SHADER_RES, TEXFMT_A32B32G32R32F, bufferName); for (int lod = 0; lod < MAX_LODS; lod++) { @@ -889,8 +888,8 @@ void GpuObjects::beforeDraw(rendinst::RenderPass render_pass, int cascade, const else logdbg("buffer for gpu objects created, %dMB for %d objects, %d 
instances placed on RI", (rowsInBuffer * sizeof(Point4)) >> 20, maxInstancesCount, maxInstancesCountOnRi); - cascades[cascade].matricesBuffer = dag::create_sbuffer(sizeof(Point4), rowsInBuffer, - SBCF_MAYBELOST | SBCF_BIND_SHADER_RES | SBCF_BIND_UNORDERED, TEXFMT_A32B32G32R32F, "GPUobjects"); + cascades[cascade].matricesBuffer = dag::create_sbuffer(sizeof(Point4), rowsInBuffer, SBCF_BIND_SHADER_RES | SBCF_BIND_UNORDERED, + TEXFMT_A32B32G32R32F, "GPUobjects"); } } diff --git a/prog/gameLibs/gpuObjects/volumePlacerES.cpp.inl b/prog/gameLibs/gpuObjects/volumePlacerES.cpp.inl index a25c7643a..58f620ad9 100644 --- a/prog/gameLibs/gpuObjects/volumePlacerES.cpp.inl +++ b/prog/gameLibs/gpuObjects/volumePlacerES.cpp.inl @@ -529,8 +529,7 @@ bool VolumePlacer::gatherGeometryInBox(const TMatrix &transform, float min_trian { geometryMeshesBuffer.close(); geometryMeshesBuffer = dag::create_sbuffer(sizeof(GeometryMesh), max(geometryMeshes.size(), 64), - SBCF_BIND_SHADER_RES | SBCF_DYNAMIC | SBCF_MAYBELOST | SBCF_CPU_ACCESS_WRITE | SBCF_MISC_STRUCTURED, 0, - "gpu_objects_geometry_meshes"); + SBCF_BIND_SHADER_RES | SBCF_DYNAMIC | SBCF_CPU_ACCESS_WRITE | SBCF_MISC_STRUCTURED, 0, "gpu_objects_geometry_meshes"); G_ASSERT(geometryMeshesBuffer.getBuf()); geometryMeshesBufferSize = geometryMeshes.size(); } diff --git a/prog/gameLibs/heightmap/heightmapRenderer.cpp b/prog/gameLibs/heightmap/heightmapRenderer.cpp index ffef87d24..081cef0dd 100644 --- a/prog/gameLibs/heightmap/heightmapRenderer.cpp +++ b/prog/gameLibs/heightmap/heightmapRenderer.cpp @@ -86,6 +86,7 @@ void LodGridVertexData::close() return; del_d3dres(ib); del_d3dres(quadsIb); + patchDim = 0; } bool LodGridVertexData::init(int dim) @@ -96,12 +97,20 @@ bool LodGridVertexData::init(int dim) return true; } patchDim = dim; + + return createBuffers(); +} + +bool LodGridVertexData::createBuffers() +{ + recreateBuffers = false; verticesCnt = (patchDim + 1) * (patchDim + 1); const int indexSize = 2; indicesCnt = patchDim * patchDim 
* 6; int totalIndicesCnt = indicesCnt * 2; - ib = d3d::create_ib(totalIndicesCnt * indexSize, indexSize == 4 ? SBCF_INDEX32 : 0); + G_ASSERT(!ib); + ib = d3d::create_ib(totalIndicesCnt * indexSize, (indexSize == 4 ? SBCF_INDEX32 : 0), "lod_grid_vdata_ib"); d3d_err(ib); if (!ib) return false; @@ -116,7 +125,8 @@ bool LodGridVertexData::init(int dim) if (hmap_tess_factorVarId != -1) { quadsIndicesCnt = indicesCnt / 6 * 4; - quadsIb = d3d::create_ib(quadsIndicesCnt * indexSize, indexSize == 4 ? SBCF_INDEX32 : 0); + G_ASSERT(!quadsIb); + quadsIb = d3d::create_ib(quadsIndicesCnt * indexSize, (indexSize == 4 ? SBCF_INDEX32 : 0), "lod_grid_vdata_quadsIb"); d3d_err(quadsIb); if (!quadsIb) return false; @@ -131,6 +141,21 @@ bool LodGridVertexData::init(int dim) return true; } +void LodGridVertexData::beforeResetDevice() +{ + if (patchDim <= 0) + return; + + recreateBuffers = true; + del_d3dres(ib); + del_d3dres(quadsIb); +} + +void LodGridVertexData::afterResetDevice() +{ + if (recreateBuffers) + createBuffers(); +} static int heightmap_scale_offset_varId = -1; static int heightmap_scale_offset_c = 0; @@ -151,6 +176,18 @@ void HeightmapRenderer::close() vdata[dimBits - VDATA_OFS].close(); } +void HeightmapRenderer::beforeResetDevice() +{ + for (int i = 0; i < MAX_VDATA; ++i) + vdata[i].beforeResetDevice(); +} + +void HeightmapRenderer::afterResetDevice() +{ + for (int i = 0; i < MAX_VDATA; ++i) + vdata[i].afterResetDevice(); +} + bool HeightmapRenderer::init(const char *shader_name, const char *mat_script, bool do_fatal, int bits) { dimBits = clamp(bits - VDATA_OFS, 0, MAX_VDATA - 1) + VDATA_OFS; diff --git a/prog/gameLibs/landMesh/lmeshManager.cpp b/prog/gameLibs/landMesh/lmeshManager.cpp index 44273a714..d54d6fee8 100644 --- a/prog/gameLibs/landMesh/lmeshManager.cpp +++ b/prog/gameLibs/landMesh/lmeshManager.cpp @@ -1112,7 +1112,7 @@ void LandClassDetailTextures::resetGrassMask(const DataBlock &grassBlk, const ch if (grassMaskTexId == BAD_TEXTUREID) logerr("grass mask 
absent color_name = %@ info_grass_mask_name = %@ grassMaskName = %@", color_name, info_grass_mask_name, grassMaskName); - G_ASSERT(grassMaskTexId != BAD_TEXTUREID); + G_ASSERTF(grassMaskTexId != BAD_TEXTUREID, "grass_mask_black* mandatory asset not found"); ::acquire_managed_tex(grassMaskTexId); } diff --git a/prog/gameLibs/landMesh/lmeshRenderer.cpp b/prog/gameLibs/landMesh/lmeshRenderer.cpp index bb0ac11b4..523fdab1c 100644 --- a/prog/gameLibs/landMesh/lmeshRenderer.cpp +++ b/prog/gameLibs/landMesh/lmeshRenderer.cpp @@ -144,7 +144,7 @@ static void init_one_quad() { del_d3dres(one_quad); - one_quad = d3d::create_vb(4 * 4 * 2, SBCF_MAYBELOST, "lm-1quad"); + one_quad = d3d::create_vb(4 * 4 * 2, 0, "lm-1quad"); d3d_err(one_quad); short *vert; d3d_err(one_quad->lock(0, 0, (void **)&vert, VBLOCK_WRITEONLY)); diff --git a/prog/gameLibs/landMesh/virtualtexture.cpp b/prog/gameLibs/landMesh/virtualtexture.cpp index a89e49162..3e6e0c006 100644 --- a/prog/gameLibs/landMesh/virtualtexture.cpp +++ b/prog/gameLibs/landMesh/virtualtexture.cpp @@ -1456,8 +1456,6 @@ void ClipmapImpl::processTileFeedback(TileInfoArr &result_arr, int &result_size, for (const auto &pair : aroundCameraTileIndexMap) result_arr[result_size++] = pair.second; - - sort_tile_info_list(result_arr, result_size); } @@ -1598,21 +1596,25 @@ void ClipmapImpl::updateMip(HWFeedbackMode hw_feedback_mode, bool force_update) { sort_tile_info_list(tileInfo, tileInfoSize); - int firstMismatchId; - if (hw_feedback_mode == HWFeedbackMode::DEBUG_COMPARE && - getHWFeedbackMode(currentContext->captureTarget % MAX_FRAMES_AHEAD) == HWFeedbackMode::DEBUG_COMPARE && - !compare_tile_info_results(tileInfo, tileInfoSize, debugTileInfo, debugTileInfoSize, firstMismatchId)) + if (hw_feedback_mode == HWFeedbackMode::DEBUG_COMPARE) { - if (firstMismatchId >= 0) - { - const TexTileInfo &a = tileInfo[firstMismatchId]; // tile feedback - const TexTileInfo &b = debugTileInfo[firstMismatchId]; // texture feedback - logerr("clipmap tile 
feedback mismatch (%d ; %d), {%d,%d, %d,%d,%d, %d} != {%d,%d, %d,%d,%d, %d}", tileInfoSize, - debugTileInfoSize, a.x, a.y, a.ri_index, a.mip, a.count, a.sortOrder, b.x, b.y, b.ri_index, b.mip, b.count, b.sortOrder); - } - else + sort_tile_info_list(debugTileInfo, debugTileInfoSize); + + int firstMismatchId; + if (getHWFeedbackMode(currentContext->captureTarget % MAX_FRAMES_AHEAD) == HWFeedbackMode::DEBUG_COMPARE && + !compare_tile_info_results(tileInfo, tileInfoSize, debugTileInfo, debugTileInfoSize, firstMismatchId)) { - logerr("clipmap tile feedback mismatch (%d ; %d)", tileInfoSize, debugTileInfoSize); + if (firstMismatchId >= 0) + { + const TexTileInfo &a = tileInfo[firstMismatchId]; // tile feedback + const TexTileInfo &b = debugTileInfo[firstMismatchId]; // texture feedback + logerr("clipmap tile feedback mismatch (%d ; %d), {%d,%d, %d,%d,%d, %d} != {%d,%d, %d,%d,%d, %d}", tileInfoSize, + debugTileInfoSize, a.x, a.y, a.ri_index, a.mip, a.count, a.sortOrder, b.x, b.y, b.ri_index, b.mip, b.count, b.sortOrder); + } + else + { + logerr("clipmap tile feedback mismatch (%d ; %d)", tileInfoSize, debugTileInfoSize); + } } } } diff --git a/prog/gameLibs/metaballsSystem/metaballs.cpp b/prog/gameLibs/metaballsSystem/metaballs.cpp index af49d4659..a505e3cd5 100644 --- a/prog/gameLibs/metaballsSystem/metaballs.cpp +++ b/prog/gameLibs/metaballsSystem/metaballs.cpp @@ -54,8 +54,8 @@ Metaballs::Metaballs() initShader(); const int MAX_VERTICES_COUNT = MAX_GRID_SIZE * MAX_GRID_SIZE * MAX_GRID_SIZE * 6 * 2; - vb.reset(d3d::create_vb(MAX_VERTICES_COUNT * sizeof(Point3), SBCF_MAYBELOST | SBCF_DYNAMIC, "metaball")); - ib.reset(d3d::create_ib(MAX_VERTICES_COUNT, SBCF_MAYBELOST | SBCF_DYNAMIC)); + vb.reset(d3d::create_vb(MAX_VERTICES_COUNT * sizeof(Point3), SBCF_DYNAMIC, "metaball")); + ib.reset(d3d::create_ib(MAX_VERTICES_COUNT, SBCF_DYNAMIC)); updateJobs.resize(2); #define VAR(a) a##VarId = get_shader_variable_id(#a, true); diff --git a/prog/gameLibs/movie/fullScreenMovie.cpp 
b/prog/gameLibs/movie/fullScreenMovie.cpp index 27ff8c1a1..d78c985eb 100644 --- a/prog/gameLibs/movie/fullScreenMovie.cpp +++ b/prog/gameLibs/movie/fullScreenMovie.cpp @@ -211,7 +211,7 @@ static void play_movie(const char *fname, const char *audio_fname, const char *s #if _TARGET_PC | _TARGET_C1 | _TARGET_C2 | _TARGET_XBOX for (int i = 0; i < 3; ++i) { - d3d::clearview(CLEAR_ZBUFFER | CLEAR_STENCIL | CLEAR_TARGET, E3DCOLOR(0, 0, 0, 0), 1, 0); + d3d::clearview(CLEAR_TARGET, E3DCOLOR(0, 0, 0, 0), 1, 0); d3d::update_screen(); dagor_idle_cycle(); } @@ -530,7 +530,7 @@ static void play_movie_gui(const char *fname, const char *audio_fname, const cha #if _TARGET_PC | _TARGET_C1 | _TARGET_C2 | _TARGET_XBOX for (int i = 0; i < 3; ++i) { - d3d::clearview(CLEAR_ZBUFFER | CLEAR_STENCIL | CLEAR_TARGET, E3DCOLOR(0, 0, 0, 0), 1, 0); + d3d::clearview(CLEAR_TARGET, E3DCOLOR(0, 0, 0, 0), 1, 0); d3d::update_screen(); dagor_idle_cycle(); } diff --git a/prog/gameLibs/nodeBasedShaderManager/platformLabels.h b/prog/gameLibs/nodeBasedShaderManager/platformLabels.h index 7a4c8c807..893214dd1 100644 --- a/prog/gameLibs/nodeBasedShaderManager/platformLabels.h +++ b/prog/gameLibs/nodeBasedShaderManager/platformLabels.h @@ -10,5 +10,5 @@ static const eastl::array #include #include -#include #include +#include #include #include namespace httprequests { typedef intptr_t RequestId; -typedef eastl::map StringMap; +typedef dag::VectorMap StringMap; enum class RequestStatus { @@ -31,9 +31,9 @@ class IAsyncHTTPCallback public: virtual ~IAsyncHTTPCallback() {} - virtual void onRequestDone(RequestStatus status, int http_code, dag::ConstSpan response, StringMap const &headers) = 0; + virtual void onRequestDone(RequestStatus status, int http_code, dag::ConstSpan response, StringMap const &resp_headers) = 0; virtual void onHttpProgress(size_t /*dltotal*/, size_t /*dlnow*/) {} - virtual void onHttpHeaderResponse(StringMap const & /*headers*/) {} + virtual void onHttpHeadersResponse(StringMap const & 
/*resp_headers*/) {} // return true if peace of data is processed by callback and should't be stored virtual bool onResponseData(dag::ConstSpan) { return false; } @@ -43,16 +43,16 @@ class IAsyncHTTPCallback template IAsyncHTTPCallback *make_http_callback(F on_response) { - class Callback : public IAsyncHTTPCallback + class Callback final : public IAsyncHTTPCallback { public: Callback(F on_response) : cb(eastl::move(on_response)) {} - void onRequestDone(RequestStatus status, int http_code, dag::ConstSpan response, StringMap const &headers) + void onRequestDone(RequestStatus status, int http_code, dag::ConstSpan response, StringMap const &resp_headers) override { - cb(status, http_code, response, headers); + cb(status, http_code, response, resp_headers); } - void release() { delete this; } + void release() override { delete this; } private: F cb; @@ -63,19 +63,19 @@ IAsyncHTTPCallback *make_http_callback(F on_response) template IAsyncHTTPCallback *make_http_callback(F1 on_response, F2 on_stream_data) { - class Callback : public IAsyncHTTPCallback + class Callback final : public IAsyncHTTPCallback { public: Callback(F1 on_response, F2 on_stream_data) : resp_cb(eastl::move(on_response)), stream_cb(eastl::move(on_stream_data)) {} - void onRequestDone(RequestStatus status, int http_code, dag::ConstSpan response, StringMap const &headers) + void onRequestDone(RequestStatus status, int http_code, dag::ConstSpan response, StringMap const &resp_headers) override { - resp_cb(status, http_code, response, headers); + resp_cb(status, http_code, response, resp_headers); } - bool onResponseData(dag::ConstSpan data) { return stream_cb(data); } + bool onResponseData(dag::ConstSpan data) override { return stream_cb(data); } - void release() { delete this; } + void release() override { delete this; } private: F1 resp_cb; @@ -87,28 +87,28 @@ IAsyncHTTPCallback *make_http_callback(F1 on_response, F2 on_stream_data) template IAsyncHTTPCallback *make_http_callback(F1 on_response, F2 
on_stream_data, F3 on_http_header) { - class Callback : public IAsyncHTTPCallback + class Callback final : public IAsyncHTTPCallback { public: Callback(F1 on_response, F2 on_stream_data, F3 on_http_header) : - resp_cb(eastl::move(on_response)), stream_cb(eastl::move(on_stream_data)), header_cb(eastl::move(on_http_header)) + resp_cb(eastl::move(on_response)), stream_cb(eastl::move(on_stream_data)), resp_headers_cb(eastl::move(on_http_header)) {} - void onRequestDone(RequestStatus status, int http_code, dag::ConstSpan response, StringMap const &headers) + void onRequestDone(RequestStatus status, int http_code, dag::ConstSpan response, StringMap const &resp_headers) override { - resp_cb(status, http_code, response, headers); + resp_cb(status, http_code, response, resp_headers); } - bool onResponseData(dag::ConstSpan data) { return stream_cb(data); } + bool onResponseData(dag::ConstSpan data) override { return stream_cb(data); } - void onHttpHeaderResponse(StringMap const &headers) { header_cb(headers); } + void onHttpHeadersResponse(StringMap const &resp_headers) override { resp_headers_cb(resp_headers); } - void release() { delete this; } + void release() override { delete this; } private: F1 resp_cb; F2 stream_cb; - F3 header_cb; + F3 resp_headers_cb; }; return new Callback(eastl::move(on_response), eastl::move(on_stream_data), eastl::move(on_http_header)); } @@ -116,33 +116,33 @@ IAsyncHTTPCallback *make_http_callback(F1 on_response, F2 on_stream_data, F3 on_ template IAsyncHTTPCallback *make_http_callback(F1 on_response, F2 on_stream_data, F3 on_http_header, F4 on_http_progress) { - class Callback : public IAsyncHTTPCallback + class Callback final : public IAsyncHTTPCallback { public: Callback(F1 on_response, F2 on_stream_data, F3 on_http_header, F4 on_http_progress) : resp_cb(eastl::move(on_response)), stream_cb(eastl::move(on_stream_data)), - header_cb(eastl::move(on_http_header)), + resp_headers_cb(eastl::move(on_http_header)), 
progress_cb(eastl::move(on_http_progress)) {} - void onRequestDone(RequestStatus status, int http_code, dag::ConstSpan response, StringMap const &headers) + void onRequestDone(RequestStatus status, int http_code, dag::ConstSpan response, StringMap const &resp_headers) override { - resp_cb(status, http_code, response, headers); + resp_cb(status, http_code, response, resp_headers); } - bool onResponseData(dag::ConstSpan data) { return stream_cb(data); } + bool onResponseData(dag::ConstSpan data) override { return stream_cb(data); } - void onHttpHeaderResponse(StringMap const &headers) { header_cb(headers); } + void onHttpHeadersResponse(StringMap const &resp_headers) override { resp_headers_cb(resp_headers); } - void onHttpProgress(size_t dltotal, size_t dlnow) { progress_cb(dltotal, dlnow); } + void onHttpProgress(size_t dltotal, size_t dlnow) override { progress_cb(dltotal, dlnow); } - void release() { delete this; } + void release() override { delete this; } private: F1 resp_cb; F2 stream_cb; - F3 header_cb; + F3 resp_headers_cb; F4 progress_cb; }; return new Callback(eastl::move(on_response), eastl::move(on_stream_data), eastl::move(on_http_header), @@ -170,7 +170,7 @@ struct AsyncRequestParams bool verifyCert = true; bool verifyHost = true; bool allowHttpContentEncoding = true; - bool needHeaders = false; + bool needResponseHeaders = true; // Set to false if you don't need response resp_headers const char *clientCertificateFile = nullptr; const char *clientPrivateKeyFile = nullptr; diff --git a/prog/gameLibs/publicInclude/daFx/dafx_hlsl_funcs.hlsli b/prog/gameLibs/publicInclude/daFx/dafx_hlsl_funcs.hlsli index 526173793..c23bc03eb 100644 --- a/prog/gameLibs/publicInclude/daFx/dafx_hlsl_funcs.hlsli +++ b/prog/gameLibs/publicInclude/daFx/dafx_hlsl_funcs.hlsli @@ -43,6 +43,9 @@ DAFX_INLINE float4 float4x4_row( const float4x4 &a, int r) { return a.getrow(r); } DAFX_INLINE float pow2( float v ) { return v * v; } + DAFX_INLINE float2 pow2( float2 v ) { return v * v; } 
+ DAFX_INLINE float3 pow2( float3 v ) { return v * v; } + DAFX_INLINE float4 pow2( float4 v ) { return v * v; } DAFX_INLINE float pow4( float v ) { v *= v; return v * v; } DAFX_INLINE int2 int_xy( const int3 & a) { return int2( a.x, a.y ); } diff --git a/prog/gameLibs/publicInclude/daSkies2/daSkies.h b/prog/gameLibs/publicInclude/daSkies2/daSkies.h index 5b230b859..725c7f30a 100644 --- a/prog/gameLibs/publicInclude/daSkies2/daSkies.h +++ b/prog/gameLibs/publicInclude/daSkies2/daSkies.h @@ -274,6 +274,7 @@ class DaSkies // hole_target_pos - that wont be in shadow, density - amount of shadow void resetCloudsHole(const Point3 &hole_target_pos, const float &hole_density = 0); void resetCloudsHole(); // but basically removes clouds hole. call updatePanorama right after, if you have one + void setUseCloudsHole(bool set); // finding hole can be disabled to avoid sky jumping around while changing params Point2 getCloudsHolePosition() const; // that's for debug only! void setCloudsHolePosition(const Point2 &); // that's for debug only! 
diff --git a/prog/gameLibs/publicInclude/dasModules/aotBallistics.h b/prog/gameLibs/publicInclude/dasModules/aotBallistics.h index b8b395316..6c9e4294b 100644 --- a/prog/gameLibs/publicInclude/dasModules/aotBallistics.h +++ b/prog/gameLibs/publicInclude/dasModules/aotBallistics.h @@ -17,3 +17,12 @@ MAKE_TYPE_FACTORY(ProjectileBallisticsState, ballistics::ProjectileBallistics::S MAKE_TYPE_FACTORY(ProjectileBallistics, ballistics::ProjectileBallistics); MAKE_TYPE_FACTORY(ShellEnv, ballistics::ShellEnv); MAKE_TYPE_FACTORY(ShellState, ballistics::ShellState); + +namespace bind_dascript +{ +inline void ballistics_simulate(const ballistics::ShellEnv &env, const ballistics::ShellProps &prop, ballistics::ShellState &state, + float dt, float current_time) +{ + ballistics::simulate(env, prop, state, dt, current_time); +} +} // namespace bind_dascript \ No newline at end of file diff --git a/prog/gameLibs/publicInclude/dasModules/aotDm.h b/prog/gameLibs/publicInclude/dasModules/aotDm.h index e787ae0ef..b166ff4e8 100644 --- a/prog/gameLibs/publicInclude/dasModules/aotDm.h +++ b/prog/gameLibs/publicInclude/dasModules/aotDm.h @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -23,8 +22,9 @@ #include #include #include -#include #include +#include +#include typedef dag::Vector DamageModelDataPartProps; typedef dag::Vector DamageModelDataParts; @@ -56,6 +56,7 @@ MAKE_TYPE_FACTORY(PenetrationTableProps, dm::kinetic::PenetrationTableProps); MAKE_TYPE_FACTORY(DamageTableProps, dm::kinetic::DamageTableProps); MAKE_TYPE_FACTORY(EffectsProbabilityMultiplierProps, dm::kinetic::EffectsProbabilityMultiplierProps); MAKE_TYPE_FACTORY(DamageEffectActionCluster, dm::effect::ActionCluster); +MAKE_TYPE_FACTORY(SplashProps, dm::splash::Properties); DAS_BIND_VECTOR_SET(MetaPartPartIds, MetaPartPartIds, dm::PartId, " ::MetaPartPartIds") DAS_BIND_VECTOR(MetaPartPropsVector, dm::MetaPartPropsVector, dm::MetaPartProp, " ::dm::MetaPartPropsVector") @@ -164,9 +165,14 @@ 
inline bool is_part_inner(const dm::DamageModelData &dm_data, int part_id) return props && props->testFlag(dm::DamagePartProps::Flag::INNER); } -inline dm::splash::Params calc_splash_params(int damage_props_id, bool underwater) +inline dm::splash::Params calc_splash_params(int damage_props_id, const dm::splash::Properties &splash_properties, bool underwater) { - return dm::splash::calc_params(damage_props_id, underwater ? dm::PhysEnvironment::WATER : dm::PhysEnvironment::AIR); + const dm::ExplosiveProps *explosiveProps = dm::ExplosiveProps::get_props(damage_props_id); + dm::splash::Params params; + dm::splash::calc_params(&splash_properties, explosiveProps, dm::splash::FallBySquare::get_value(damage_props_id), + dm::splash::DamageTypeProp::get_value(damage_props_id), underwater ? dm::PhysEnvironment::WATER : dm::PhysEnvironment::AIR, + params); + return params; } inline dm::synthetic_shatter::Params calc_synthetic_shatter_params(int damage_props_id, float shell_mass, bool underwater) diff --git a/prog/gameLibs/publicInclude/dasModules/aotSoundSystem.h b/prog/gameLibs/publicInclude/dasModules/aotSoundSystem.h index 6ccb789b2..654e6d459 100644 --- a/prog/gameLibs/publicInclude/dasModules/aotSoundSystem.h +++ b/prog/gameLibs/publicInclude/dasModules/aotSoundSystem.h @@ -16,10 +16,9 @@ namespace soundsystem_bind_dascript { inline bool have_sound() { return sndsys::is_inited(); } inline void sound_debug(const char *message) { sndsys::debug_trace_info("%s", message); } -inline bool get_enable_debug_draw() { return sndsys::get_enable_debug_draw(); } inline Point3 get_listener_pos() { return sndsys::get_3d_listener_pos(); } -inline void update_listener(float delta_time, const TMatrix &listener_tm) { sndsys::update_listener(delta_time, listener_tm); } -inline void reset_3d_listener() { sndsys::reset_3d_listener(); } +inline void sound_update_listener(float delta_time, const TMatrix &listener_tm) { sndsys::update_listener(delta_time, listener_tm); } +inline void 
sound_reset_3d_listener() { sndsys::reset_3d_listener(); } inline bool sound_banks_is_preset_loaded(const char *preset_name) { return sndsys::banks::is_loaded(preset_name); } inline void sound_enable_distant_delay(bool enable) { sndsys::delayed::enable_distant_delay(enable); } @@ -36,4 +35,6 @@ inline void sound_banks_enable_preset_starting_with(const char *name, bool enabl inline bool sound_banks_is_preset_enabled(const char *name) { return sndsys::banks::is_enabled(name); } inline void sound_debug_enum_events() { sndsys::debug_enum_events(); } + +inline void sound_update(float dt) { sndsys::update(dt); } } // namespace soundsystem_bind_dascript diff --git a/prog/gameLibs/publicInclude/dasModules/dasFsFileAccess.h b/prog/gameLibs/publicInclude/dasModules/dasFsFileAccess.h index 48596eeb8..449e98a7d 100644 --- a/prog/gameLibs/publicInclude/dasModules/dasFsFileAccess.h +++ b/prog/gameLibs/publicInclude/dasModules/dasFsFileAccess.h @@ -9,6 +9,7 @@ #include "daScript/simulate/fs_file_info.h" #include #include +#include #include #define DASLIB_MODULE_NAME "daslib" @@ -130,6 +131,20 @@ class DagFileAccess final : public das::ModuleFileAccess if (localAccess) localAccess->delRef(); } + int64_t getFileMtime(const das::string &fileName) const override + { + DagorStat stat; + auto res = df_stat(fileName.c_str(), &stat) >= 0 ? 
stat.mtime : -1; + if (res >= 0) + return res; +#if _TARGET_ANDROID + VirtualRomFsData *vrom = NULL; + VromReadHandle data = vromfs_get_file_data(fileName.c_str(), &vrom); + if (vrom && vrom->version != 0) + return vrom->version; +#endif + return 0; + } bool invalidateFileInfo(const das::string &fileName) override { bool res = false; @@ -155,11 +170,7 @@ class DagFileAccess final : public das::ModuleFileAccess } if (EASTL_UNLIKELY(!found)) { - DagorStat stat; - int64_t mtime = 0; - if (df_stat(fname.c_str(), &stat) != -1) - mtime = stat.mtime; - filesOpened.emplace(fname, mtime); + filesOpened.emplace(fname, getFileMtime(fname)); } } return res; @@ -172,11 +183,7 @@ class DagFileAccess final : public das::ModuleFileAccess } if (storeOpenedFiles) { - DagorStat stat; - int64_t mtime = 0; - if (df_fstat(f, &stat) != -1) - mtime = stat.mtime; - filesOpened.emplace(fname, mtime); + filesOpened.emplace(fname, getFileMtime(fname)); } df_close(f); return setFileInfo(fname, das::make_unique()); diff --git a/prog/gameLibs/publicInclude/dataBlockUtils/blkUtils.h b/prog/gameLibs/publicInclude/dataBlockUtils/blkUtils.h index 5fb9eb4ac..48f7ec625 100644 --- a/prog/gameLibs/publicInclude/dataBlockUtils/blkUtils.h +++ b/prog/gameLibs/publicInclude/dataBlockUtils/blkUtils.h @@ -9,8 +9,7 @@ class DataBlock; -bool check_param_exist(const DataBlock *blk, std::initializer_list param_names); -bool check_all_params_exist(const DataBlock *blk, const char *prop_name, std::initializer_list param_names); -bool check_all_params_exist_in_subblocks(const DataBlock *blk, const char *subblock_name, const char *prop_name, +bool check_param_exist(const DataBlock &blk, std::initializer_list param_names); +bool check_all_params_exist(const DataBlock &blk, const char *prop_name, std::initializer_list param_names); +bool check_all_params_exist_in_subblocks(const DataBlock &blk, const char *subblock_name, const char *prop_name, std::initializer_list param_names); -void interpolate_datablock(const 
DataBlock &from, DataBlock &to, float t); diff --git a/prog/gameLibs/publicInclude/dataBlockUtils/interpolateBlk.h b/prog/gameLibs/publicInclude/dataBlockUtils/interpolateBlk.h new file mode 100644 index 000000000..567fc92eb --- /dev/null +++ b/prog/gameLibs/publicInclude/dataBlockUtils/interpolateBlk.h @@ -0,0 +1,10 @@ +// +// Dagor Engine 6.5 - Game Libraries +// Copyright (C) 2023 Gaijin Games KFT. All rights reserved +// (for conditions of use see prog/license.txt) +// +#pragma once + +class DataBlock; + +void interpolate_datablock(const DataBlock &from, DataBlock &to, float t); diff --git a/prog/gameLibs/publicInclude/datacache/datacache.h b/prog/gameLibs/publicInclude/datacache/datacache.h index 8c631e053..18ec21128 100644 --- a/prog/gameLibs/publicInclude/datacache/datacache.h +++ b/prog/gameLibs/publicInclude/datacache/datacache.h @@ -73,14 +73,14 @@ enum ErrorCode class Entry; typedef void (*completion_cb_t)(const char *key, ErrorCode error, Entry *entry, void *arg); -typedef void (*header_cb_t)(const char *key, streamio::StringMap const &headers, void *arg); +typedef void (*resp_headers_cb_t)(const char *key, streamio::StringMap const &headers, void *arg); class Backend { public: virtual ~Backend() {} // If ERR_PENDING is returned in 'error' param then callback will be called upon completion - virtual Entry *getWithHeaders(const char *, ErrorCode * = NULL, completion_cb_t = NULL, header_cb_t = NULL, void * = NULL) + virtual Entry *getWithRespHeaders(const char *, ErrorCode * = NULL, completion_cb_t = NULL, resp_headers_cb_t = NULL, void * = NULL) { return nullptr; }; diff --git a/prog/gameLibs/publicInclude/decalMatrices/decal_matrices_const.hlsli b/prog/gameLibs/publicInclude/decalMatrices/decal_matrices_const.hlsli index b8eae8fec..e53c2dbc4 100644 --- a/prog/gameLibs/publicInclude/decalMatrices/decal_matrices_const.hlsli +++ b/prog/gameLibs/publicInclude/decalMatrices/decal_matrices_const.hlsli @@ -4,7 +4,6 @@ #define SPRAY_DECALS_MAX_MATRICES_COUNT 
300 #define BILLBOARD_DECALS_MAX_MATRICES_COUNT 300 #define BLOOD_PUDDLES_MAX_MATRICES_COUNT 300 -#define EXPLOSION_PATCHES_MAX_MATRICES_COUNT 64 #define HEAT_SOURCE_DECALS_MAX_MATRICES_COUNT 1024 #endif \ No newline at end of file diff --git a/prog/gameLibs/publicInclude/ecs/camera/getActiveCameraSetup.h b/prog/gameLibs/publicInclude/ecs/camera/getActiveCameraSetup.h index 5709010a7..65f0e0bc5 100644 --- a/prog/gameLibs/publicInclude/ecs/camera/getActiveCameraSetup.h +++ b/prog/gameLibs/publicInclude/ecs/camera/getActiveCameraSetup.h @@ -44,6 +44,10 @@ TMatrix4 calc_active_camera_globtm(); //! Calculates camera transforms and such void calc_camera_values(const CameraSetup &camera_setup, TMatrix &view_tm, Driver3dPerspective &persp, int &view_w, int &view_h); +//! Detailed calculation functions +TMatrix calc_camera_view_tm(const TMatrix &view_itm); +Driver3dPerspective calc_camera_perspective(const CameraSetup &camera_setup, int view_w, int view_h); //! sets camera parameters to D3D void apply_camera_setup(const CameraSetup &camera_setup); +void apply_camera_setup(const TMatrix &view_itm, const TMatrix &view_tm, const Driver3dPerspective &persp, int view_w, int view_h); diff --git a/prog/gameLibs/publicInclude/ecs/scripts/dasEs.h b/prog/gameLibs/publicInclude/ecs/scripts/dasEs.h index 4867ead50..292399d94 100644 --- a/prog/gameLibs/publicInclude/ecs/scripts/dasEs.h +++ b/prog/gameLibs/publicInclude/ecs/scripts/dasEs.h @@ -82,8 +82,12 @@ bool load_das_script(const char *fname); bool load_das_script_debugger(const char *fname); bool load_das_script_with_debugcode(const char *fname); void warn_on_persistent_heap(bool value); -bool enqueue_das_script(const char *fname); bool load_entry_script(const char *entry_point_name, TInitDas init, LoadEntryScriptCtx ctx = {}); +// internal use only +void begin_loading_queue(); +bool stop_loading_queue(TInitDas init); +void end_loading_queue(LoadEntryScriptCtx ctx); + bool main_thread_post_load(); bool unload_es_script(const char 
*fname); bool reload_all_scripts(const char *entry_point_name, TInitDas init); diff --git a/prog/gameLibs/publicInclude/fftWater/fftWater.h b/prog/gameLibs/publicInclude/fftWater/fftWater.h index ec6093712..e46342699 100644 --- a/prog/gameLibs/publicInclude/fftWater/fftWater.h +++ b/prog/gameLibs/publicInclude/fftWater/fftWater.h @@ -244,6 +244,7 @@ void build_distance_field(UniqueTexHolder &, int texture_size, int heightmap_tex RiverRendererCB *riversCB); // if reiversCB == 0, it won't be used. if detect_rivers_width<=0 it won't be used void build_flowmap(FFTWater *handle, FlowmapParams &flowmap_params, int flowmap_texture_size, int heightmap_texture_size, const Point3 &camera_pos, float range); +void set_flowmap_tex(FlowmapParams &flowmap_params); void set_flowmap_params(FlowmapParams &flowmap_params); void set_flowmap_foam_params(FlowmapParams &flowmap_params); void close_flowmap(FlowmapParams &flowmap_params); diff --git a/prog/gameLibs/publicInclude/gamePhys/phys/destructableRendObject.h b/prog/gameLibs/publicInclude/gamePhys/phys/destructableRendObject.h index 0ce20b98f..59cc11487 100644 --- a/prog/gameLibs/publicInclude/gamePhys/phys/destructableRendObject.h +++ b/prog/gameLibs/publicInclude/gamePhys/phys/destructableRendObject.h @@ -32,5 +32,6 @@ struct DestrRendDataDeleter }; void before_render(const Point3 &view_pos); -void render(dynrend::ContextId inst_ctx, const Frustum &frustum); +// Objects with a bounding box radius < min_bbox_radius will be skipped. 
+void render(dynrend::ContextId inst_ctx, const Frustum &frustum, float min_bbox_radius); } // namespace destructables diff --git a/prog/gameLibs/publicInclude/gamePhys/phys/walker/humanControlState.h b/prog/gameLibs/publicInclude/gamePhys/phys/walker/humanControlState.h index e694eab20..d9151e631 100644 --- a/prog/gameLibs/publicInclude/gamePhys/phys/walker/humanControlState.h +++ b/prog/gameLibs/publicInclude/gamePhys/phys/walker/humanControlState.h @@ -101,7 +101,7 @@ struct HumanControlState //-V730 QUICK_RELOAD_SHIFT = DODGE_SHIFT + DODGE_BITS, QUICK_RELOAD_BITS = 1, QUICK_RELOAD_MASK = (1 << QUICK_RELOAD_BITS) - 1, - EXT_BITS = LEAN_BITS + TS_BITS + DEVICE_BITS + ALT_ATTACK_BITS + DODGE_BITS, + EXT_BITS = LEAN_BITS + TS_BITS + DEVICE_BITS + ALT_ATTACK_BITS + DODGE_BITS + QUICK_RELOAD_BITS, }; G_STATIC_ASSERT((HCT_NUM + EWS_BITS + /*PS_HAS_EXT_STATE*/ 1) <= sizeof(decltype(packedState)) * CHAR_BIT); G_STATIC_ASSERT(EXT_BITS <= sizeof(decltype(extendedState)) * CHAR_BIT); diff --git a/prog/gameLibs/publicInclude/gamePhys/phys/walker/humanPhys.h b/prog/gameLibs/publicInclude/gamePhys/phys/walker/humanPhys.h index 6f7c09fdb..e66be6efa 100644 --- a/prog/gameLibs/publicInclude/gamePhys/phys/walker/humanPhys.h +++ b/prog/gameLibs/publicInclude/gamePhys/phys/walker/humanPhys.h @@ -408,6 +408,7 @@ class HumanPhys final : public PhysicsBase collisionCenterPos; carray ccdPos; float maxObstacleHeight = 0.5f; + float maxStepOverHeight = 0.0f; float maxCrawlObstacleHeight = 0.5f; float maxObstacleDownReach = 0.1f; diff --git a/prog/gameLibs/publicInclude/gpuReadbackQuery/gpuReadbackQuerySystem.h b/prog/gameLibs/publicInclude/gpuReadbackQuery/gpuReadbackQuerySystem.h index 1630d5940..c6e5250a7 100644 --- a/prog/gameLibs/publicInclude/gpuReadbackQuery/gpuReadbackQuerySystem.h +++ b/prog/gameLibs/publicInclude/gpuReadbackQuery/gpuReadbackQuerySystem.h @@ -134,7 +134,7 @@ GpuReadbackQuerySystem::GpuReadbackQuerySystem(const GpuReadbac resultRingBuffer.init(sizeof(ResultT), 
desc.maxQueriesPerFrame, 3, desc.resultBufferName, SBCF_UA_STRUCTURED_READBACK, 0, false); - const uint32_t inputBufferFlags = SBCF_MISC_STRUCTURED | SBCF_BIND_SHADER_RES | SBCF_MAYBELOST; + const uint32_t inputBufferFlags = SBCF_MISC_STRUCTURED | SBCF_BIND_SHADER_RES; inputBuffer = dag::create_sbuffer(sizeof(InputT), desc.maxQueriesPerFrame, inputBufferFlags, 0, desc.inputBufferName); inputs.resize(desc.maxQueriesPerFrame); diff --git a/prog/gameLibs/publicInclude/heightmap/heightmapCulling.h b/prog/gameLibs/publicInclude/heightmap/heightmapCulling.h index a5878f40b..a97ee7a24 100644 --- a/prog/gameLibs/publicInclude/heightmap/heightmapCulling.h +++ b/prog/gameLibs/publicInclude/heightmap/heightmapCulling.h @@ -25,8 +25,12 @@ struct LodGridVertexData int quadsCnt = 0; int verticesCnt = 0, indicesCnt = 0, quadsIndicesCnt = 0; volatile int refCnt = 0; + bool recreateBuffers = false; void close(); bool init(int dim); + bool createBuffers(); + void beforeResetDevice(); + void afterResetDevice(); }; class HeightmapHeightCulling diff --git a/prog/gameLibs/publicInclude/heightmap/heightmapRenderer.h b/prog/gameLibs/publicInclude/heightmap/heightmapRenderer.h index 1b9156c32..f7e903162 100644 --- a/prog/gameLibs/publicInclude/heightmap/heightmapRenderer.h +++ b/prog/gameLibs/publicInclude/heightmap/heightmapRenderer.h @@ -35,6 +35,9 @@ class HeightmapRenderer ~HeightmapRenderer() { close(); } void close(); + static void beforeResetDevice(); + static void afterResetDevice(); + protected: ShaderMaterial *shmat; ShaderElement *shElem; diff --git a/prog/gameLibs/publicInclude/quirrel/bindQuirrelEx/autoCleanup.h b/prog/gameLibs/publicInclude/quirrel/bindQuirrelEx/autoCleanup.h new file mode 100644 index 000000000..92658354b --- /dev/null +++ b/prog/gameLibs/publicInclude/quirrel/bindQuirrelEx/autoCleanup.h @@ -0,0 +1,33 @@ +// +// Dagor Engine 6.5 - Game Libraries +// Copyright (C) 2023 Gaijin Games KFT. 
All rights reserved +// (for conditions of use see prog/license.txt) +// +#pragma once + +#include +#include + +namespace sq +{ + +typedef void (*cleanup_unregfunc_cb_t)(HSQUIRRELVM vm); + +class CleanupUnregRec // record of auto-binding registry +{ + cleanup_unregfunc_cb_t unregfuncCb; + CleanupUnregRec *next; + friend void cleanup_unreg_native_api(HSQUIRRELVM); + +public: + CleanupUnregRec(cleanup_unregfunc_cb_t cb); +}; + +// Actually perform cleanup from VM (usually before VM destruction) +void cleanup_unreg_native_api(HSQUIRRELVM vm); + +}; // namespace sq + +#define SQ_DEF_AUTO_CLEANUP_UNREGFUNC(Func) \ + static sq::CleanupUnregRec Func##_auto_cleanup_var(&Func); \ + extern const size_t sq_autocleanup_pull_##Func = (size_t)(&Func##_auto_cleanup_var); diff --git a/prog/gameLibs/publicInclude/rendInst/rendInstGen.h b/prog/gameLibs/publicInclude/rendInst/rendInstGen.h index fc3c4ce50..93872340d 100644 --- a/prog/gameLibs/publicInclude/rendInst/rendInstGen.h +++ b/prog/gameLibs/publicInclude/rendInst/rendInstGen.h @@ -45,6 +45,8 @@ typedef void *(*ri_register_collision_cb)(const CollisionResource *collRes, cons typedef void (*ri_unregister_collision_cb)(void *&handle); typedef eastl::fixed_function<2 * sizeof(void *), void(const char *)> res_walk_cb; +extern void (*do_delayed_ri_extra_destruction)(); + extern void (*shadow_invalidate_cb)(const BBox3 &box); extern BBox3 (*get_shadows_bbox_cb)(); @@ -148,8 +150,6 @@ extern bool rendinstSecondaryLayer; // should be set only once before init extern bool tmInst12x32bit; void set_billboards_vertical(bool is_vertical); -void setImpostorDiffuseSizeMul(int value); -bool enable_impostors_compression(bool enabled); void setDistMul(float distMul, float distOfs, bool force_impostors_and_mul = false, float impostors_far_dist_additional_mul = 1.f); // 0.2353, 0.0824 will remap 0.5 .. 2.2 to 0.2 .. 
0.6 void setImpostorsDistAddMul(float impostors_dist_additional_mul); diff --git a/prog/gameLibs/publicInclude/rendInst/rendInstGenRender.h b/prog/gameLibs/publicInclude/rendInst/rendInstGenRender.h index fc173d74e..c5c42cdd4 100644 --- a/prog/gameLibs/publicInclude/rendInst/rendInstGenRender.h +++ b/prog/gameLibs/publicInclude/rendInst/rendInstGenRender.h @@ -20,7 +20,6 @@ namespace rendinst::render { inline constexpr int MAX_LOD_COUNT_WITH_ALPHA = rendinst::MAX_LOD_COUNT + 1; -inline constexpr int INSTANCING_TEXREG = 14; inline constexpr int GPU_INSTANCING_OFSBUFFER_TEXREG = 11; inline constexpr int TREECROWN_TEXREG = 16; @@ -32,6 +31,7 @@ extern bool tmInstColored; extern bool impostorPreshadowNeedUpdate; extern float riExtraMinSizeForReflection; extern float riExtraMinSizeForDraftDepth; +extern int instancingTexRegNo; void useRiDepthPrepass(bool use); void useRiCellsDepthPrepass(bool use); diff --git a/prog/gameLibs/publicInclude/render/capsulesAO.cpp.inl b/prog/gameLibs/publicInclude/render/capsulesAO.cpp.inl index 009058f8a..d76383fef 100644 --- a/prog/gameLibs/publicInclude/render/capsulesAO.cpp.inl +++ b/prog/gameLibs/publicInclude/render/capsulesAO.cpp.inl @@ -27,10 +27,9 @@ struct CapsulesAOHolder return; max_ao_units_count = clamp(max_ao_units_count, uint32_t(1), uint32_t(MAX_AO_UNITS)); capsuled_units_indirection = dag::create_sbuffer(sizeof(uint), UNITS_AO_GRID_SIZE * UNITS_AO_GRID_SIZE, - SBCF_DYNAMIC | SBCF_MAYBELOST | SBCF_CPU_ACCESS_WRITE | SBCF_BIND_SHADER_RES | SBCF_MISC_STRUCTURED, 0, - "capsuled_units_indirection"); + SBCF_DYNAMIC | SBCF_CPU_ACCESS_WRITE | SBCF_BIND_SHADER_RES | SBCF_MISC_STRUCTURED, 0, "capsuled_units_indirection"); capsuled_units_ao = dag::create_sbuffer(sizeof(CapsuledAOUnit), maxAOUnitsCount = max_ao_units_count, - SBCF_DYNAMIC | SBCF_MAYBELOST | SBCF_CPU_ACCESS_WRITE | SBCF_BIND_SHADER_RES | SBCF_MISC_STRUCTURED, 0, "capsuled_units_ao"); + SBCF_DYNAMIC | SBCF_CPU_ACCESS_WRITE | SBCF_BIND_SHADER_RES | SBCF_MISC_STRUCTURED, 
0, "capsuled_units_ao"); } ~CapsulesAOHolder() diff --git a/prog/gameLibs/publicInclude/render/cascadeShadows.h b/prog/gameLibs/publicInclude/render/cascadeShadows.h index 761fdc925..83e222106 100644 --- a/prog/gameLibs/publicInclude/render/cascadeShadows.h +++ b/prog/gameLibs/publicInclude/render/cascadeShadows.h @@ -53,6 +53,9 @@ class CascadeShadows float shadowConstDepthBias = 0.00002; float shadowDepthSlopeBias = 0.83; float zRangeToDepthBiasScale = 1e-4; + // Skip rendering to CSM any destructable whose bounding box radius is less than + // (static shadow texel size) * (this multiplier) + float destructablesMinBboxRadiusTexelMul = 0.f; }; struct ModeSettings diff --git a/prog/gameLibs/publicInclude/render/dag_cur_view.h b/prog/gameLibs/publicInclude/render/dag_cur_view.h new file mode 100644 index 000000000..da60453fc --- /dev/null +++ b/prog/gameLibs/publicInclude/render/dag_cur_view.h @@ -0,0 +1,20 @@ +// +// Dagor Engine 6.5 +// Copyright (C) 2023 Gaijin Games KFT. All rights reserved +// (for conditions of use see prog/license.txt) +// +#pragma once + +#include + +#include + +struct DagorCurView +{ + TMatrix tm, itm; + Point3 pos; // current scene view position +}; + +extern DagorCurView grs_cur_view; + +#include diff --git a/prog/gameLibs/publicInclude/render/debug3dSolidBuffered.h b/prog/gameLibs/publicInclude/render/debug3dSolidBuffered.h index b12a0367f..c2911cecc 100644 --- a/prog/gameLibs/publicInclude/render/debug3dSolidBuffered.h +++ b/prog/gameLibs/publicInclude/render/debug3dSolidBuffered.h @@ -8,7 +8,7 @@ #include "debug3dSolid.h" #include -void flush_buffered_debug_meshes(bool decriment_buffer_frames); +void flush_buffered_debug_meshes(bool game_is_paused); void draw_debug_solid_mesh_buffered(const uint16_t *indices, int faces_count, const float *xyz_pos, int vertex_size, int vertices_count, const TMatrix &tm, Color4 color, size_t frames); diff --git a/prog/gameLibs/publicInclude/render/dynamicCube.h 
b/prog/gameLibs/publicInclude/render/dynamicCube.h deleted file mode 100644 index f89304985..000000000 --- a/prog/gameLibs/publicInclude/render/dynamicCube.h +++ /dev/null @@ -1,65 +0,0 @@ -// -// Dagor Engine 6.5 - Game Libraries -// Copyright (C) 2023 Gaijin Games KFT. All rights reserved -// (for conditions of use see prog/license.txt) -// -#pragma once - -#include <3d/dag_texMgr.h> -#include <3d/dag_drv3dConsts.h> - - -class PostFxRenderer; -class BaseTexture; -typedef BaseTexture Texture; -typedef BaseTexture CubeTexture; - - -class IRenderDynamicCubeFace -{ -public: - virtual void renderDynamicCubeFace(int tex_var_id, TEXTUREID tex_id, CubeTexture *texture, unsigned int face_no) = 0; -}; - - -class DynamicCube -{ -public: - int dynamicCubeTex1VarId; - int dynamicCubeTex2VarId; - int dynamicCubeTexVarId; - - - DynamicCube(unsigned int num_mips, unsigned int size, float blur, unsigned texFormat = 0); - - ~DynamicCube(); - - bool refresh(); - void beforeRender(float blend_to_next, IRenderDynamicCubeFace *render); - void reset(IRenderDynamicCubeFace *render); - -protected: - unsigned int numDynamicCubeTexMips; - unsigned int dynamicCubeSize; - float dynamicCubeBlur; - - CubeTexture *dynamicCubeTex1; - TEXTUREID dynamicCubeTex1Id; - - CubeTexture *dynamicCubeTex2; - TEXTUREID dynamicCubeTex2Id; - - CubeTexture *dynamicCubeTex; - TEXTUREID dynamicCubeTexId; - - Texture *dynamicCubeDepthTex; - - int blendCubesParamsVarId; - int dynamicCubeTexBlendVarId; - - PostFxRenderer *blendCubesRenderer; - PostFxRenderer *blurCubesRenderer; - - int dynamicCubeFaceNo; - int blendCubesStage; -}; diff --git a/prog/gameLibs/publicInclude/render/heatHazeRenderer.h b/prog/gameLibs/publicInclude/render/heatHazeRenderer.h index ac77db19d..0aacd8420 100644 --- a/prog/gameLibs/publicInclude/render/heatHazeRenderer.h +++ b/prog/gameLibs/publicInclude/render/heatHazeRenderer.h @@ -18,6 +18,8 @@ class HeatHazeRenderer public: using RenderHazeParticlesCallback = eastl::function; using 
RenderCustomHazeCallback = eastl::function; + using BeforeApplyHazeCallback = eastl::function; + using AfterApplyHazeCallback = eastl::function; struct RenderTargets { @@ -46,7 +48,8 @@ class HeatHazeRenderer void render(double total_time, const RenderTargets &targets, const IPoint2 &back_buffer_resolution, int depth_tex_lod, RenderHazeParticlesCallback render_haze_particles, RenderCustomHazeCallback render_custom_haze, - RenderCustomHazeCallback render_ri_haze); + RenderCustomHazeCallback render_ri_haze, BeforeApplyHazeCallback before_apply_haze = nullptr, + AfterApplyHazeCallback after_apply_haze = nullptr); void clearTargets(Texture *haze_color, Texture *haze_offset, Texture *haze_depth); int getHazeResolutionDivisor() const { return hazeResolutionDivisor; } diff --git a/prog/gameLibs/publicInclude/render/temporalAA.h b/prog/gameLibs/publicInclude/render/temporalAA.h index 54094e91f..f23cbe988 100644 --- a/prog/gameLibs/publicInclude/render/temporalAA.h +++ b/prog/gameLibs/publicInclude/render/temporalAA.h @@ -44,6 +44,7 @@ struct TemporalAAParams }; class TMatrix4D; +extern Point2 get_halton_jitter(int counter, int subsamples, float subsample_scale); extern Point2 get_taa_jitter(int counter, const TemporalAAParams &p); extern void set_temporal_reprojection_matrix(const TMatrix4D &cur_view_proj_no_jitter, const TMatrix4D &prev_view_proj_jittered); extern void set_temporal_resampling_filter_parameters(const Point2 &temporal_jitter_proj_offset); diff --git a/prog/gameLibs/publicInclude/render/variance.h b/prog/gameLibs/publicInclude/render/variance.h index 7536d0149..4c1d6b3c6 100644 --- a/prog/gameLibs/publicInclude/render/variance.h +++ b/prog/gameLibs/publicInclude/render/variance.h @@ -8,7 +8,7 @@ #include <3d/dag_tex3d.h> #include <3d/dag_texMgr.h> #include <3d/dag_drv3d.h> -#include <3d/dag_render.h> +#include #include #include #include diff --git a/prog/gameLibs/publicInclude/render/waterProjFx.h b/prog/gameLibs/publicInclude/render/waterProjFx.h index 
4b64f8d1a..e436f227c 100644 --- a/prog/gameLibs/publicInclude/render/waterProjFx.h +++ b/prog/gameLibs/publicInclude/render/waterProjFx.h @@ -51,8 +51,8 @@ class WaterProjectedFx bool getView(TMatrix4 &view_tm, TMatrix4 &proj_tm, Point3 &camera_pos); bool isValidView() const; - void prepare(const TMatrix &view_tm, const TMatrix4 &proj_tm, const TMatrix4 &glob_tm, float water_level, - float significant_wave_height, int frame_no); + void prepare(const TMatrix &view_tm, const TMatrix &view_itm, const TMatrix4 &proj_tm, const TMatrix4 &glob_tm, float water_level, + float significant_wave_height, int frame_no, bool change_projection); bool render(IWwaterProjFxRenderHelper *render_helper); bool render(IWwaterProjFxRenderHelper *render_helper, dag::Span targets, dag::Span taaTemp0 = {}, dag::Span taaTemp1 = {}); @@ -67,7 +67,7 @@ class WaterProjectedFx uint32_t getTargetAdditionalFlags() const; private: - void setView(const TMatrix &view_tm, const TMatrix4 &proj_tm, const TMatrix &view_itm); + void setView(const TMatrix &view_tm, const TMatrix4 &proj_tm); void setWaterMatrix(const TMatrix4 &glob_tm); TMatrix4 newProjTM, newProjTMJittered; diff --git a/prog/gameLibs/publicInclude/render/wind/fluidWind.h b/prog/gameLibs/publicInclude/render/wind/fluidWind.h index 6d57c49a7..e1b904d14 100644 --- a/prog/gameLibs/publicInclude/render/wind/fluidWind.h +++ b/prog/gameLibs/publicInclude/render/wind/fluidWind.h @@ -39,6 +39,12 @@ class FluidWind bool enabled = false; int numWaves = 1; }; + + struct PhaseAttack + { + bool enabled = false; + float maxAttackTime = 0.0f; + }; enum ShapeType { SHAPE_SPHERE, @@ -59,6 +65,7 @@ class FluidWind float duration = 0; float strength = 0; bool shake = false; + PhaseAttack phaseAttack; PhaseFade phaseFade; PhaseSin phaseSin; }; diff --git a/prog/gameLibs/publicInclude/soundSystem/banks.h b/prog/gameLibs/publicInclude/soundSystem/banks.h index 970fcc8a0..f2163acf3 100644 --- a/prog/gameLibs/publicInclude/soundSystem/banks.h +++ 
b/prog/gameLibs/publicInclude/soundSystem/banks.h @@ -30,7 +30,7 @@ void init(const DataBlock &blk); void enable(const char *preset_name, bool enable = true, const PathTags &path_tags = {}); void enable_starting_with(const char *preset_name_starts_with, bool enable = true, const PathTags &path_tags = {}); -const char *get_default_preset(const DataBlock &); +const char *get_master_preset(); bool is_enabled(const char *preset_name); bool is_loaded(const char *preset_name); @@ -44,7 +44,6 @@ void clear_prohibited_guids(); void set_preset_loaded_cb(PresetLoadedCallback cb); bool any_banks_pending(); -bool are_inited(); void set_err_cb(ErrorCallback cb); void unload_banks_sample_data(void); diff --git a/prog/gameLibs/publicInclude/soundSystem/debug.h b/prog/gameLibs/publicInclude/soundSystem/debug.h index 3e12d5383..e56fb310b 100644 --- a/prog/gameLibs/publicInclude/soundSystem/debug.h +++ b/prog/gameLibs/publicInclude/soundSystem/debug.h @@ -15,8 +15,6 @@ void debug_trace_err(const char *format, ...); void debug_trace_log(const char *format, ...); void debug_draw(const TMatrix4 &glob_tm); -void set_enable_debug_draw(bool enable); -bool get_enable_debug_draw(); void set_draw_audibility(bool enable); void debug_enum_events(); }; // namespace sndsys diff --git a/prog/gameLibs/publicInclude/soundSystem/soundSystem.h b/prog/gameLibs/publicInclude/soundSystem/soundSystem.h index f41d40bd2..6cba26b62 100644 --- a/prog/gameLibs/publicInclude/soundSystem/soundSystem.h +++ b/prog/gameLibs/publicInclude/soundSystem/soundSystem.h @@ -52,12 +52,14 @@ struct DeviceInfo int rate; }; -void init(const DataBlock &blk); +bool init(const DataBlock &blk); void shutdown(); bool is_inited(); void update_listener(float dt, const TMatrix &listener_tm); -void update(float dt, float time_speed = 1.f); +void set_time_speed(float time_speed); +void update(float dt); +void lazy_update(); void override_time_speed(float time_speed); // should be > 0 to override value provided within update(float dt, 
float time_speed = 1.f) @@ -107,8 +109,12 @@ eastl::vector get_record_devices(); void set_output_device(int device_id); -int get_last_records_list_changed_time(); -int get_last_outputs_list_changed_time(); +typedef void (*record_list_changed_cb_t)(); +typedef void (*output_list_changed_cb_t)(); +typedef void (*device_lost_cb_t)(); + +void set_device_changed_async_callbacks(record_list_changed_cb_t record_list_changed_cb, + output_list_changed_cb_t output_list_changed_cb, device_lost_cb_t device_lost_cb); void flush_commands(); diff --git a/prog/gameLibs/publicInclude/spirv/compiled_meta_data.h b/prog/gameLibs/publicInclude/spirv/compiled_meta_data.h index 79b826782..58292f487 100644 --- a/prog/gameLibs/publicInclude/spirv/compiled_meta_data.h +++ b/prog/gameLibs/publicInclude/spirv/compiled_meta_data.h @@ -95,10 +95,6 @@ namespace platform using namespace desktop; } // namespace platform -// total combination of b, t and u register entries -// If more are needed, just bump it until you have enough or you hit a hardware limit -const uint32_t REGISTER_ENTRIES = 36; - // do not set above 32! this directly maps to header.*RegisterUseMask 32bit field! 
// limit for b register entries, this is for the renderer to store the bindings const uint32_t B_REGISTER_INDEX_MAX = 8; @@ -107,6 +103,9 @@ const uint32_t T_REGISTER_INDEX_MAX = 32; // limit for u register entries, this is for the renderer to store the bindings const uint32_t U_REGISTER_INDEX_MAX = 13; +// total combination of b, t and u register entries +const uint32_t REGISTER_ENTRIES = B_REGISTER_INDEX_MAX + T_REGISTER_INDEX_MAX + U_REGISTER_INDEX_MAX; + const uint32_t WORK_GROUP_SIZE_X_CONSTANT_ID = 1; const uint32_t WORK_GROUP_SIZE_Y_CONSTANT_ID = 2; const uint32_t WORK_GROUP_SIZE_Z_CONSTANT_ID = 3; diff --git a/prog/gameLibs/publicInclude/streamIO/streamIO.h b/prog/gameLibs/publicInclude/streamIO/streamIO.h index d963acd6f..5805d04cc 100644 --- a/prog/gameLibs/publicInclude/streamIO/streamIO.h +++ b/prog/gameLibs/publicInclude/streamIO/streamIO.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include class WinCritSec; @@ -32,11 +32,11 @@ enum class ProcessResult IoError }; -typedef eastl::map StringMap; +typedef dag::VectorMap StringMap; typedef void (*completion_cb_t)(const char *name, int error, IGenLoad *ptr, void *cb_arg, int64_t last_modified, intptr_t req_id); typedef ProcessResult (*stream_data_cb_t)(dag::Span data, void *cb_arg, intptr_t req_id); -typedef void (*header_cb_t)(StringMap const &headers, void *cb_arg); +typedef void (*resp_headers_cb_t)(StringMap const &resp_headers, void *cb_arg); typedef void (*progress_cb_t)(const char *name, size_t dltotal, size_t dlnow); class Context @@ -49,8 +49,8 @@ class Context // Callback might get called before function return // Caller should delete returned stream when it no longer needed // If modified_since >= 0 then request callback might get called with errcode ERR_NOT_MODIFIED - virtual intptr_t createStream(const char *name, completion_cb_t complete_cb, stream_data_cb_t stream_cb, header_cb_t header_cb, - progress_cb_t progress_cb, void *cb_arg, int64_t modified_since = -1, bool 
do_sync = false) = 0; + virtual intptr_t createStream(const char *name, completion_cb_t complete_cb, stream_data_cb_t stream_cb, + resp_headers_cb_t resp_headers_cb, progress_cb_t progress_cb, void *cb_arg, int64_t modified_since = -1, bool do_sync = false) = 0; virtual void poll() = 0; virtual void abort() = 0; virtual void abort_request(intptr_t req_id) = 0; diff --git a/prog/gameLibs/publicInclude/video360/video360.h b/prog/gameLibs/publicInclude/video360/video360.h index 49f1c61fa..7b538dd29 100644 --- a/prog/gameLibs/publicInclude/video360/video360.h +++ b/prog/gameLibs/publicInclude/video360/video360.h @@ -10,6 +10,7 @@ #include <3d/dag_texMgr.h> #include <3d/dag_textureIDHolder.h> #include +#include #include class PostFxRenderer; @@ -19,6 +20,12 @@ typedef BaseTexture CubeTexture; struct DagorCurView; struct Driver3dPerspective; +struct CameraSetupPerspPair +{ + CameraSetup camera; + Driver3dPerspective persp; +}; + class Video360 { public: @@ -39,8 +46,7 @@ class Video360 void renderResultOnScreen(); void finishRendering(); - bool getCamera(DagorCurView &cur_view, Driver3dPerspective &persp); - bool getCamera(CameraSetup &cam, Driver3dPerspective &persp); + eastl::optional getCamera() const; bool useFixedDt(); float getFixedDt(); int getCubeSize(); diff --git a/prog/gameLibs/quirrel/bindQuirrelEx/autoCleanup.cpp b/prog/gameLibs/quirrel/bindQuirrelEx/autoCleanup.cpp new file mode 100644 index 000000000..f124c9fb5 --- /dev/null +++ b/prog/gameLibs/quirrel/bindQuirrelEx/autoCleanup.cpp @@ -0,0 +1,21 @@ +#include +#include + +namespace sq +{ +static CleanupUnregRec *auto_binding_tail = NULL; + +CleanupUnregRec::CleanupUnregRec(cleanup_unregfunc_cb_t cb) : unregfuncCb(cb), next(auto_binding_tail) +{ + G_ASSERT(unregfuncCb); + auto_binding_tail = this; +} + +void cleanup_unreg_native_api(HSQUIRRELVM vm) +{ + G_ASSERT(vm); + for (auto *brr = auto_binding_tail; brr; brr = brr->next) + brr->unregfuncCb(vm); +} + +} // namespace sq diff --git 
a/prog/gameLibs/quirrel/bindQuirrelEx/jamfile b/prog/gameLibs/quirrel/bindQuirrelEx/jamfile index 86df70b54..2e495f285 100644 --- a/prog/gameLibs/quirrel/bindQuirrelEx/jamfile +++ b/prog/gameLibs/quirrel/bindQuirrelEx/jamfile @@ -25,6 +25,7 @@ Sources = sqRegExp.cpp sqUtf8.cpp autoBind.cpp + autoCleanup.cpp ; if $(ProjectUseQuirrel) in sq3r sq3r+ { diff --git a/prog/gameLibs/quirrel/frp/frp.cpp b/prog/gameLibs/quirrel/frp/frp.cpp index 11896105e..45494063d 100644 --- a/prog/gameLibs/quirrel/frp/frp.cpp +++ b/prog/gameLibs/quirrel/frp/frp.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include @@ -430,7 +430,7 @@ void ObservablesGraph::checkLeaks() } } - eastl::map> sortedSsi; + dag::VectorMap> sortedSsi; eastl::transform(infoCounts.begin(), infoCounts.end(), eastl::inserter(sortedSsi, sortedSsi.begin()), flip_pair); if (sortedSsi.size()) @@ -1729,6 +1729,7 @@ void bind_frp_classes(SqModules *module_mgr) .SquirrelFunc("mutate", forbid_computed_modify, 0) .SquirrelFunc("modify", forbid_computed_modify, 0) .SquirrelFunc("_call", forbid_computed_modify, 0) + .SquirrelFunc("_set", forbid_computed_modify, 0) .SquirrelFunc("_newslot", forbid_computed_modify, 0) .SquirrelFunc("_delslot", forbid_computed_modify, 0) .SquirrelFunc( diff --git a/prog/gameLibs/quirrel/http/sqHttpClient.cpp b/prog/gameLibs/quirrel/http/sqHttpClient.cpp index 3342051e5..19dbb9db7 100644 --- a/prog/gameLibs/quirrel/http/sqHttpClient.cpp +++ b/prog/gameLibs/quirrel/http/sqHttpClient.cpp @@ -82,6 +82,7 @@ static SQRESULT is_url_allowed(HSQUIRRELVM vm, const char *url) @param respEventId s : (optional) : event id to send into eventbus @param timeout_ms i : (optional), DEF_REQUEST_TIMEOUT_MS by default (10 seconds) @param waitable b : (optional, false by default) : if true then this request will be waited for on app shutdown +@param needResponseHeaders (optional, true by default) - specify false if you not need http response headers (optimization) @return i : request id 
@@ -121,7 +122,7 @@ static SQInteger request(HSQUIRRELVM vm) Sqrat::Object methodObj = params.RawGetSlot("method"); const char *method = sq_objtostring(&methodObj.GetObject()); if (!method || strcmp(method, "POST") == 0) - reqParams.reqType = HTTPReq::POST; + reqParams.reqType = HTTPReq::POST; //-V1048 else if (strcmp(method, "GET") == 0) reqParams.reqType = HTTPReq::GET; else if (strcmp(method, "HEAD") == 0) @@ -139,6 +140,10 @@ static SQInteger request(HSQUIRRELVM vm) else reqParams.reqTimeoutMs = DEF_REQUEST_TIMEOUT_MS; + reqParams.needResponseHeaders = true; //-V1048 To consider: flip this default to false + if (Sqrat::Object needResponseHeaders = params.RawGetSlot("needResponseHeaders"); !needResponseHeaders.IsNull()) + reqParams.needResponseHeaders = needResponseHeaders.Cast(); + Sqrat::Object dataObj = params.RawGetSlot("data"); Sqrat::Object jsonObj = params.RawGetSlot("json"); Sqrat::Table tblHeaders = params.GetSlot("headers"); @@ -291,7 +296,7 @@ static SQInteger request(HSQUIRRELVM vm) req->context = params.GetSlot("context"); reqParams.callback = - make_http_callback([req](RequestStatus status, int http_code, dag::ConstSpan response, StringMap const &headers) { + make_http_callback([req](RequestStatus status, int http_code, dag::ConstSpan response, StringMap const &resp_headers) { bool haveCb = !req->callback.IsNull(); if (haveCb || !req->respEventId.empty()) { @@ -303,14 +308,14 @@ static SQInteger request(HSQUIRRELVM vm) resp.SetValue("http_code", (SQInteger)http_code); resp.SetValue("context", req->context); - if (!headers.empty()) + if (!resp_headers.empty()) { - Sqrat::Table sqHeaders(vm); - for (auto kv : headers) - sqHeaders.SetValue(kv.first.data(), kv.second.data()); - resp.SetValue("headers", sqHeaders); + Sqrat::Table sqRespHeaders(vm); + for (auto &kv : resp_headers) + sqRespHeaders.SetValue(kv.first.data(), kv.second.data()); + resp.SetValue("headers", sqRespHeaders); // TODO: rename to `resp_headers` } - if (response.size() > 0) + if 
(!response.empty()) { SQUserPointer ptr = sqstd_createblob(vm, response.size()); G_ASSERT(ptr); // can happen if blob library was not registered diff --git a/prog/gameLibs/quirrel/sqDataCache/datacache.cpp b/prog/gameLibs/quirrel/sqDataCache/datacache.cpp index 382f8b175..ddfadc8e4 100644 --- a/prog/gameLibs/quirrel/sqDataCache/datacache.cpp +++ b/prog/gameLibs/quirrel/sqDataCache/datacache.cpp @@ -12,11 +12,11 @@ namespace bindquirrel static eastl::unordered_map> datacaches; -static void send_evbus_headers(const char *entry_key, streamio::StringMap const &headers) +static void send_evbus_resp_headers(const char *entry_key, streamio::StringMap const &resp_headers) { - eastl::string eventName(eastl::string::CtorSprintf(), "datacache.headers.%s", entry_key); + eastl::string eventName(eastl::string::CtorSprintf(), "datacache.headers.%s", entry_key); // TODO: rename to .resp_headers Json::Value msg; - for (auto header : headers) + for (auto &header : resp_headers) { const eastl::string key(header.first.begin(), header.first.end()); const eastl::string data(header.second.begin(), header.second.end()); @@ -32,7 +32,7 @@ static void send_evbus_error(const char *entry_key, const char *err, datacache:: msg["error"] = err; msg["error_code"] = error_code; sqeventbus::send_event(eventName.c_str(), msg); - send_evbus_headers(entry_key, streamio::StringMap{}); // send empty header for unsubscribe + send_evbus_resp_headers(entry_key, streamio::StringMap{}); // send empty header for unsubscribe } static void send_evbus_entry_info(const char *entry_key, datacache::Entry *entry) @@ -43,7 +43,7 @@ static void send_evbus_entry_info(const char *entry_key, datacache::Entry *entry msg["path"] = entry->getPath(); msg["size"] = entry->getDataSize(); sqeventbus::send_event(eventName.c_str(), msg); - send_evbus_headers(entry_key, streamio::StringMap{}); // send empty header for unsubscribe + send_evbus_resp_headers(entry_key, streamio::StringMap{}); // send empty header for unsubscribe } 
void init_cache(const char *cache_name, Sqrat::Table params) @@ -81,7 +81,10 @@ void abort_requests(const char *cache_name) } } -static void on_entry_headers(const char *key, streamio::StringMap const &headers, void *) { send_evbus_headers(key, headers); } +static void on_entry_resp_headers(const char *key, streamio::StringMap const &resp_headers, void *) +{ + send_evbus_resp_headers(key, resp_headers); +} static void on_entry_completion(const char *key, datacache::ErrorCode error, datacache::Entry *entry, void *) { @@ -140,7 +143,7 @@ void request_entry(const char *cache_name, const char *entry_key) } eastl::unique_ptr &datacache = it->second; datacache::ErrorCode error; - datacache::EntryHolder entry(datacache->getWithHeaders(entry_key, &error, on_entry_completion, on_entry_headers)); + datacache::EntryHolder entry(datacache->getWithRespHeaders(entry_key, &error, on_entry_completion, on_entry_resp_headers)); if (error == datacache::ERR_OK) { send_evbus_entry_info(entry_key, entry.get()); diff --git a/prog/gameLibs/quirrel/sqEventBus/sqEventBus.cpp b/prog/gameLibs/quirrel/sqEventBus/sqEventBus.cpp index 8aac5fc86..3e999950e 100644 --- a/prog/gameLibs/quirrel/sqEventBus/sqEventBus.cpp +++ b/prog/gameLibs/quirrel/sqEventBus/sqEventBus.cpp @@ -471,13 +471,21 @@ void bind(SqModules *module_mgr, const char *vm_id, ProcessingMode mode) Sqrat::Table api(vm); ///@module eventbus - api.SquirrelFunc("subscribe", subscribe, -3, ".sc a|s") + api + .SquirrelFunc("subscribe", subscribe, -3, ".sc a|s") // alias for backward compatibility + .SquirrelFunc("eventbus_subscribe", subscribe, -3, ".sc a|s") .SquirrelFunc("subscribe_onehit", subscribe_onehit, -3, ".sc a|s") + .SquirrelFunc("eventbus_subscribe_onehit", subscribe_onehit, -3, ".sc a|s") .SquirrelFunc("unsubscribe", unsubscribe, 3, ".sc") + .SquirrelFunc("eventbus_unsubscribe", unsubscribe, 3, ".sc") .SquirrelFunc("send", send, 3, ".s.", nullptr, 1, &sqVmId) + .SquirrelFunc("eventbus_send", send, 3, ".s.", nullptr, 1, 
&sqVmId) .SquirrelFunc("send_foreign", send_foreign, 3, ".s.", nullptr, 1, &sqVmId) + .SquirrelFunc("eventbus_send_foreign", send_foreign, 3, ".s.", nullptr, 1, &sqVmId) + .SquirrelFunc("eventbus_has_listeners", has_listeners, 2, ".s", nullptr, 1, &sqVmId) .SquirrelFunc("has_listeners", has_listeners, 2, ".s", nullptr, 1, &sqVmId) - .SquirrelFunc("has_foreign_listeners", has_foreign_listeners, 2, ".s", nullptr, 1, &sqVmId); + .SquirrelFunc("has_foreign_listeners", has_foreign_listeners, 2, ".s", nullptr, 1, &sqVmId) + .SquirrelFunc("eventbus_has_foreign_listeners", has_foreign_listeners, 2, ".s", nullptr, 1, &sqVmId); module_mgr->addNativeModule("eventbus", api); } diff --git a/prog/gameLibs/quirrel/sqModules/sqModules.cpp b/prog/gameLibs/quirrel/sqModules/sqModules.cpp index 04082e9b4..0c7eb4726 100644 --- a/prog/gameLibs/quirrel/sqModules/sqModules.cpp +++ b/prog/gameLibs/quirrel/sqModules/sqModules.cpp @@ -316,7 +316,7 @@ bool SqModules::compileScriptImpl(const dag::ConstSpan &buf, const char *r if (compilationOptions.doStaticAnalysis) { - sq_analyseast(sqvm, ast, bindings, &buf[0], buf.size()); + sq_analyzeast(sqvm, ast, bindings, &buf[0], buf.size()); } if (onBytecode_cb) diff --git a/prog/gameLibs/rendInst/debug/collisionVisualization.cpp b/prog/gameLibs/rendInst/debug/collisionVisualization.cpp index f933fae82..6957b5eb3 100644 --- a/prog/gameLibs/rendInst/debug/collisionVisualization.cpp +++ b/prog/gameLibs/rendInst/debug/collisionVisualization.cpp @@ -435,7 +435,9 @@ void drawDebugCollisions(DrawCollisionsFlags flags, mat44f_cref globtm, const Po state.zBias = 0.00005f; debugWireOverride.reset(shaders::overrides::create(state)); } - shaders::overrides::set(debugOverride); + + // Using set_master_state to avoid "override already set" errors because draw_debug_solid_mesh also sets an override. 
+ shaders::overrides::set_master_state(shaders::overrides::get(debugOverride)); if (drawShaded) { @@ -454,17 +456,17 @@ void drawDebugCollisions(DrawCollisionsFlags flags, mat44f_cref globtm, const Po for (auto &coll : collisions) draw_collision_info(coll, view_pos, surfaceColor, true, false, drawPhysOnly, drawTraceOnly, max_coll_dist_sq, true); - shaders::overrides::reset(); + shaders::overrides::reset_master_state(); if (drawShadedWireframe) { - shaders::overrides::set(debugWireOverride); + shaders::overrides::set_master_state(shaders::overrides::get(debugWireOverride)); d3d::setwire(true); max_coll_dist_sq = eastl::min(max_coll_dist_sq, 200.f * 200.f); // no need WIREFRAME on far distance ShaderGlobal::set_int(wireframeVarId, 1); for (auto &coll : collisions) draw_collision_info(coll, view_pos, surfaceColor, true, false, drawPhysOnly, drawTraceOnly, max_coll_dist_sq, true); d3d::setwire(false); - shaders::overrides::reset(); + shaders::overrides::reset_master_state(); } { @@ -487,19 +489,19 @@ void drawDebugCollisions(DrawCollisionsFlags flags, mat44f_cref globtm, const Po max_coll_dist_sq); } - shaders::overrides::reset(); + shaders::overrides::reset_master_state(); if (flags & DrawCollisionsFlag::Wireframe) { TIME_D3D_PROFILE(DRAW_COLLISIONS_WIREFRAME); - shaders::overrides::set(debugWireOverride); + shaders::overrides::set_master_state(shaders::overrides::get(debugWireOverride)); d3d::setwire(true); max_coll_dist_sq = eastl::min(max_coll_dist_sq, 20.f * 20.f); // no need WIREFRAME on far distance for (int i = 0; i < collisions.size(); ++i) draw_collision_info(collisions[i], view_pos, linesColor, drawAnyRendinst, drawRendinstCanopy, drawPhysOnly, drawTraceOnly, max_coll_dist_sq); d3d::setwire(false); - shaders::overrides::reset(); + shaders::overrides::reset_master_state(); } { diff --git a/prog/gameLibs/rendInst/rendInstGen.cpp b/prog/gameLibs/rendInst/rendInstGen.cpp index 5aa333585..94e36234e 100644 --- a/prog/gameLibs/rendInst/rendInstGen.cpp +++ 
b/prog/gameLibs/rendInst/rendInstGen.cpp @@ -1733,6 +1733,11 @@ void rendinst::initRIGen(bool need_render, int cell_pool_sz, float poi_radius, r rendinst::maxExtraRiCount = ::dgs_get_game_params()->getInt("rendinstExtraMaxCnt", 4000); RendInstGenData::renderResRequired = need_render; + if (need_render && !ShaderGlobal::get_int_by_name("per_instance_data_no", rendinst::render::instancingTexRegNo)) + { + rendinst::render::instancingTexRegNo = 14; + logerr("\"per_instance_data_no\" shader var not exist, using 14 as fallback"); + } externalJobId = job_manager_id; if (externalJobId < 0) @@ -2167,7 +2172,11 @@ static int scheduleRIGenPrepare(RendInstGenData *rgl, dag::ConstSpan poi debug("generateCell in %dms @(%d, %d)", get_time_msec() - startTime, cx, cz); if (last) + { rendinst::optimizeRIGenExtra(); + if (rendinst::do_delayed_ri_extra_destruction) + delayed_call([&] { rendinst::do_delayed_ri_extra_destruction(); }); + } } virtual unsigned getJobTag() { return tag; }; virtual void releaseJob() { delete this; } @@ -2488,6 +2497,7 @@ void rendinst::prepareRIGen(bool init_sec_ri_extra_here, const DataBlock *level_ rendinst::setImpostorsDistAddMul(impostorsDistAddMul); rendinst::setImpostorsFarDistAddMul(impostorsFarDistAddMul); rendinst::clear_rendinst_gen_ptr = &clearRIGen; + if (maxExtraRiCount && riExtraSubstNames.nameCount()) { debug("initRIGenExtra: due to maxExtraRiCount=%d riExtraSubstNames.nameCount()=%d", maxExtraRiCount, diff --git a/prog/gameLibs/rendInst/rendInstGenCollision.cpp b/prog/gameLibs/rendInst/rendInstGenCollision.cpp index 4f06d0aea..09baf6fda 100644 --- a/prog/gameLibs/rendInst/rendInstGenCollision.cpp +++ b/prog/gameLibs/rendInst/rendInstGenCollision.cpp @@ -262,7 +262,7 @@ struct RayHitStrat : public MaterialRayStrat Point3 & /*out_norm*/, rendinst::RendInstDesc *ri_desc, bool &have_collision, int layer_idx, int idx, int pool, int offs, int &out_mat_id, int /*cell_idx*/) { - if (coll_res->rayHit(tm, v_ld(&pos.x), v_ld(&dir.x), in_t, out_mat_id)) 
+ if (coll_res->rayHit(tm, pos, dir, in_t, rayMatId, out_mat_id)) { if (ri_desc) { @@ -281,7 +281,7 @@ struct RayHitStrat : public MaterialRayStrat float in_t, Point3 & /*out_norm*/, rendinst::RendInstDesc *ri_desc, bool &have_collision, int layer_idx, int idx, int pool, int offs, int &out_mat_id, int /*cell_idx*/, const BBox3 & /*bbox_all*/) { - if (coll_res->rayHit(tm, v_ld(&pos.x), v_ld(&dir.x), in_t, out_mat_id)) + if (coll_res->rayHit(tm, pos, dir, in_t, rayMatId, out_mat_id)) { if (ri_desc) { @@ -786,7 +786,7 @@ static bool rayHit1RiExtra(Trace &trace, rendinst::RendInstDesc *ri_desc, Materi } template -bool rayTraverseRiExtra(bbox3f_cref ray_box, dag::Span traces, rendinst::RendInstDesc *ri_desc, Strategy &strategy, +static bool rayTraverseRiExtra(bbox3f_cref ray_box, dag::Span traces, rendinst::RendInstDesc *ri_desc, Strategy &strategy, bool &haveCollision, riex_handle_t skip_riex_handle = rendinst::RIEX_HANDLE_NULL) // pos bbox here! { riex_collidable_t ri_h; @@ -855,7 +855,7 @@ bool rayTraverseRiExtra(bbox3f_cref ray_box, dag::Span traces, rendinst:: } template -bool rayTraverseRendinst(bbox3f_cref rayBox, dag::Span traces, bool trace_meshes, int layer_idx, +static bool rayTraverseRendinst(bbox3f_cref rayBox, dag::Span traces, bool trace_meshes, int layer_idx, rendinst::RendInstDesc *ri_desc, Strategy &strategy, bool &haveCollision) // pos bbox here! { RendInstGenData *rgl = rendinst::rgLayer[layer_idx]; @@ -907,7 +907,7 @@ bool rayTraverseRendinst(bbox3f_cref rayBox, dag::Span traces, bool trace } template -bool rayTraverse(dag::Span traces, bool trace_meshes, rendinst::RendInstDesc *ri_desc, Strategy &strategy, +static bool rayTraverse(dag::Span traces, bool trace_meshes, rendinst::RendInstDesc *ri_desc, Strategy &strategy, riex_handle_t skip_riex_handle = rendinst::RIEX_HANDLE_NULL) // pos bbox here! 
{ bool haveCollision = false; diff --git a/prog/gameLibs/rendInst/rendInstGenDebris.cpp b/prog/gameLibs/rendInst/rendInstGenDebris.cpp index 821fd005a..9cf853818 100644 --- a/prog/gameLibs/rendInst/rendInstGenDebris.cpp +++ b/prog/gameLibs/rendInst/rendInstGenDebris.cpp @@ -375,17 +375,29 @@ static void print_debug_destr_data() } #endif -static void debug_verify_destroy_pool_data(const rendinst::DestroyedPoolData &destrPoolData) +static bool debug_verify_destroy_pool_data(const rendinst::DestroyedPoolData &pool, uint32_t offs = -1, uint32_t end = -1) { -#if DAGOR_DBGLEVEL > 0 - for (int i = 0, lastI = destrPoolData.destroyedInstances.size() - 1; i <= lastI; ++i) + for (int i = 0, lastI = pool.destroyedInstances.size() - 1; i <= lastI; ++i) { - G_ASSERT(destrPoolData.destroyedInstances[i].startOffs < destrPoolData.destroyedInstances[i].endOffs); - G_ASSERT(i == lastI || destrPoolData.destroyedInstances[i].endOffs < destrPoolData.destroyedInstances[i + 1].startOffs); + if (pool.destroyedInstances[i].startOffs > pool.destroyedInstances[i].endOffs || + (i != 0 && pool.destroyedInstances[i - 1].endOffs >= pool.destroyedInstances[i].startOffs)) + { + logerr("=================================================="); + { + { + logerr(" pool (%d), last added %u %u", pool.poolIdx, offs, end); + for (int k = 0; k < pool.destroyedInstances.size(); ++k) + { + const rendinst::DestroyedInstanceRange &range = pool.destroyedInstances[k]; + logerr(" range (%d)-(%d)", range.startOffs, range.endOffs); + } + } + } + logerr("=================================================="); + return false; + } } -#else - G_UNUSED(destrPoolData); -#endif + return true; } static void add_destroyed_data(const rendinst::RendInstDesc &desc, RendInstGenData *ri_gen) @@ -430,68 +442,41 @@ static void add_destroyed_data(const rendinst::RendInstDesc &desc, RendInstGenDa uint16_t stride = rendinst::getRIGenStride(desc.layer, desc.cellIdx, desc.pool); uint32_t offs = desc.offs; + uint32_t end = desc.offs + stride; 
bool found = false; for (int i = 0; i < destrPoolData->destroyedInstances.size(); ++i) { - // Inside already added range - if (offs >= destrPoolData->destroyedInstances[i].startOffs && offs + stride <= destrPoolData->destroyedInstances[i].endOffs) + // Merge + if ((offs >= destrPoolData->destroyedInstances[i].startOffs && offs <= destrPoolData->destroyedInstances[i].endOffs) || + (end >= destrPoolData->destroyedInstances[i].startOffs && end <= destrPoolData->destroyedInstances[i].endOffs)) { + destrPoolData->destroyedInstances[i].startOffs = min(destrPoolData->destroyedInstances[i].startOffs, offs); + destrPoolData->destroyedInstances[i].endOffs = max(destrPoolData->destroyedInstances[i].endOffs, end); + for (int j = i + 1; j < destrPoolData->destroyedInstances.size(); ++j) // Append next overlapped ranges + { + if (destrPoolData->destroyedInstances[j].startOffs <= destrPoolData->destroyedInstances[i].endOffs) + { + destrPoolData->destroyedInstances[i].endOffs = + max(destrPoolData->destroyedInstances[i].endOffs, destrPoolData->destroyedInstances[j].endOffs); + erase_items(destrPoolData->destroyedInstances, j, 1); + } + } found = true; break; } - - if (offs < destrPoolData->destroyedInstances[i].startOffs) + // Insert between + if ((i == 0 || offs > destrPoolData->destroyedInstances[i - 1].endOffs) && end < destrPoolData->destroyedInstances[i].startOffs) { + insert_item_at(destrPoolData->destroyedInstances, i, rendinst::DestroyedInstanceRange(offs, end)); found = true; - if (offs + stride >= destrPoolData->destroyedInstances[i].startOffs) - { - // It expands from head - if (i > 0 && offs == destrPoolData->destroyedInstances[i - 1].endOffs) - { - // And merges two ranges - destrPoolData->destroyedInstances[i - 1].endOffs = destrPoolData->destroyedInstances[i].endOffs; - erase_items(destrPoolData->destroyedInstances, i, 1); - } - else - { - destrPoolData->destroyedInstances[i].startOffs = offs; // Just expand - destrPoolData->destroyedInstances[i].endOffs = 
max(destrPoolData->destroyedInstances[i].endOffs, offs + stride); - for (int j = i + 1; j < destrPoolData->destroyedInstances.size(); ++j) // Append next overlapped ranges - { - if (destrPoolData->destroyedInstances[j].startOffs <= destrPoolData->destroyedInstances[i].endOffs) - { - destrPoolData->destroyedInstances[i].endOffs = - max(destrPoolData->destroyedInstances[i].endOffs, destrPoolData->destroyedInstances[j].endOffs); - erase_items(destrPoolData->destroyedInstances, j, 1); - } - } - } - } - else if (i > 0 && offs == destrPoolData->destroyedInstances[i - 1].endOffs) - destrPoolData->destroyedInstances[i - 1].endOffs = offs + stride; // It merges from tail of previous one - else - insert_item_at(destrPoolData->destroyedInstances, i, rendinst::DestroyedInstanceRange(offs, offs + stride)); // It's just - // in-between break; } } if (!found) // it should be last then - { - rendinst::DestroyedInstanceRange *lastRange = destrPoolData->destroyedInstances.end() - 1; - if (!destrPoolData->destroyedInstances.empty() && lastRange->endOffs == offs) - lastRange->endOffs = offs + stride; // merges with last one - else - insert_item_at(destrPoolData->destroyedInstances, destrPoolData->destroyedInstances.size(), - rendinst::DestroyedInstanceRange(offs, offs + stride)); - } + destrPoolData->destroyedInstances.emplace_back(rendinst::DestroyedInstanceRange(offs, end)); -#if DAGOR_DBGLEVEL > 0 -#if DEBUG_RI_DESTR - print_debug_destr_data(); -#endif - debug_verify_destroy_pool_data(*destrPoolData); -#endif + G_ASSERT(debug_verify_destroy_pool_data(*destrPoolData, offs, end)); } void rendinst::updateRiGenVbCell(int layer_idx, int cell_idx) @@ -930,7 +915,7 @@ rendinst::riex_handle_t rendinst::restoreRiGenDestr(const RendInstDesc &desc, co break; } - debug_verify_destroy_pool_data(pool); + G_ASSERT(debug_verify_destroy_pool_data(pool)); return h; } } diff --git a/prog/gameLibs/rendInst/rendInstGenExtra.cpp b/prog/gameLibs/rendInst/rendInstGenExtra.cpp index 97e2bf421..3b7308808 
100644 --- a/prog/gameLibs/rendInst/rendInstGenExtra.cpp +++ b/prog/gameLibs/rendInst/rendInstGenExtra.cpp @@ -1429,8 +1429,8 @@ void rendinst::prepareRiExtraRefs(const DataBlock &_riConf) addRiExtraRefs(b, riConf.getBlock(i), nullptr); } -bool rayHitRiExtraInstance(vec4f from, vec4f dir, float len, rendinst::riex_handle_t handle, rendinst::RendInstDesc &ri_desc, - const MaterialRayStrat &strategy) +bool rayHitRiExtraInstance(const Point3 &from, const Point3 &dir, float len, rendinst::riex_handle_t handle, + rendinst::RendInstDesc &ri_desc, const MaterialRayStrat &strategy) { uint32_t res_idx = rendinst::handle_to_ri_type(handle); const rendinst::RiExtraPool &pool = rendinst::riExtra[res_idx]; @@ -1452,10 +1452,10 @@ bool rayHitRiExtraInstance(vec4f from, vec4f dir, float len, rendinst::riex_hand CollisionResource *collRes = rendinst::riExtra[res_idx].collRes; - int matId = -1; - if (collRes->rayHit(tm, from, dir, len, matId)) + int outMatId = PHYSMAT_INVALID; + if (collRes->rayHit(tm, from, dir, len, strategy.rayMatId, outMatId)) { - if (strategy.shouldIgnoreRendinst(/*isPos*/ false, /* is_immortal */ false, matId)) + if (strategy.shouldIgnoreRendinst(/*isPos*/ false, /* is_immortal */ false, outMatId)) return false; ri_desc.setRiExtra(); ri_desc.idx = idx; @@ -1474,7 +1474,7 @@ bool rendinst::rayHitRIGenExtraCollidable(const Point3 &from, const Point3 &dir, RiGridObject ret = rigrid_find_ray_intersections(riExtraGrid, from, dir, len, [&](RiGridObject object) { if (v_extract_w(object.getWBSph()) < min_r) return false; - return rayHitRiExtraInstance(v_ldu(&from.x), v_ldu(&dir.x), len, object.handle, ri_desc, strategy); + return rayHitRiExtraInstance(from, dir, len, object.handle, ri_desc, strategy); }); return ret != RIEX_HANDLE_NULL; } diff --git a/prog/gameLibs/rendInst/rendInstGenGlobals.cpp b/prog/gameLibs/rendInst/rendInstGenGlobals.cpp index 054eafa56..9fbd3c9ec 100644 --- a/prog/gameLibs/rendInst/rendInstGenGlobals.cpp +++ 
b/prog/gameLibs/rendInst/rendInstGenGlobals.cpp @@ -44,3 +44,4 @@ void (*RendInstGenData::riGenPrepareAddPregenCB)(RendInstGenData::CellRtData &cr float oy, float oz, float cell_xz_sz, float cell_y_sz) = nullptr; RendInstGenData::CellRtData *(*RendInstGenData::riGenValidateGeneratedCell)(RendInstGenData *rgl, RendInstGenData::CellRtData *crt, int idx, int cx, int cz) = nullptr; +void (*rendinst::do_delayed_ri_extra_destruction)() = nullptr; diff --git a/prog/gameLibs/rendInst/render/clipShadows.cpp b/prog/gameLibs/rendInst/render/clipShadows.cpp index 8d92d77d7..a5adf56d0 100644 --- a/prog/gameLibs/rendInst/render/clipShadows.cpp +++ b/prog/gameLibs/rendInst/render/clipShadows.cpp @@ -113,10 +113,10 @@ void initClipmapShadows() rendinstShadowsToClipmapShaderElem = rendinstShadowsToClipmapShaderMaterial->make_elem(); debug("rendinst clip shadows tex instancing count is %d", count); - rendinstShadowsToClipmapVb = d3d::create_vb(size, SBCF_MAYBELOST, "rendinstShadowsToClipmapVb"); + rendinstShadowsToClipmapVb = d3d::create_vb(size, 0, "rendinstShadowsToClipmapVb"); G_ASSERT(rendinstShadowsToClipmapVb != nullptr); - rendinstShadowsToClipmapIb = d3d::create_ib(6 * sizeof(uint16_t) * count, SBCF_MAYBELOST, "rendinstShadowsToClipmapIb"); + rendinstShadowsToClipmapIb = d3d::create_ib(6 * sizeof(uint16_t) * count, 0, "rendinstShadowsToClipmapIb"); fill_buffers(); blurOffset01VarId = get_shader_variable_id("blur_offset_0_1"); @@ -322,7 +322,7 @@ bool render_clipmap_shadow_pool(rendinst::render::RtPoolData &pool, RenderableIn ShaderGlobal::setBlock(rendinst::render::globalFrameBlockId, ShaderGlobal::LAYER_FRAME); ShaderGlobal::setBlock(rendinst::render::rendinstSceneBlockId, ShaderGlobal::LAYER_SCENE); - d3d::set_buffer(STAGE_VS, rendinst::render::INSTANCING_TEXREG, rendinst::render::oneInstanceTmVb); + d3d::set_buffer(STAGE_VS, rendinst::render::instancingTexRegNo, rendinst::render::oneInstanceTmVb); rendinst::render::RiShaderConstBuffers cb; cb.setBBoxZero(); 
cb.setOpacity(0, 1); @@ -579,7 +579,7 @@ void RendInstGenData::renderRendinstShadowsToClipmap(const BBox2 ®ion, int ne uint32_t flag = newForCascadeNo >= 0 ? (RendInstGenData::CellRtData::CLIPMAP_SHADOW_RENDERED << newForCascadeNo) : 0; rendinst::render::RiShaderConstBuffers cb; - d3d::set_buffer(STAGE_VS, rendinst::render::INSTANCING_TEXREG, rtData->cellsVb.getHeap().getBuf()); + d3d::set_buffer(STAGE_VS, rendinst::render::instancingTexRegNo, rtData->cellsVb.getHeap().getBuf()); auto currentHeapGen = rtData->cellsVb.getManager().getHeapGeneration(); for (int z = regions[1], cellI = regions[1] * cellNumW + regions[0]; z <= regions[3]; z++, cellI += cellXStride) diff --git a/prog/gameLibs/rendInst/render/depthShadows.cpp b/prog/gameLibs/rendInst/render/depthShadows.cpp index 9c3171e64..f5dd7101b 100644 --- a/prog/gameLibs/rendInst/render/depthShadows.cpp +++ b/prog/gameLibs/rendInst/render/depthShadows.cpp @@ -201,7 +201,12 @@ int RendInstGenData::RtData::renderRendinstGlobalShadowsToTextures(const Point3 // vec4f eye = v_mul(sunDirV, v_splats(pool.sphereRadius)); viewitm.col2 = sunDirV; - viewitm.col0 = v_norm3(v_cross3(V_C_UNIT_0100, viewitm.col2)); + viewitm.col0 = v_cross3(V_C_UNIT_0100, viewitm.col2); + vec3f col0_len = v_length3(viewitm.col0); + if (v_extract_x(col0_len) > 2e-6f) + viewitm.col0 = v_div(viewitm.col0, col0_len); + else + viewitm.col0 = v_norm3(v_cross3(V_C_UNIT_0010, viewitm.col2)); viewitm.col1 = v_cross3(viewitm.col2, viewitm.col0); if (pool.shadowImpostorUpdatePhase == PHASE_LOW_PASS_RENDER) @@ -252,7 +257,7 @@ int RendInstGenData::RtData::renderRendinstGlobalShadowsToTextures(const Point3 d3d::settm(TM_VIEW, view); rendinst::render::setCoordType(riPosInst[poolNo] ? 
rendinst::render::COORD_TYPE_POS : rendinst::render::COORD_TYPE_TM); - d3d::set_buffer(STAGE_VS, rendinst::render::INSTANCING_TEXREG, rendinst::render::oneInstanceTmVb); + d3d::set_buffer(STAGE_VS, rendinst::render::instancingTexRegNo, rendinst::render::oneInstanceTmVb); if (pool.hasImpostor()) cb.setInstancing(3, 1, diff --git a/prog/gameLibs/rendInst/render/extraRender.cpp b/prog/gameLibs/rendInst/render/extraRender.cpp index 865208145..aa80ff9e3 100644 --- a/prog/gameLibs/rendInst/render/extraRender.cpp +++ b/prog/gameLibs/rendInst/render/extraRender.cpp @@ -30,6 +30,7 @@ float rendinst::riExtraLodDistSqMul = 1.f; float rendinst::riExtraCullDistSqMul = 1.f; float rendinst::render::riExtraMinSizeForReflection = 25.f; float rendinst::render::riExtraMinSizeForDraftDepth = 25.f; +int rendinst::render::instancingTexRegNo = -1; rendinst::render::VbExtraCtx rendinst::render::vbExtraCtx[2]; UniqueBufHolder rendinst::render::perDrawData; @@ -1376,7 +1377,7 @@ void rendinst::render::renderRIGenExtra(const RiGenVisibility &vbase, RenderPass if (needToSetBlock) ShaderGlobal::setBlock(blockToSet, ShaderGlobal::LAYER_SCENE); - d3d::set_buffer(STAGE_VS, rendinst::render::INSTANCING_TEXREG, vb->getRenderBuf()); + d3d::set_buffer(STAGE_VS, rendinst::render::instancingTexRegNo, vb->getRenderBuf()); dag::ConstSpan riResOrder = v.riexPoolOrder; if (layer == LayerFlag::Transparent || layer == LayerFlag::Decals || layer == LayerFlag::Distortion) @@ -1444,7 +1445,7 @@ void rendinst::render::renderRIGenExtraSortedTransparentInstanceElems(const RiGe if (needToSetBlock) ShaderGlobal::setBlock(blockToSet, ShaderGlobal::LAYER_SCENE); - d3d::set_buffer(STAGE_VS, rendinst::render::INSTANCING_TEXREG, vb->getRenderBuf()); + d3d::set_buffer(STAGE_VS, rendinst::render::instancingTexRegNo, vb->getRenderBuf()); dag::ConstSpan riResOrder = riExPoolIdxPerStage[get_layer_index(rendinst::LayerFlag::Transparent)]; @@ -1536,7 +1537,7 @@ void rendinst::render::renderRIGenExtraFromBuffer(Sbuffer *buffer, 
dag::ConstSpa ShaderGlobal::set_int(rendinst::render::rendinstRenderPassVarId, eastl::to_underlying(render_pass)); - d3d::set_buffer(STAGE_VS, rendinst::render::INSTANCING_TEXREG, buffer); + d3d::set_buffer(STAGE_VS, rendinst::render::instancingTexRegNo, buffer); if (gpu_instancing) d3d::set_buffer(STAGE_VS, rendinst::render::GPU_INSTANCING_OFSBUFFER_TEXREG, ofs_buffer); diff --git a/prog/gameLibs/rendInst/render/genRender.cpp b/prog/gameLibs/rendInst/render/genRender.cpp index 3135fb150..87b9a1a84 100644 --- a/prog/gameLibs/rendInst/render/genRender.cpp +++ b/prog/gameLibs/rendInst/render/genRender.cpp @@ -358,7 +358,7 @@ static void allocateRendInstVBs() debug("perDrawInstanceData %d (%d)", elements, (int)useStructuredBind); rendinst::render::perDrawData = - dag::create_sbuffer(structSize, elements, SBCF_MAYBELOST | SBCF_BIND_SHADER_RES | (useStructuredBind ? SBCF_MISC_STRUCTURED : 0), + dag::create_sbuffer(structSize, elements, SBCF_BIND_SHADER_RES | (useStructuredBind ? SBCF_MISC_STRUCTURED : 0), useStructuredBind ? 0 : TEXFMT_A32B32G32R32F, "perDrawInstanceData"); #if !D3D_HAS_QUADS @@ -843,7 +843,7 @@ void RendInstGenData::CellRtData::clear() RendInstGenData::RtData::RtData(int layer_idx) : cellsVb(SbufferHeapManager(String(128, "cells_vb_%d", layer_idx), //-V730 - RENDER_ELEM_SIZE, SBCF_MAYBELOST | SBCF_BIND_SHADER_RES | (rendinst::render::useCellSbuffer ? SBCF_MISC_STRUCTURED : 0), + RENDER_ELEM_SIZE, SBCF_BIND_SHADER_RES | (rendinst::render::useCellSbuffer ? SBCF_MISC_STRUCTURED : 0), rendinst::render::useCellSbuffer ? 0 : (RENDINST_FLOAT_POS ? 
rendinst::render::unpacked_format : rendinst::render::packed_format))) { cellsVb.getManager().setShouldCopyToNewHeap(false); @@ -1236,8 +1236,7 @@ void RendInstGenData::renderPerInstance(rendinst::RenderPass render_pass, int lo if (lodI > visibility.PI_LAST_MESH_LOD || (rtData->rtPoolData[ri_idx]->hasTransitionLod() && lodI == visibility.PI_LAST_MESH_LOD)) { - rtData->rtPoolData[ri_idx]->setImpostor(cb, render_pass == rendinst::RenderPass::ToShadow, - rtData->riRes[ri_idx]->getPreshadowTexture()); + rtData->rtPoolData[ri_idx]->setImpostor(cb, render_pass == rendinst::RenderPass::ToShadow); } else rtData->rtPoolData[ri_idx]->setNoImpostor(cb); @@ -1296,8 +1295,7 @@ void RendInstGenData::renderCrossDissolve(rendinst::RenderPass render_pass, int } if (lodI > RiGenVisibility::PI_LAST_MESH_LOD) { - rtData->rtPoolData[ri_idx]->setImpostor(cb, render_pass == rendinst::RenderPass::ToShadow, - rtData->riRes[ri_idx]->getPreshadowTexture()); + rtData->rtPoolData[ri_idx]->setImpostor(cb, render_pass == rendinst::RenderPass::ToShadow); } else rtData->rtPoolData[ri_idx]->setNoImpostor(cb); @@ -1530,7 +1528,7 @@ void RendInstGenData::renderByCells(rendinst::RenderPass render_pass, const rend #endif rtData->updateVbResetCS.lock(); - d3d::set_buffer(STAGE_VS, rendinst::render::INSTANCING_TEXREG, rtData->cellsVb.getHeap().getBuf()); + d3d::set_buffer(STAGE_VS, rendinst::render::instancingTexRegNo, rtData->cellsVb.getHeap().getBuf()); auto currentHeapGen = rtData->cellsVb.getManager().getHeapGeneration(); G_UNUSED(currentHeapGen); LinearHeapAllocatorSbuffer::Region lastInfo = {}; @@ -1625,8 +1623,7 @@ void RendInstGenData::renderByCells(rendinst::RenderPass render_pass, const rend if (lodI == rtData->riResLodCount(ri_idx) - 1 || // impostor lod, all impostored rendinsts are put to impostor lod only to shadow (rtData->rtPoolData[ri_idx]->hasTransitionLod() && lodI == rtData->riResLodCount(ri_idx) - 2)) { - rtData->rtPoolData[ri_idx]->setImpostor(cb, render_pass == 
rendinst::RenderPass::ToShadow, - rtData->riRes[ri_idx]->getPreshadowTexture()); + rtData->rtPoolData[ri_idx]->setImpostor(cb, render_pass == rendinst::RenderPass::ToShadow); // all impostored rendinsts are rendered with impostor only } diff --git a/prog/gameLibs/rendInst/render/genRender.h b/prog/gameLibs/rendInst/render/genRender.h index 8171f4cb2..5f0beacce 100644 --- a/prog/gameLibs/rendInst/render/genRender.h +++ b/prog/gameLibs/rendInst/render/genRender.h @@ -273,7 +273,7 @@ class RtPoolData setImpostorParams(cb, impostor.shadowImpostorWd, impostor.shadowImpostorHt); } - void setImpostor(RiShaderConstBuffers &cb, bool forShadow, BaseTexture *baked_preshadow = nullptr) const + void setImpostor(RiShaderConstBuffers &cb, bool forShadow) const { if (!hasImpostor()) { @@ -285,23 +285,6 @@ class RtPoolData d3d::settex(dynamic_impostor_texture_const_no + DYNAMIC_IMPOSTOR_TEX_SHADOW_OFFSET, rendinstGlobalShadowTex); setShadowImpostorBoundingSphere(cb); } - else - { - if (baked_preshadow != nullptr) - { - d3d::settex(dynamic_impostor_texture_const_no + DYNAMIC_IMPOSTOR_TEX_SHADOW_OFFSET, baked_preshadow); - } - else - { - for (int j = 0; j < impostor.tex.size(); ++j) - { - if (!impostor.tex[j].getBaseTex()) - break; - d3d::settex(dynamic_impostor_texture_const_no + j, impostor.tex[j].getBaseTex()); - } - } - setDynamicImpostorBoundingSphere(cb); - } ShaderElement::invalidate_cached_state_block(); } }; diff --git a/prog/gameLibs/rendInst/render/impostor.cpp b/prog/gameLibs/rendInst/render/impostor.cpp index ff91ca88c..37801a117 100644 --- a/prog/gameLibs/rendInst/render/impostor.cpp +++ b/prog/gameLibs/rendInst/render/impostor.cpp @@ -24,17 +24,10 @@ int dynamicImpostorViewXVarId = -1; int dynamicImpostorViewYVarId = -1; bool impostorPreshadowNeedUpdate = false; -static bool use_impostors_compression = false; -static bool was_impostors_compression = false; -static bool is_render_target_size_changed = false; -static uint32_t maxAlbedoRTSizeMult = 1; static int 
impostor_tex_count = rendinst::render::IMP_COUNT; -static uint32_t impostor_compressed_texformats[rendinst::render::IMP_COUNT] = {TEXFMT_DXT5 | TEXCF_SRGBREAD, TEXFMT_DXT5, TEXFMT_DXT5}; static uint32_t impostor_texformats[rendinst::render::IMP_COUNT] = {TEXCF_SRGBREAD | TEXCF_SRGBWRITE, 0, 0}; static E3DCOLOR impostor_clear_color[rendinst::render::IMP_COUNT] = {0x00000000, 0x00FFFFFF, 0xFFFFFFFF}; UniqueTex impostorColorTexture[rendinst::render::IMP_COUNT]; -UniqueTex impostorCompressionBuffers[rendinst::render::IMP_COUNT]; -BcCompressor *bcCompressors[rendinst::render::IMP_COUNT] = {nullptr, nullptr, nullptr}; PostFxRenderer *postfxBuildMip = nullptr; static shaders::UniqueOverrideStateId impostorShadowOverride; @@ -70,33 +63,10 @@ static unsigned int dynamicImpostorsPerFrame = 100; void rendinst::set_billboards_vertical(bool is_vertical) { rendinst::render::vertical_billboards = is_vertical; } -bool rendinst::enable_impostors_compression(bool enabled) -{ - if (!rendinst::render::bcCompressors[0]) - enabled = false; - - bool ov = rendinst::render::use_impostors_compression; - rendinst::render::use_impostors_compression = enabled; - debug("rendinst:enable_impostors_compression: %d", enabled); - return ov; -} - -void rendinst::setImpostorDiffuseSizeMul(int value) -{ - if (rendinst::render::maxAlbedoRTSizeMult != value) - { - rendinst::render::maxAlbedoRTSizeMult = value; - rendinst::render::is_render_target_size_changed = true; - } -} - void initImpostorsTempTextures() { - int texWidth = rendinst::render::use_impostors_compression - ? 
rendinst::render::MAX_DYNAMIC_IMPOSTOR_TEX_SIZE * rendinst::render::maxAlbedoRTSizeMult - : rendinst::render::MAX_DYNAMIC_IMPOSTOR_TEX_SIZE; + int texWidth = rendinst::render::MAX_DYNAMIC_IMPOSTOR_TEX_SIZE; int texHeight = texWidth * IMPOSTOR_MAX_ASPECT_RATIO; - int numMips = get_log2i(max(texWidth, texHeight)) + 1; if (rendinst::render::use_color_padding && !rendinst::render::impostorColorTexture[0].getTex2D()) { @@ -109,51 +79,8 @@ void initImpostorsTempTextures() rendinst::render::impostorColorTexture[i]->texaddrv(TEXADDR_CLAMP); } } - - if (rendinst::render::use_impostors_compression && !rendinst::render::impostorCompressionBuffers[0].getTex2D()) - { - for (int i = 0, e = rendinst::render::impostor_tex_count; i < e; ++i) - { - int flags = TEXCF_RTARGET | rendinst::render::impostor_texformats[i]; - rendinst::render::impostorCompressionBuffers[i] = - dag::create_tex(nullptr, texWidth, texHeight, flags, numMips, String(0, "impostorCompressionBuffer%i", i).str()); - rendinst::render::impostorCompressionBuffers[i]->texfilter(TEXFILTER_POINT); - rendinst::render::impostorCompressionBuffers[i]->texaddru(TEXADDR_CLAMP); - rendinst::render::impostorCompressionBuffers[i]->texaddrv(TEXADDR_CLAMP); - } - } } -static inline BcCompressor::ECompressionType get_compressor(uint32_t i) -{ - const uint32_t fmt = rendinst::render::impostor_compressed_texformats[i]; - const uint32_t fmt2 = fmt & TEXFMT_MASK; - if (fmt2 == TEXFMT_DXT1) - return BcCompressor::COMPRESSION_BC1; - else if (fmt2 == TEXFMT_DXT5) - return BcCompressor::COMPRESSION_BC3; - else if (fmt2 == TEXFMT_ATI1N) - return BcCompressor::COMPRESSION_BC4; - else if (fmt2 == TEXFMT_ATI2N) - return BcCompressor::COMPRESSION_BC5; - return BcCompressor::COMPRESSION_BC3; -} - -static inline const char *get_compressor_shader(uint32_t i) -{ - const uint32_t fmt = rendinst::render::impostor_compressed_texformats[i]; - const uint32_t fmt2 = fmt & TEXFMT_MASK; - const bool srgb = fmt & TEXCF_SRGBREAD ? 
true : false; - if (fmt2 == TEXFMT_DXT1) - return srgb ? "bc1_srgbwrite_compressor" : "bc1_compressor"; - else if (fmt2 == TEXFMT_DXT5) - return srgb ? "bc3_srgbwrite_compressor" : "bc3_compressor"; - else if (fmt2 == TEXFMT_ATI1N) - return "bc4_compressor"; - else if (fmt2 == TEXFMT_ATI2N) - return "bc5_compressor"; - return "bc3_compressor"; -} static inline uint32_t get_format(const char *fmt) { const bool srgb = strstr(fmt, "srgb") != 0; @@ -198,12 +125,9 @@ void initImpostorsGlobals() rendinst::render::impostor_tex_count = graphics->getInt("impostorTexCount", compatibilityMode ? 1 : rendinst::render::IMP_COUNT); rendinst::render::impostor_texformats[0] = get_format(graphics->getStr("rendinstImpostorTex0", "argb8_srgb")); - rendinst::render::impostor_compressed_texformats[0] = get_format(graphics->getStr("rendinstImpostorCompressedTex0", "dxt5_srgb")); for (int i = 1; i < rendinst::render::impostor_tex_count; ++i) { rendinst::render::impostor_texformats[i] = get_format(graphics->getStr(String(0, "rendinstImpostorTex%d", i).c_str(), "argb8")); - rendinst::render::impostor_compressed_texformats[i] = - get_format(graphics->getStr(String(0, "rendinstImpostorCompressedTex%d", i).c_str(), "dxt5")); } for (int i = 0; i < rendinst::render::impostor_tex_count; ++i) { @@ -234,31 +158,6 @@ void initImpostorsGlobals() if (!rendinst::render::postfxBuildMip) rendinst::render::use_color_padding = false; - bool compressionAvailable = false; - if (BcCompressor::isAvailable(BcCompressor::COMPRESSION_BC3)) - { - rendinst::render::bcCompressors[0] = new BcCompressor(get_compressor(0), 0, 0, 0, 1, get_compressor_shader(0)); - compressionAvailable = rendinst::render::bcCompressors[0]->isValid(); - for (int i = 1; i < rendinst::render::impostor_tex_count && compressionAvailable; ++i) - { - rendinst::render::bcCompressors[i] = new BcCompressor(get_compressor(i), 0, 0, 0, 1, get_compressor_shader(i)); - if (!rendinst::render::bcCompressors[i]->isValid()) - { - compressionAvailable = 
false; - break; - } - } - } - - if (!compressionAvailable) - { - for (int i = 0; i < rendinst::render::IMP_COUNT; ++i) - del_it(rendinst::render::bcCompressors[i]); - - rendinst::render::use_impostors_compression = false; - } - - debug("impostor compression is available: %d", compressionAvailable); initImpostorsTempTextures(); @@ -274,8 +173,6 @@ void termImpostorsGlobals() for (int i = 0; i < rendinst::render::impostor_tex_count; ++i) { rendinst::render::impostorColorTexture[i].close(); - rendinst::render::impostorCompressionBuffers[i].close(); - del_it(rendinst::render::bcCompressors[i]); } rendinst::render::impostorDepthTextures.clear(); } @@ -408,9 +305,7 @@ void RendInstGenData::RtData::initImpostors() rtSize = get_bigger_pow2(rtSize); - int maximumRTSize = rendinst::render::use_impostors_compression - ? rendinst::render::MAX_DYNAMIC_IMPOSTOR_TEX_SIZE * rendinst::render::maxAlbedoRTSizeMult - : rendinst::render::MAX_DYNAMIC_IMPOSTOR_TEX_SIZE; + int maximumRTSize = rendinst::render::MAX_DYNAMIC_IMPOSTOR_TEX_SIZE; rtSize = clamp(rtSize, (int)rendinst::render::MIN_DYNAMIC_IMPOSTOR_TEX_SIZE, maximumRTSize); int rtSizeY = rtSize; if (bboxAspectRatio > 1.75) @@ -428,12 +323,8 @@ void RendInstGenData::RtData::initImpostors() _snprintf(name, sizeof(name), "impostor_color_RT_for_%d_%p", poolNo, this); name[sizeof(name) - 1] = 0; unsigned int texflags = TEXCF_RTARGET; - if (rendinst::render::use_impostors_compression) - texflags = TEXCF_CLEAR_ON_CREATE; - const unsigned int texflagsColor = - texflags | (rendinst::render::use_impostors_compression ? 
rendinst::render::impostor_compressed_texformats[0] - : rendinst::render::impostor_texformats[0]); + const unsigned int texflagsColor = texflags | rendinst::render::impostor_texformats[0]; pool.impostor.renderMips = IMPOSTOR_NOAUTO_MIPS; pool.impostor.numColorTexMips = pool.impostor.renderMips; @@ -465,8 +356,7 @@ void RendInstGenData::RtData::initImpostors() name[sizeof(name) - 1] = 0; pool.impostor.tex[i].close(); - const uint32_t fmt = (rendinst::render::use_impostors_compression ? rendinst::render::impostor_compressed_texformats[i] - : rendinst::render::impostor_texformats[i]); + const uint32_t fmt = rendinst::render::impostor_texformats[i]; pool.impostor.tex[i] = dag::create_tex(nullptr, rtSize, rtSizeY, texflags | fmt, pool.impostor.renderMips, name); d3d_err(pool.impostor.tex[i].getBaseTex()); pool.impostor.tex[i].getBaseTex()->texaddr(TEXADDR_CLAMP); @@ -577,7 +467,7 @@ bool RendInstGenData::RtData::updateImpostorsPreshadow(int poolNo, const Point3 d3d::settm(TM_PROJ, &orthoProj); TMatrix view34; - memcpy(&view34, &view, sizeof(view34)); + v_mat_43cu_from_mat44(&view34[0][0], view); TMatrix4 globtm; d3d::calcglobtm(view34, orthoProj, globtm); @@ -670,9 +560,9 @@ bool RendInstGenData::RtData::updateImpostorsPreshadow(int poolNo, const Point3 UniqueTex depthAtlas = get_impostor_texture_mgr()->renderDepthAtlasForShadow(riRes[poolNo]); - d3d::set_buffer(STAGE_VS, rendinst::render::INSTANCING_TEXREG, nullptr); + d3d::set_buffer(STAGE_VS, rendinst::render::instancingTexRegNo, nullptr); bool updated = fill_palette_vb(palette.count, palette.rotations); - d3d::set_buffer(STAGE_VS, rendinst::render::INSTANCING_TEXREG, rendinst::render::rotationPaletteTmVb); + d3d::set_buffer(STAGE_VS, rendinst::render::instancingTexRegNo, rendinst::render::rotationPaletteTmVb); if (!updated) return false; for (int i = 0; i < palette.count; ++i) @@ -723,8 +613,7 @@ static void impostorMipSRVBarrier(rendinst::render::RtPoolData &pool, int mip) { for (int j = 0; j < 
pool.impostor.tex.size(); ++j) { - UniqueTex &tex = - rendinst::render::use_impostors_compression ? rendinst::render::impostorCompressionBuffers[j] : pool.impostor.tex[j]; + UniqueTex &tex = pool.impostor.tex[j]; BaseTexture *pTex = tex.getBaseTex(); d3d::resource_barrier({pTex, RB_RO_SRV | RB_STAGE_PIXEL, (unsigned)(mip), 1}); } @@ -735,18 +624,9 @@ void renderImpostorMips(rendinst::render::RtPoolData &pool, int currentRenderMip PostFxRenderer *pFx = rendinst::render::postfxBuildMip; if (!pFx) { - if (rendinst::render::use_impostors_compression) - { - for (int j = 0; j < rendinst::render::impostor_tex_count; ++j) - if (rendinst::render::impostorCompressionBuffers[j].getTex2D()) - rendinst::render::impostorCompressionBuffers[j].getTex2D()->generateMips(); - } - else - { - for (int j = 0; j < pool.impostor.tex.size(); ++j) - if (pool.impostor.tex[j].getBaseTex()) - pool.impostor.tex[j].getBaseTex()->generateMips(); - } + for (int j = 0; j < pool.impostor.tex.size(); ++j) + if (pool.impostor.tex[j].getBaseTex()) + pool.impostor.tex[j].getBaseTex()->generateMips(); return; } @@ -774,8 +654,7 @@ void renderImpostorMips(rendinst::render::RtPoolData &pool, int currentRenderMip for (int j = 0; j < pool.impostor.tex.size(); ++j) { - UniqueTex &tex = - rendinst::render::use_impostors_compression ? rendinst::render::impostorCompressionBuffers[j] : pool.impostor.tex[j]; + UniqueTex &tex = pool.impostor.tex[j]; BaseTexture *pTex = tex.getBaseTex(); if (pTex) { @@ -785,8 +664,7 @@ void renderImpostorMips(rendinst::render::RtPoolData &pool, int currentRenderMip } } - UniqueTex &tex = - rendinst::render::use_impostors_compression ? 
rendinst::render::impostorCompressionBuffers[0] : pool.impostor.tex[0]; + UniqueTex &tex = pool.impostor.tex[0]; TextureInfo ti; tex.getBaseTex()->getinfo(ti, src_mip); ShaderGlobal::set_color4(rendinst::render::texelSizeVarId, 1.f, 1.f, 1.f / ti.w, 1.f / ti.h); @@ -807,31 +685,6 @@ void renderImpostorMips(rendinst::render::RtPoolData &pool, int currentRenderMip pool.impostor.tex[j].getBaseTex()->texmiplevel(pool.impostor.baseMip, lastMip); pool.impostor.tex[j].getBaseTex()->texfilter(TEXFILTER_DEFAULT); } - if (rendinst::render::use_impostors_compression) - rendinst::render::impostorCompressionBuffers[j].getTex2D()->texmiplevel(pool.impostor.baseMip, lastMip); - } -} - -void compressImpostor(rendinst::render::RtPoolData &pool) -{ - for (int i = 0, e = pool.impostor.tex.size(); i < e; ++i) - { - int texWidth = (i == 0) ? rendinst::render::MAX_DYNAMIC_IMPOSTOR_TEX_SIZE * rendinst::render::maxAlbedoRTSizeMult - : rendinst::render::MAX_DYNAMIC_IMPOSTOR_TEX_SIZE; - int texHeight = texWidth * IMPOSTOR_MAX_ASPECT_RATIO; - int bufMips = (get_log2i(max(texWidth / 4, texHeight / 4)) + 1); - BcCompressor *compr = rendinst::render::bcCompressors[i]; - compr->resetBuffer(bufMips, texWidth, texHeight, 1); - - for (int j = pool.impostor.baseMip; j < pool.impostor.renderMips; ++j) - { - TextureInfo ti; - BaseTexture *dstTex = pool.impostor.tex[i].getBaseTex(); - dstTex->getinfo(ti, j); - int scrMipToAdd = (i != 0 && rendinst::render::maxAlbedoRTSizeMult > 1) ? 
1 : 0; - compr->updateFromMip(rendinst::render::impostorCompressionBuffers[i].getTexId(), j + scrMipToAdd, j); - compr->copyToMip(dstTex, j, 0, 0, j, 0, 0, ti.w, ti.h); - } } } @@ -880,8 +733,8 @@ void RendInstGenData::RtData::updateImpostors(float shadowDistance, const Point3 cb.setBBoxZero(); rendinst::render::startRenderInstancing(); - d3d::set_buffer(STAGE_VS, rendinst::render::INSTANCING_TEXREG, nullptr); - d3d::set_buffer(STAGE_VS, rendinst::render::INSTANCING_TEXREG, rendinst::render::rotationPaletteTmVb); + d3d::set_buffer(STAGE_VS, rendinst::render::instancingTexRegNo, nullptr); + d3d::set_buffer(STAGE_VS, rendinst::render::instancingTexRegNo, rendinst::render::rotationPaletteTmVb); d3d::set_immediate_const(STAGE_VS, ZERO_PTR(), 1); int numUpdated = 0; @@ -1022,9 +875,7 @@ void RendInstGenData::RtData::updateImpostors(float shadowDistance, const Point3 } // required to be set before block, soopengl driver use correct axis flipping - d3d::set_render_target(rendinst::render::use_impostors_compression ? 
rendinst::render::impostorCompressionBuffers[0].getTex2D() - : pool.impostor.tex[0].getBaseTex(), - 0); + d3d::set_render_target(pool.impostor.tex[0].getBaseTex(), 0); d3d::set_backbuf_depth(); ShaderGlobal::setBlock(rendinst::render::globalFrameBlockId, ShaderGlobal::LAYER_FRAME); ShaderGlobal::setBlock(rendinst::render::rendinstSceneBlockId, ShaderGlobal::LAYER_SCENE); @@ -1045,8 +896,6 @@ void RendInstGenData::RtData::updateImpostors(float shadowDistance, const Point3 { if (rendinst::render::use_color_padding) dstRT[j] = &rendinst::render::impostorColorTexture[j]; - else if (rendinst::render::use_impostors_compression) - dstRT[j] = &rendinst::render::impostorCompressionBuffers[j]; else dstRT[j] = &pool.impostor.tex[j]; } @@ -1110,7 +959,7 @@ void RendInstGenData::RtData::updateImpostors(float shadowDistance, const Point3 // set viewport float viewportPartX = 1.f, viewportPartY = 1.f; - if (rendinst::render::use_color_padding || rendinst::render::use_impostors_compression) + if (rendinst::render::use_color_padding) { TextureInfo tinfo; pool.impostor.tex[0].getBaseTex()->getinfo(tinfo, baseMip); @@ -1120,8 +969,7 @@ void RendInstGenData::RtData::updateImpostors(float shadowDistance, const Point3 d3d::setview(0, 0, vpWidth, vpHeight, 0.f, 1.f); TextureInfo tempRTInfo; - UniqueTex &tex = rendinst::render::use_impostors_compression ? rendinst::render::impostorCompressionBuffers[0] - : rendinst::render::impostorColorTexture[0]; + UniqueTex &tex = rendinst::render::impostorColorTexture[0]; tex.getTex2D()->getinfo(tempRTInfo, 0); G_ASSERT(vpWidth <= tempRTInfo.w && vpHeight <= tempRTInfo.h); @@ -1175,8 +1023,7 @@ void RendInstGenData::RtData::updateImpostors(float shadowDistance, const Point3 for (int j = 0; j < pool.impostor.tex.size(); ++j) { - UniqueTex &tex = - rendinst::render::use_impostors_compression ? 
rendinst::render::impostorCompressionBuffers[j] : pool.impostor.tex[j]; + UniqueTex &tex = pool.impostor.tex[j]; BaseTexture *pTex = tex.getBaseTex(); if (pTex) { @@ -1185,20 +1032,12 @@ void RendInstGenData::RtData::updateImpostors(float shadowDistance, const Point3 } } TextureInfo ti; - UniqueTex &tex = rendinst::render::use_impostors_compression ? rendinst::render::impostorCompressionBuffers[0] - : rendinst::render::impostorColorTexture[0]; + UniqueTex &tex = rendinst::render::impostorColorTexture[0]; tex.getBaseTex()->getinfo(ti, 0); ShaderGlobal::set_color4(rendinst::render::texelSizeVarId, viewportPartX, viewportPartY, 1.f / ti.w, 1.f / ti.h); ShaderGlobal::set_int(maxTranslucancyVarId, 0); ShaderGlobal::set_int(rendinst::render::texIndVid, -1); - if (rendinst::render::use_impostors_compression) - { - TextureInfo ti; - pool.impostor.tex[0].getBaseTex()->getinfo(ti, pool.impostor.baseMip); - d3d::setview(0, 0, ti.w, ti.h, 0, 1); - } - pFx->render(); } @@ -1209,16 +1048,10 @@ void RendInstGenData::RtData::updateImpostors(float shadowDistance, const Point3 // in RO_SRV state before exiting generation for (int j = 0; j < pool.impostor.tex.size(); ++j) { - UniqueTex &tex = - rendinst::render::use_impostors_compression ? 
rendinst::render::impostorCompressionBuffers[j] : pool.impostor.tex[j]; + UniqueTex &tex = pool.impostor.tex[j]; Texture *pTex = tex.getBaseTex(); d3d::resource_barrier({pTex, RB_RO_SRV | RB_STAGE_PIXEL, 0, 0}); } - - if (rendinst::render::use_impostors_compression) - { - compressImpostor(pool); - } } if (hasShadow && isSmallChange) @@ -1267,26 +1100,12 @@ void rendinst::updateRIGenImpostors(float shadowDistance, const Point3 &sunDir0, bool needsReset = false; - if (rendinst::render::use_impostors_compression != rendinst::render::was_impostors_compression || - rendinst::render::is_render_target_size_changed || rendinst::render::impostorPreshadowNeedUpdate) + if (rendinst::render::impostorPreshadowNeedUpdate) { - if (rendinst::render::is_render_target_size_changed) - { - for (int i = 0; i < rendinst::render::impostor_tex_count; ++i) - { - rendinst::render::impostorColorTexture[i].close(); - rendinst::render::impostorCompressionBuffers[i].close(); - } - rendinst::render::impostorDepthTextures.clear(); - rendinst::render::is_render_target_size_changed = false; - } - initImpostorsTempTextures(); needsReset = true; } - rendinst::render::was_impostors_compression = rendinst::render::use_impostors_compression; - FOR_EACH_RG_LAYER_DO (rgl) { ScopedLockRead lock(rgl->rtData->riRwCs); diff --git a/prog/gameLibs/rendInst/render/riShaderConstBuffers.cpp b/prog/gameLibs/rendInst/render/riShaderConstBuffers.cpp index ac2838c8e..9fdffd73d 100644 --- a/prog/gameLibs/rendInst/render/riShaderConstBuffers.cpp +++ b/prog/gameLibs/rendInst/render/riShaderConstBuffers.cpp @@ -12,7 +12,7 @@ static carray instancesCB = {0}; void endRenderInstancing() { - d3d::set_buffer(STAGE_VS, INSTANCING_TEXREG, 0); + d3d::set_buffer(STAGE_VS, instancingTexRegNo, 0); d3d::set_const_buffer(STAGE_VS, perinstBuffNo, nullptr); d3d::set_const_buffer(STAGE_VS, instanceBuffNo, nullptr); d3d::set_immediate_const(STAGE_VS, nullptr, 0); @@ -30,7 +30,7 @@ void init_instances_tb() for (int i = 0; i < 
instancesCB.size(); ++i) instancesCB[i] = d3d::buffers::create_one_frame_cb(MIN_INST_COUNT << i, "perInstanceData"); // instancesTB[i] = d3d::create_sbuffer(16, MIN_INST_COUNT<(get_bigger_log2(1 + (vec4_count - 1) / MIN_INST_COUNT), rendinst::render::instancesCB.size() - 1); rendinst::render::instancesCB[bin]->updateDataWithLock(0, vec4_count * sizeof(vec4f), data, VBLOCK_WRITEONLY | VBLOCK_DISCARD); - // d3d::set_buffer(STAGE_VS, INSTANCING_TEXREG, rendinst::render::instancesCB[bin]); + // d3d::set_buffer(STAGE_VS, instancingTexRegNo, rendinst::render::instancesCB[bin]); // on my 1070 ConstBuffer is proven to work faster than TB. We can actually handle a lot of instances in one CB. It may be worth // switching to CB only instancing... however, in synthetic tests both TB and SB were faster than CB when fetch 3 float4. I guess, // the difference is that we sample just ONE float4 for trees, and cache prefetching doesn't help us (like it does in TB/SB) diff --git a/prog/gameLibs/rendInst/visibility/extraVisibility.cpp b/prog/gameLibs/rendInst/visibility/extraVisibility.cpp index 0aa3ba716..0ac32a343 100644 --- a/prog/gameLibs/rendInst/visibility/extraVisibility.cpp +++ b/prog/gameLibs/rendInst/visibility/extraVisibility.cpp @@ -162,7 +162,7 @@ bool rendinst::prepareExtraVisibilityInternal(mat44f_cref globtm_cull, const Poi #define LAMBDA_BODY(forced_extra_lod_less_then_zero) \ G_UNUSED(ni); \ if (render_for_shadow && scene::check_node_flags(m, RendinstTiledScene::CHECKED_IN_SHADOWS) && \ - !scene::check_node_flags(m, RendinstTiledScene::VISIBLE_IN_SHADOWS)) \ + !scene::check_node_flags(m, RendinstTiledScene::VISIBLE_IN_SHADOWS | RendinstTiledScene::NEEDS_CHECK_IN_SHADOW)) \ return; \ if (filter_rendinst_clipmap && !scene::check_node_flags(m, RendinstTiledScene::IS_RENDINST_CLIPMAP)) \ return; \ diff --git a/prog/gameLibs/render/androidScreenRotation.cpp b/prog/gameLibs/render/androidScreenRotation.cpp index 267156d8c..23969b112 100644 --- 
a/prog/gameLibs/render/androidScreenRotation.cpp +++ b/prog/gameLibs/render/androidScreenRotation.cpp @@ -52,7 +52,6 @@ void android_screen_rotation::onFrameEnd() if (angle) { d3d::set_render_target(); - d3d::set_depth(nullptr, DepthAccess::SampledRO); d3d::clearview(CLEAR_DISCARD_TARGET, E3DCOLOR(0), 0, 0); pass.render(); } diff --git a/prog/gameLibs/render/cables/cables.cpp b/prog/gameLibs/render/cables/cables.cpp index ade97e4b2..6272a4163 100644 --- a/prog/gameLibs/render/cables/cables.cpp +++ b/prog/gameLibs/render/cables/cables.cpp @@ -168,6 +168,9 @@ void Cables::onRIExtraDestroyed(const TMatrix &tm, const BBox3 &box) { destroyed void Cables::destroyCables() { + G_ASSERTF_RETURN(!tiledArea.grid.empty() || cables.empty() || destroyedRIExtra.empty(), , + "tiledArea.grid.size()=%d cables.size()=%d destroyedRIExtra.size()=%d", // + tiledArea.grid.size(), cables.size(), destroyedRIExtra.size()); for (RIExtraInfo &riex : destroyedRIExtra) { TMatrix tm = riex.tm; @@ -175,6 +178,8 @@ void Cables::destroyCables() bbox3f riBBox; v_mat44_make_from_43cu_unsafe(riTm, tm.array); v_bbox3_init(riBBox, riTm, v_ldu_bbox3(riex.box)); + G_ASSERT_CONTINUE(!isnan(v_extract_x(riBBox.bmin)) && !isnan(v_extract_x(riBBox.bmax)) && !isnan(v_extract_z(riBBox.bmin)) && + !isnan(v_extract_z(riBBox.bmax))); // inverse matrix usage is more accurate, but seems it works well as is // TMatrix invTm = inverse(tm); int i_start = max((v_extract_x(riBBox.bmin) - tiledArea.gridBoundMin.x) / tiledArea.tileSize.x, 0); diff --git a/prog/gameLibs/render/clusteredLights.cpp b/prog/gameLibs/render/clusteredLights.cpp index 9363a39a3..2d4846364 100644 --- a/prog/gameLibs/render/clusteredLights.cpp +++ b/prog/gameLibs/render/clusteredLights.cpp @@ -64,7 +64,7 @@ void ClusteredLights::validateDensity(uint32_t words) String name(128, "lights_full_grid_%d", i); lightsFullGridCB[i].close(); lightsFullGridCB[i] = dag::create_sbuffer(sizeof(uint32_t), CLUSTERS_PER_GRID * allocatedWords, - SBCF_DYNAMIC | 
SBCF_MAYBELOST | SBCF_CPU_ACCESS_WRITE | SBCF_BIND_SHADER_RES | SBCF_MISC_STRUCTURED, 0, name); + SBCF_DYNAMIC | SBCF_CPU_ACCESS_WRITE | SBCF_BIND_SHADER_RES | SBCF_MISC_STRUCTURED, 0, name); } } @@ -631,10 +631,10 @@ void ClusteredLights::initConeSphere() static constexpr uint32_t SLICES = 5; calc_sphere_vertex_face_count(SLICES, SLICES, false, v_count, f_count); coneSphereVb.close(); - coneSphereVb = dag::create_vb((v_count + 5) * sizeof(Point3), SBCF_MAYBELOST, "coneSphereVb"); + coneSphereVb = dag::create_vb((v_count + 5) * sizeof(Point3), 0, "coneSphereVb"); d3d_err((bool)coneSphereVb); coneSphereIb.close(); - coneSphereIb = dag::create_ib((f_count + 6) * 6, SBCF_MAYBELOST, "coneSphereIb"); + coneSphereIb = dag::create_ib((f_count + 6) * 6, 0, "coneSphereIb"); d3d_err((bool)coneSphereIb); LockedBuffer indicesLocked = lock_sbuffer(coneSphereIb.getBuf(), 0, 0, VBLOCK_WRITEONLY); @@ -1401,6 +1401,5 @@ void ClusteredLights::setNeedSsss(bool need_ssss) spotLightSsssShadowDescBuffer.close(); if (need_ssss) spotLightSsssShadowDescBuffer = dag::create_sbuffer(sizeof(SpotlightShadowDescriptor), MAX_SPOT_LIGHTS, - SBCF_DYNAMIC | SBCF_MAYBELOST | SBCF_CPU_ACCESS_WRITE | SBCF_BIND_SHADER_RES | SBCF_MISC_STRUCTURED, 0, - "spot_lights_ssss_shadow_desc"); + SBCF_DYNAMIC | SBCF_CPU_ACCESS_WRITE | SBCF_BIND_SHADER_RES | SBCF_MISC_STRUCTURED, 0, "spot_lights_ssss_shadow_desc"); } diff --git a/prog/gameLibs/render/cur_view.cpp b/prog/gameLibs/render/cur_view.cpp new file mode 100644 index 000000000..42181d092 --- /dev/null +++ b/prog/gameLibs/render/cur_view.cpp @@ -0,0 +1,3 @@ +#include + +DagorCurView grs_cur_view; diff --git a/prog/gameLibs/render/daBfg/_aot.jam b/prog/gameLibs/render/daBfg/_aot.jam index 0ed0076e6..2a791a84c 100644 --- a/prog/gameLibs/render/daBfg/_aot.jam +++ b/prog/gameLibs/render/daBfg/_aot.jam @@ -1,14 +1,14 @@ local R = prog/gameLibs/render/daBfg ; local src = - $(R)/api/dasModules/frameGraphModule.cpp - $(R)/api/dasModules/enumerations.cpp - 
$(R)/api/dasModules/nodeDataAnnotation.cpp - $(R)/api/dasModules/nodeEcsRegistration.cpp - $(R)/api/dasModules/structureAnnotations.cpp - $(R)/api/dasModules/stub_aot.cpp + $(R)/api/das/frameGraphModule.cpp + $(R)/api/das/enumerations.cpp + $(R)/api/das/nodeDataAnnotation.cpp + $(R)/api/das/nodeEcsRegistration.cpp + $(R)/api/das/structureAnnotations.cpp + $(R)/api/das/stub_aot.cpp ; Sources += $(src) ; opt on $(src) = -I$(Root)/$(R) ; -DasToStringify += $(R)/api/dasModules/frameGraphModule.das $(R)/api/dasModules/frameGraphModule.das.inl $(R)/api/dasModules/frameGraphModule.cpp ; +DasToStringify += $(R)/api/das/frameGraphModule.das $(R)/api/das/frameGraphModule.das.inl $(R)/api/das/frameGraphModule.cpp ; DABFG_ENABLE_DAECS_INTEGRATION = yes ; DABFG_ENABLE_DAS_INTEGRATION = yes ; diff --git a/prog/gameLibs/render/daBfg/api/autoResolutionRequest.cpp b/prog/gameLibs/render/daBfg/api/cpp/autoResolutionRequest.cpp similarity index 82% rename from prog/gameLibs/render/daBfg/api/autoResolutionRequest.cpp rename to prog/gameLibs/render/daBfg/api/cpp/autoResolutionRequest.cpp index 6a74a5554..a790546a6 100644 --- a/prog/gameLibs/render/daBfg/api/autoResolutionRequest.cpp +++ b/prog/gameLibs/render/daBfg/api/cpp/autoResolutionRequest.cpp @@ -1,6 +1,6 @@ #include -#include +#include namespace dabfg diff --git a/prog/gameLibs/render/daBfg/api/bfg.cpp b/prog/gameLibs/render/daBfg/api/cpp/bfg.cpp similarity index 97% rename from prog/gameLibs/render/daBfg/api/bfg.cpp rename to prog/gameLibs/render/daBfg/api/cpp/bfg.cpp index 3ab310395..027843346 100644 --- a/prog/gameLibs/render/daBfg/api/bfg.cpp +++ b/prog/gameLibs/render/daBfg/api/cpp/bfg.cpp @@ -1,5 +1,5 @@ #include -#include +#include namespace dabfg diff --git a/prog/gameLibs/render/daBfg/api/detail/virtualResourceHandleBase.cpp b/prog/gameLibs/render/daBfg/api/cpp/detail/virtualResourceHandleBase.cpp similarity index 94% rename from prog/gameLibs/render/daBfg/api/detail/virtualResourceHandleBase.cpp rename to 
prog/gameLibs/render/daBfg/api/cpp/detail/virtualResourceHandleBase.cpp index 5ad268cf9..d03b8db82 100644 --- a/prog/gameLibs/render/daBfg/api/detail/virtualResourceHandleBase.cpp +++ b/prog/gameLibs/render/daBfg/api/cpp/detail/virtualResourceHandleBase.cpp @@ -1,5 +1,6 @@ #include -#include +#include + namespace dabfg::detail { diff --git a/prog/gameLibs/render/daBfg/api/detail/virtualResourceRequestBase.cpp b/prog/gameLibs/render/daBfg/api/cpp/detail/virtualResourceRequestBase.cpp similarity index 99% rename from prog/gameLibs/render/daBfg/api/detail/virtualResourceRequestBase.cpp rename to prog/gameLibs/render/daBfg/api/cpp/detail/virtualResourceRequestBase.cpp index c3d1f6fee..50ad909e6 100644 --- a/prog/gameLibs/render/daBfg/api/detail/virtualResourceRequestBase.cpp +++ b/prog/gameLibs/render/daBfg/api/cpp/detail/virtualResourceRequestBase.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include diff --git a/prog/gameLibs/render/daBfg/api/nameSpace.cpp b/prog/gameLibs/render/daBfg/api/cpp/nameSpace.cpp similarity index 99% rename from prog/gameLibs/render/daBfg/api/nameSpace.cpp rename to prog/gameLibs/render/daBfg/api/cpp/nameSpace.cpp index 60ac7146c..9422e222d 100644 --- a/prog/gameLibs/render/daBfg/api/nameSpace.cpp +++ b/prog/gameLibs/render/daBfg/api/cpp/nameSpace.cpp @@ -3,7 +3,7 @@ #include #include -#include +#include namespace dabfg diff --git a/prog/gameLibs/render/daBfg/api/nameSpaceRequest.cpp b/prog/gameLibs/render/daBfg/api/cpp/nameSpaceRequest.cpp similarity index 98% rename from prog/gameLibs/render/daBfg/api/nameSpaceRequest.cpp rename to prog/gameLibs/render/daBfg/api/cpp/nameSpaceRequest.cpp index eba918416..22fbfded0 100644 --- a/prog/gameLibs/render/daBfg/api/nameSpaceRequest.cpp +++ b/prog/gameLibs/render/daBfg/api/cpp/nameSpaceRequest.cpp @@ -1,5 +1,5 @@ #include -#include +#include namespace dabfg diff --git a/prog/gameLibs/render/daBfg/api/registry.cpp b/prog/gameLibs/render/daBfg/api/cpp/registry.cpp similarity index 98% rename from 
prog/gameLibs/render/daBfg/api/registry.cpp rename to prog/gameLibs/render/daBfg/api/cpp/registry.cpp index 55a11161e..cc843c1a6 100644 --- a/prog/gameLibs/render/daBfg/api/registry.cpp +++ b/prog/gameLibs/render/daBfg/api/cpp/registry.cpp @@ -1,5 +1,5 @@ #include -#include +#include namespace dabfg diff --git a/prog/gameLibs/render/daBfg/api/stateRequest.cpp b/prog/gameLibs/render/daBfg/api/cpp/stateRequest.cpp similarity index 98% rename from prog/gameLibs/render/daBfg/api/stateRequest.cpp rename to prog/gameLibs/render/daBfg/api/cpp/stateRequest.cpp index 3e106c144..c5268ae6a 100644 --- a/prog/gameLibs/render/daBfg/api/stateRequest.cpp +++ b/prog/gameLibs/render/daBfg/api/cpp/stateRequest.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include diff --git a/prog/gameLibs/render/daBfg/api/virtualPassRequest.cpp b/prog/gameLibs/render/daBfg/api/cpp/virtualPassRequest.cpp similarity index 99% rename from prog/gameLibs/render/daBfg/api/virtualPassRequest.cpp rename to prog/gameLibs/render/daBfg/api/cpp/virtualPassRequest.cpp index e017ae555..6878684c2 100644 --- a/prog/gameLibs/render/daBfg/api/virtualPassRequest.cpp +++ b/prog/gameLibs/render/daBfg/api/cpp/virtualPassRequest.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include diff --git a/prog/gameLibs/render/daBfg/api/dasModules/bindingHelper.h b/prog/gameLibs/render/daBfg/api/das/bindingHelper.h similarity index 100% rename from prog/gameLibs/render/daBfg/api/dasModules/bindingHelper.h rename to prog/gameLibs/render/daBfg/api/das/bindingHelper.h diff --git a/prog/gameLibs/render/daBfg/api/dasModules/docs/detail/function_annotation-daBfg-bfg_ecs_node.rst b/prog/gameLibs/render/daBfg/api/das/docs/detail/function_annotation-daBfg-bfg_ecs_node.rst similarity index 100% rename from prog/gameLibs/render/daBfg/api/dasModules/docs/detail/function_annotation-daBfg-bfg_ecs_node.rst rename to prog/gameLibs/render/daBfg/api/das/docs/detail/function_annotation-daBfg-bfg_ecs_node.rst diff --git 
a/prog/gameLibs/render/daBfg/api/dasModules/docs/detail/module-daBfg.rst b/prog/gameLibs/render/daBfg/api/das/docs/detail/module-daBfg.rst similarity index 100% rename from prog/gameLibs/render/daBfg/api/dasModules/docs/detail/module-daBfg.rst rename to prog/gameLibs/render/daBfg/api/das/docs/detail/module-daBfg.rst diff --git a/prog/gameLibs/render/daBfg/api/dasModules/docs/gen_docs.das b/prog/gameLibs/render/daBfg/api/das/docs/gen_docs.das similarity index 95% rename from prog/gameLibs/render/daBfg/api/dasModules/docs/gen_docs.das rename to prog/gameLibs/render/daBfg/api/das/docs/gen_docs.das index 85e92a0b8..16d850eb4 100644 --- a/prog/gameLibs/render/daBfg/api/dasModules/docs/gen_docs.das +++ b/prog/gameLibs/render/daBfg/api/das/docs/gen_docs.das @@ -20,13 +20,13 @@ def main hide_group(group_by_regex("Internal container manipulation", mod, %regex~(set|insert|emplace|getNameId|addNameId|getName|empty|clear|reserve|contains|get)$%%)); hide_group(group_by_regex("Misc", mod, %regex~(getBufView|getTexView|thisRequest|NodeHandle|unregisterNode|getTracker|using|getRegistry|getResourceProvider|register_external_node)$%%)); }] - topic_root = "{dagorRoot}/prog/gameLibs/render/daBfg/api/dasModules/docs/detail" + topic_root = "{dagorRoot}/prog/gameLibs/render/daBfg/api/das/docs/detail" document("DaScript module", mod, "{docsRoot}/das.rst", "", groups) var rstText = "" fopen("{docsRoot}/das.rst", "rb") <| $(f) if f != null rstText = fread(f) fopen("{docsRoot}/das.rst", "wb") <| $(f) - fwrite(f, "..\n This is auto generated file. See daBfg/api/dasModules/docs\n") + fwrite(f, "..\n This is auto generated file. 
See daBfg/api/das/docs\n") if f != null fwrite(f, rstText) \ No newline at end of file diff --git a/prog/gameLibs/render/daBfg/api/dasModules/docs/gen_docs.sh b/prog/gameLibs/render/daBfg/api/das/docs/gen_docs.sh similarity index 100% rename from prog/gameLibs/render/daBfg/api/dasModules/docs/gen_docs.sh rename to prog/gameLibs/render/daBfg/api/das/docs/gen_docs.sh diff --git a/prog/gameLibs/render/daBfg/api/dasModules/enumerations.cpp b/prog/gameLibs/render/daBfg/api/das/enumerations.cpp similarity index 98% rename from prog/gameLibs/render/daBfg/api/dasModules/enumerations.cpp rename to prog/gameLibs/render/daBfg/api/das/enumerations.cpp index e3e54a4f5..3b03f4d85 100644 --- a/prog/gameLibs/render/daBfg/api/dasModules/enumerations.cpp +++ b/prog/gameLibs/render/daBfg/api/das/enumerations.cpp @@ -1,4 +1,4 @@ -#include +#include #define DAS_BIND_ENUM_BOTH(enum_name, das_enum_name, ...) \ DAS_BASE_BIND_ENUM_BOTH(DAS_BIND_ENUM_QUALIFIED_HELPER, enum_name, das_enum_name, __VA_ARGS__) diff --git a/prog/gameLibs/render/daBfg/api/dasModules/frameGraphModule.cpp b/prog/gameLibs/render/daBfg/api/das/frameGraphModule.cpp similarity index 96% rename from prog/gameLibs/render/daBfg/api/dasModules/frameGraphModule.cpp rename to prog/gameLibs/render/daBfg/api/das/frameGraphModule.cpp index 802b6893b..fe9720e79 100644 --- a/prog/gameLibs/render/daBfg/api/dasModules/frameGraphModule.cpp +++ b/prog/gameLibs/render/daBfg/api/das/frameGraphModule.cpp @@ -4,12 +4,12 @@ #include #include #include -#include -#include -#include -#include +#include +#include +#include +#include #include -#include +#include namespace bind_dascript @@ -199,8 +199,8 @@ DaBfgModule::DaBfgModule() : das::Module("daBfg") das::ModuleAotType DaBfgModule::aotRequire(das::TextWriter &tw) const { - tw << "#include \n"; - tw << "#include \n"; + tw << "#include \n"; + tw << "#include \n"; tw << "#include \n"; return das::ModuleAotType::cpp; } diff --git a/prog/gameLibs/render/daBfg/api/dasModules/frameGraphModule.das 
b/prog/gameLibs/render/daBfg/api/das/frameGraphModule.das similarity index 100% rename from prog/gameLibs/render/daBfg/api/dasModules/frameGraphModule.das rename to prog/gameLibs/render/daBfg/api/das/frameGraphModule.das diff --git a/prog/gameLibs/render/daBfg/api/dasModules/frameGraphModule.h b/prog/gameLibs/render/daBfg/api/das/frameGraphModule.h similarity index 94% rename from prog/gameLibs/render/daBfg/api/dasModules/frameGraphModule.h rename to prog/gameLibs/render/daBfg/api/das/frameGraphModule.h index ddbaab3f3..22e9aaec7 100644 --- a/prog/gameLibs/render/daBfg/api/dasModules/frameGraphModule.h +++ b/prog/gameLibs/render/daBfg/api/das/frameGraphModule.h @@ -1,16 +1,16 @@ #pragma once -#include -#include +#include +#include #include #include #include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include namespace dabfg diff --git a/prog/gameLibs/render/daBfg/api/dasModules/genericBindings/fixedVectorMap.h b/prog/gameLibs/render/daBfg/api/das/genericBindings/fixedVectorMap.h similarity index 100% rename from prog/gameLibs/render/daBfg/api/dasModules/genericBindings/fixedVectorMap.h rename to prog/gameLibs/render/daBfg/api/das/genericBindings/fixedVectorMap.h diff --git a/prog/gameLibs/render/daBfg/api/dasModules/genericBindings/fixedVectorSet.h b/prog/gameLibs/render/daBfg/api/das/genericBindings/fixedVectorSet.h similarity index 100% rename from prog/gameLibs/render/daBfg/api/dasModules/genericBindings/fixedVectorSet.h rename to prog/gameLibs/render/daBfg/api/das/genericBindings/fixedVectorSet.h diff --git a/prog/gameLibs/render/daBfg/api/dasModules/genericBindings/idHierarchicalNameMap.h b/prog/gameLibs/render/daBfg/api/das/genericBindings/idHierarchicalNameMap.h similarity index 100% rename from prog/gameLibs/render/daBfg/api/dasModules/genericBindings/idHierarchicalNameMap.h rename to prog/gameLibs/render/daBfg/api/das/genericBindings/idHierarchicalNameMap.h diff --git 
a/prog/gameLibs/render/daBfg/api/dasModules/genericBindings/idIndexedMapping.h b/prog/gameLibs/render/daBfg/api/das/genericBindings/idIndexedMapping.h similarity index 100% rename from prog/gameLibs/render/daBfg/api/dasModules/genericBindings/idIndexedMapping.h rename to prog/gameLibs/render/daBfg/api/das/genericBindings/idIndexedMapping.h diff --git a/prog/gameLibs/render/daBfg/api/dasModules/genericBindings/idNameMap.h b/prog/gameLibs/render/daBfg/api/das/genericBindings/idNameMap.h similarity index 100% rename from prog/gameLibs/render/daBfg/api/dasModules/genericBindings/idNameMap.h rename to prog/gameLibs/render/daBfg/api/das/genericBindings/idNameMap.h diff --git a/prog/gameLibs/render/daBfg/api/dasModules/genericBindings/optional.h b/prog/gameLibs/render/daBfg/api/das/genericBindings/optional.h similarity index 100% rename from prog/gameLibs/render/daBfg/api/dasModules/genericBindings/optional.h rename to prog/gameLibs/render/daBfg/api/das/genericBindings/optional.h diff --git a/prog/gameLibs/render/daBfg/api/dasModules/genericBindings/relocatableFixedVector.h b/prog/gameLibs/render/daBfg/api/das/genericBindings/relocatableFixedVector.h similarity index 100% rename from prog/gameLibs/render/daBfg/api/dasModules/genericBindings/relocatableFixedVector.h rename to prog/gameLibs/render/daBfg/api/das/genericBindings/relocatableFixedVector.h diff --git a/prog/gameLibs/render/daBfg/api/dasModules/nodeDataAnnotation.cpp b/prog/gameLibs/render/daBfg/api/das/nodeDataAnnotation.cpp similarity index 97% rename from prog/gameLibs/render/daBfg/api/dasModules/nodeDataAnnotation.cpp rename to prog/gameLibs/render/daBfg/api/das/nodeDataAnnotation.cpp index e053f80c6..2564b44c9 100644 --- a/prog/gameLibs/render/daBfg/api/dasModules/nodeDataAnnotation.cpp +++ b/prog/gameLibs/render/daBfg/api/das/nodeDataAnnotation.cpp @@ -1,5 +1,5 @@ -#include -#include +#include +#include struct NodeStateRequirementsAnnotation final : das::ManagedStructureAnnotation diff --git 
a/prog/gameLibs/render/daBfg/api/dasModules/nodeEcsRegistration.cpp b/prog/gameLibs/render/daBfg/api/das/nodeEcsRegistration.cpp similarity index 96% rename from prog/gameLibs/render/daBfg/api/dasModules/nodeEcsRegistration.cpp rename to prog/gameLibs/render/daBfg/api/das/nodeEcsRegistration.cpp index 02507cbd8..23260f87e 100644 --- a/prog/gameLibs/render/daBfg/api/dasModules/nodeEcsRegistration.cpp +++ b/prog/gameLibs/render/daBfg/api/das/nodeEcsRegistration.cpp @@ -1,7 +1,7 @@ #include -#include -#include -#include +#include +#include +#include das::mutex nodeEcsRegistrationMutex; diff --git a/prog/gameLibs/render/daBfg/api/dasModules/nodeEcsRegistration.h b/prog/gameLibs/render/daBfg/api/das/nodeEcsRegistration.h similarity index 89% rename from prog/gameLibs/render/daBfg/api/dasModules/nodeEcsRegistration.h rename to prog/gameLibs/render/daBfg/api/das/nodeEcsRegistration.h index 16178a84a..0484cc103 100644 --- a/prog/gameLibs/render/daBfg/api/dasModules/nodeEcsRegistration.h +++ b/prog/gameLibs/render/daBfg/api/das/nodeEcsRegistration.h @@ -35,4 +35,6 @@ struct NodeEcsRegistrationAnnotation final : das::FunctionAnnotation return true; }; dag::FixedVectorMap arguments; + using StrRegistrationArgumentsPair = eastl::pair; }; +DAG_DECLARE_RELOCATABLE(NodeEcsRegistrationAnnotation::StrRegistrationArgumentsPair); diff --git a/prog/gameLibs/render/daBfg/api/dasModules/nodeEcsRegistrationStub.cpp b/prog/gameLibs/render/daBfg/api/das/nodeEcsRegistrationStub.cpp similarity index 89% rename from prog/gameLibs/render/daBfg/api/dasModules/nodeEcsRegistrationStub.cpp rename to prog/gameLibs/render/daBfg/api/das/nodeEcsRegistrationStub.cpp index d0164971f..91a7266e4 100644 --- a/prog/gameLibs/render/daBfg/api/dasModules/nodeEcsRegistrationStub.cpp +++ b/prog/gameLibs/render/daBfg/api/das/nodeEcsRegistrationStub.cpp @@ -1,4 +1,4 @@ -#include +#include bool NodeEcsRegistrationAnnotation::apply(const das::FunctionPtr &func, das::ModuleGroup &libGroup, const 
das::AnnotationArgumentList &args, das::string &err) diff --git a/prog/gameLibs/render/daBfg/api/dasModules/structureAnnotations.cpp b/prog/gameLibs/render/daBfg/api/das/structureAnnotations.cpp similarity index 98% rename from prog/gameLibs/render/daBfg/api/dasModules/structureAnnotations.cpp rename to prog/gameLibs/render/daBfg/api/das/structureAnnotations.cpp index 3cb8bfe46..e8aa46581 100644 --- a/prog/gameLibs/render/daBfg/api/dasModules/structureAnnotations.cpp +++ b/prog/gameLibs/render/daBfg/api/das/structureAnnotations.cpp @@ -1,5 +1,5 @@ -#include -#include +#include +#include struct TextureResourceAnnotation final : das::ManagedStructureAnnotation { diff --git a/prog/gameLibs/render/daBfg/api/dasModules/stub_aot.cpp b/prog/gameLibs/render/daBfg/api/das/stub_aot.cpp similarity index 83% rename from prog/gameLibs/render/daBfg/api/dasModules/stub_aot.cpp rename to prog/gameLibs/render/daBfg/api/das/stub_aot.cpp index 4faa4e4bb..2ab854917 100644 --- a/prog/gameLibs/render/daBfg/api/dasModules/stub_aot.cpp +++ b/prog/gameLibs/render/daBfg/api/das/stub_aot.cpp @@ -1,7 +1,7 @@ -#include +#include #include -#include -#include +#include +#include InitOnDemand dabfg::Backend::instance; dabfg::Backend::~Backend() { G_ASSERT(0); } diff --git a/prog/gameLibs/render/daBfg/ecs/frameGraphNodeES.cpp.gen.es.cpp b/prog/gameLibs/render/daBfg/api/ecs/frameGraphNodeES.cpp.gen.es.cpp similarity index 100% rename from prog/gameLibs/render/daBfg/ecs/frameGraphNodeES.cpp.gen.es.cpp rename to prog/gameLibs/render/daBfg/api/ecs/frameGraphNodeES.cpp.gen.es.cpp diff --git a/prog/gameLibs/render/daBfg/ecs/frameGraphNodeES.cpp.inl b/prog/gameLibs/render/daBfg/api/ecs/frameGraphNodeES.cpp.inl similarity index 100% rename from prog/gameLibs/render/daBfg/ecs/frameGraphNodeES.cpp.inl rename to prog/gameLibs/render/daBfg/api/ecs/frameGraphNodeES.cpp.inl diff --git a/prog/gameLibs/render/daBfg/intermediateRepresentation.cpp b/prog/gameLibs/render/daBfg/backend/intermediateRepresentation.cpp 
similarity index 99% rename from prog/gameLibs/render/daBfg/intermediateRepresentation.cpp rename to prog/gameLibs/render/daBfg/backend/intermediateRepresentation.cpp index a44a81105..cda27e31c 100644 --- a/prog/gameLibs/render/daBfg/intermediateRepresentation.cpp +++ b/prog/gameLibs/render/daBfg/backend/intermediateRepresentation.cpp @@ -1,4 +1,4 @@ -#include +#include "intermediateRepresentation.h" #include #include diff --git a/prog/gameLibs/render/daBfg/intermediateRepresentation.h b/prog/gameLibs/render/daBfg/backend/intermediateRepresentation.h similarity index 96% rename from prog/gameLibs/render/daBfg/intermediateRepresentation.h rename to prog/gameLibs/render/daBfg/backend/intermediateRepresentation.h index 99df4f991..6db2ba7aa 100644 --- a/prog/gameLibs/render/daBfg/intermediateRepresentation.h +++ b/prog/gameLibs/render/daBfg/backend/intermediateRepresentation.h @@ -25,10 +25,10 @@ #include #include -#include -#include +#include +#include -#include +#include // This files contains an intermediate representation of the graph which @@ -36,10 +36,7 @@ // The main aim of these structures is to describe how the user graph // should look, possibly after applying transformations. 
-namespace dabfg -{ - -namespace intermediate +namespace dabfg::intermediate { // Denotes an index inside GraphIR::nodes @@ -282,6 +279,8 @@ class Mapping final dag::Vector resNameIdMapping_{}; }; -} // namespace intermediate +} // namespace dabfg::intermediate -} // namespace dabfg +DAG_DECLARE_RELOCATABLE(dabfg::intermediate::Binding); +DAG_DECLARE_RELOCATABLE(dabfg::intermediate::Request); +DAG_DECLARE_RELOCATABLE(eastl::optional); diff --git a/prog/gameLibs/render/daBfg/nodeScheduling/nodeScheduler.cpp b/prog/gameLibs/render/daBfg/backend/nodeScheduler.cpp similarity index 100% rename from prog/gameLibs/render/daBfg/nodeScheduling/nodeScheduler.cpp rename to prog/gameLibs/render/daBfg/backend/nodeScheduler.cpp diff --git a/prog/gameLibs/render/daBfg/nodeScheduling/nodeScheduler.h b/prog/gameLibs/render/daBfg/backend/nodeScheduler.h similarity index 87% rename from prog/gameLibs/render/daBfg/nodeScheduling/nodeScheduler.h rename to prog/gameLibs/render/daBfg/backend/nodeScheduler.h index 64596aa9a..5431daf90 100644 --- a/prog/gameLibs/render/daBfg/nodeScheduling/nodeScheduler.h +++ b/prog/gameLibs/render/daBfg/backend/nodeScheduler.h @@ -5,8 +5,8 @@ #include -#include -#include +#include +#include #include diff --git a/prog/gameLibs/render/daBfg/nodes/nodeStateDeltas.cpp b/prog/gameLibs/render/daBfg/backend/nodeStateDeltas.cpp similarity index 99% rename from prog/gameLibs/render/daBfg/nodes/nodeStateDeltas.cpp rename to prog/gameLibs/render/daBfg/backend/nodeStateDeltas.cpp index b010148ea..28bed99ca 100644 --- a/prog/gameLibs/render/daBfg/nodes/nodeStateDeltas.cpp +++ b/prog/gameLibs/render/daBfg/backend/nodeStateDeltas.cpp @@ -67,6 +67,9 @@ static bool operator<(const Binding &first, const Binding &second) } // namespace intermediate +namespace sd +{ + struct DeltaCalculator { // Use this overload set to add special processing for fields @@ -181,4 +184,6 @@ NodeStateDeltas calculate_per_node_state_deltas(const intermediate::Graph &graph return result; } +} // 
namespace sd + } // namespace dabfg diff --git a/prog/gameLibs/render/daBfg/nodes/nodeStateDeltas.h b/prog/gameLibs/render/daBfg/backend/nodeStateDeltas.h similarity index 89% rename from prog/gameLibs/render/daBfg/nodes/nodeStateDeltas.h rename to prog/gameLibs/render/daBfg/backend/nodeStateDeltas.h index 5da90f445..27adf00a2 100644 --- a/prog/gameLibs/render/daBfg/nodes/nodeStateDeltas.h +++ b/prog/gameLibs/render/daBfg/backend/nodeStateDeltas.h @@ -1,9 +1,10 @@ #pragma once -#include +#include -namespace dabfg +// sd stands for state deltas +namespace dabfg::sd { struct ShaderBlockLayersInfoDelta @@ -38,4 +39,4 @@ using NodeStateDeltas = IdIndexedMapping +#include #include namespace dabfg diff --git a/prog/gameLibs/render/daBfg/resourceScheduling/packers/boxingPacker.cpp b/prog/gameLibs/render/daBfg/backend/resourceScheduling/packers/boxingPacker.cpp similarity index 99% rename from prog/gameLibs/render/daBfg/resourceScheduling/packers/boxingPacker.cpp rename to prog/gameLibs/render/daBfg/backend/resourceScheduling/packers/boxingPacker.cpp index 17ee99491..7d6a1e054 100644 --- a/prog/gameLibs/render/daBfg/resourceScheduling/packers/boxingPacker.cpp +++ b/prog/gameLibs/render/daBfg/backend/resourceScheduling/packers/boxingPacker.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git a/prog/gameLibs/render/daBfg/resourceScheduling/packers/greedyScanlinePacker.cpp b/prog/gameLibs/render/daBfg/backend/resourceScheduling/packers/greedyScanlinePacker.cpp similarity index 99% rename from prog/gameLibs/render/daBfg/resourceScheduling/packers/greedyScanlinePacker.cpp rename to prog/gameLibs/render/daBfg/backend/resourceScheduling/packers/greedyScanlinePacker.cpp index c36ec3c8a..f7150cc82 100644 --- a/prog/gameLibs/render/daBfg/resourceScheduling/packers/greedyScanlinePacker.cpp +++ b/prog/gameLibs/render/daBfg/backend/resourceScheduling/packers/greedyScanlinePacker.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include diff --git 
a/prog/gameLibs/render/daBfg/resourceScheduling/poolResourceScheduler.cpp b/prog/gameLibs/render/daBfg/backend/resourceScheduling/poolResourceScheduler.cpp similarity index 100% rename from prog/gameLibs/render/daBfg/resourceScheduling/poolResourceScheduler.cpp rename to prog/gameLibs/render/daBfg/backend/resourceScheduling/poolResourceScheduler.cpp diff --git a/prog/gameLibs/render/daBfg/resourceScheduling/poolResourceScheduler.h b/prog/gameLibs/render/daBfg/backend/resourceScheduling/poolResourceScheduler.h similarity index 100% rename from prog/gameLibs/render/daBfg/resourceScheduling/poolResourceScheduler.h rename to prog/gameLibs/render/daBfg/backend/resourceScheduling/poolResourceScheduler.h diff --git a/prog/gameLibs/render/daBfg/resourceScheduling/resourceScheduler.cpp b/prog/gameLibs/render/daBfg/backend/resourceScheduling/resourceScheduler.cpp similarity index 99% rename from prog/gameLibs/render/daBfg/resourceScheduling/resourceScheduler.cpp rename to prog/gameLibs/render/daBfg/backend/resourceScheduling/resourceScheduler.cpp index 88976ca10..9f64d1415 100644 --- a/prog/gameLibs/render/daBfg/resourceScheduling/resourceScheduler.cpp +++ b/prog/gameLibs/render/daBfg/backend/resourceScheduling/resourceScheduler.cpp @@ -7,14 +7,13 @@ #include #include #include +#include <3d/dag_drv3d.h> -#include -#include +#include +#include #include - +#include #include -#include "3d/dag_drv3d.h" -#include "packer.h" #if DABFG_STATISTICS_REPORTING @@ -250,13 +249,11 @@ auto ResourceScheduler::scheduleResourcesIntoHeaps(const ResourceProperties &res } auto heapGroupProp = getResourceHeapGroupProperties(heapRequests[heapIdx].group); - // Currently heap size is limited by driver's 2GiB restriction - uint32_t maxHeapSize = eastl::min(heapGroupProp.maxHeapSize, static_cast(eastl::numeric_limits::max())); PackerInput input{}; input.timelineSize = allEvents.size() * allEvents[0].size(); input.resources = packerResources; - input.maxHeapSize = maxHeapSize; + input.maxHeapSize = 
heapGroupProp.maxHeapSize; // Do not increase the size of an existing heap if resources in it have hints. // Otherwise hinted resources will not be preserved, because of the heap recreation. if (heapHasHints && allocatedHeaps.isMapped(heapIdx) && allocatedHeaps[heapIdx].size != 0) diff --git a/prog/gameLibs/render/daBfg/resourceScheduling/resourceScheduler.h b/prog/gameLibs/render/daBfg/backend/resourceScheduling/resourceScheduler.h similarity index 99% rename from prog/gameLibs/render/daBfg/resourceScheduling/resourceScheduler.h rename to prog/gameLibs/render/daBfg/backend/resourceScheduling/resourceScheduler.h index a53321486..8ee807dac 100644 --- a/prog/gameLibs/render/daBfg/resourceScheduling/resourceScheduler.h +++ b/prog/gameLibs/render/daBfg/backend/resourceScheduling/resourceScheduler.h @@ -9,8 +9,8 @@ #include #include -#include "intermediateRepresentation.h" -#include "graphDumper.h" +#include +#include #define DABFG_STATISTICS_REPORTING DAGOR_DBGLEVEL > 0 diff --git a/prog/gameLibs/render/daBfg/api/autoResolutionData.h b/prog/gameLibs/render/daBfg/common/autoResolutionData.h similarity index 100% rename from prog/gameLibs/render/daBfg/api/autoResolutionData.h rename to prog/gameLibs/render/daBfg/common/autoResolutionData.h diff --git a/prog/gameLibs/render/daBfg/bindingType.h b/prog/gameLibs/render/daBfg/common/bindingType.h similarity index 100% rename from prog/gameLibs/render/daBfg/bindingType.h rename to prog/gameLibs/render/daBfg/common/bindingType.h diff --git a/prog/gameLibs/render/daBfg/graphDumper.h b/prog/gameLibs/render/daBfg/common/graphDumper.h similarity index 100% rename from prog/gameLibs/render/daBfg/graphDumper.h rename to prog/gameLibs/render/daBfg/common/graphDumper.h diff --git a/prog/gameLibs/render/daBfg/resourceUsage.cpp b/prog/gameLibs/render/daBfg/common/resourceUsage.cpp similarity index 99% rename from prog/gameLibs/render/daBfg/resourceUsage.cpp rename to prog/gameLibs/render/daBfg/common/resourceUsage.cpp index 
3122110f2..e916457ec 100644 --- a/prog/gameLibs/render/daBfg/resourceUsage.cpp +++ b/prog/gameLibs/render/daBfg/common/resourceUsage.cpp @@ -1,4 +1,4 @@ -#include +#include "resourceUsage.h" #include diff --git a/prog/gameLibs/render/daBfg/resourceUsage.h b/prog/gameLibs/render/daBfg/common/resourceUsage.h similarity index 91% rename from prog/gameLibs/render/daBfg/resourceUsage.h rename to prog/gameLibs/render/daBfg/common/resourceUsage.h index 079febebf..a883bf6c8 100644 --- a/prog/gameLibs/render/daBfg/resourceUsage.h +++ b/prog/gameLibs/render/daBfg/common/resourceUsage.h @@ -4,8 +4,8 @@ #include #include -#include -#include +#include +#include #include #include diff --git a/prog/gameLibs/render/daBfg/debug/backendDebug.h b/prog/gameLibs/render/daBfg/debug/backendDebug.h index c24061204..49d9ff1bb 100644 --- a/prog/gameLibs/render/daBfg/debug/backendDebug.h +++ b/prog/gameLibs/render/daBfg/debug/backendDebug.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include diff --git a/prog/gameLibs/render/daBfg/debug/backendDebugStub.cpp b/prog/gameLibs/render/daBfg/debug/backendDebugStub.cpp index c2a173acf..865c4fc41 100644 --- a/prog/gameLibs/render/daBfg/debug/backendDebugStub.cpp +++ b/prog/gameLibs/render/daBfg/debug/backendDebugStub.cpp @@ -1,4 +1,4 @@ -#include +#include #include namespace dabfg diff --git a/prog/gameLibs/render/daBfg/debug/globalStatesValidation.cpp b/prog/gameLibs/render/daBfg/debug/globalStatesValidation.cpp index ae521ec29..006242db0 100644 --- a/prog/gameLibs/render/daBfg/debug/globalStatesValidation.cpp +++ b/prog/gameLibs/render/daBfg/debug/globalStatesValidation.cpp @@ -1,7 +1,7 @@ -#include +#include +#include #include #include -#include namespace dabfg diff --git a/prog/gameLibs/render/daBfg/debug/graphVisualization.cpp b/prog/gameLibs/render/daBfg/debug/graphVisualization.cpp index 98fe4716a..0453cf3e4 100644 --- a/prog/gameLibs/render/daBfg/debug/graphVisualization.cpp +++ 
b/prog/gameLibs/render/daBfg/debug/graphVisualization.cpp @@ -8,8 +8,8 @@ #include #include -#include -#include +#include +#include #include #include diff --git a/prog/gameLibs/render/daBfg/debug/resourceValidation.cpp b/prog/gameLibs/render/daBfg/debug/resourceValidation.cpp index dfd06ec8e..905fc09b8 100644 --- a/prog/gameLibs/render/daBfg/debug/resourceValidation.cpp +++ b/prog/gameLibs/render/daBfg/debug/resourceValidation.cpp @@ -1,9 +1,9 @@ -#include +#include #include #include #include -#include +#include #include #include #include diff --git a/prog/gameLibs/render/daBfg/debug/resourceVisualization.cpp b/prog/gameLibs/render/daBfg/debug/resourceVisualization.cpp index 9bb106ad2..343d7d77c 100644 --- a/prog/gameLibs/render/daBfg/debug/resourceVisualization.cpp +++ b/prog/gameLibs/render/daBfg/debug/resourceVisualization.cpp @@ -1,6 +1,6 @@ #include "backendDebug.h" -#include +#include #include #include diff --git a/prog/gameLibs/render/daBfg/debug/textureVisualization.cpp b/prog/gameLibs/render/daBfg/debug/textureVisualization.cpp index bd8d236ae..6e1c7d3ff 100644 --- a/prog/gameLibs/render/daBfg/debug/textureVisualization.cpp +++ b/prog/gameLibs/render/daBfg/debug/textureVisualization.cpp @@ -1,5 +1,5 @@ #include "textureVisualization.h" -#include +#include #include #include #include diff --git a/prog/gameLibs/render/daBfg/api/internalRegistry.h b/prog/gameLibs/render/daBfg/frontend/internalRegistry.h similarity index 97% rename from prog/gameLibs/render/daBfg/api/internalRegistry.h rename to prog/gameLibs/render/daBfg/frontend/internalRegistry.h index 2651f36e8..76bca4a15 100644 --- a/prog/gameLibs/render/daBfg/api/internalRegistry.h +++ b/prog/gameLibs/render/daBfg/frontend/internalRegistry.h @@ -5,11 +5,11 @@ #include #include -#include -#include +#include +#include +#include #include #include -#include namespace dabfg @@ -184,3 +184,4 @@ struct InternalRegistry DAG_DECLARE_RELOCATABLE(dabfg::VirtualSubresourceRef); 
DAG_DECLARE_RELOCATABLE(dabfg::VirtualPassRequirements); +DAG_DECLARE_RELOCATABLE(dabfg::Binding); diff --git a/prog/gameLibs/render/daBfg/multiplexingInternal.cpp b/prog/gameLibs/render/daBfg/frontend/multiplexingInternal.cpp similarity index 98% rename from prog/gameLibs/render/daBfg/multiplexingInternal.cpp rename to prog/gameLibs/render/daBfg/frontend/multiplexingInternal.cpp index 507dd4e9b..eff211cbb 100644 --- a/prog/gameLibs/render/daBfg/multiplexingInternal.cpp +++ b/prog/gameLibs/render/daBfg/frontend/multiplexingInternal.cpp @@ -1,4 +1,4 @@ -#include +#include "multiplexingInternal.h" namespace dabfg diff --git a/prog/gameLibs/render/daBfg/multiplexingInternal.h b/prog/gameLibs/render/daBfg/frontend/multiplexingInternal.h similarity index 95% rename from prog/gameLibs/render/daBfg/multiplexingInternal.h rename to prog/gameLibs/render/daBfg/frontend/multiplexingInternal.h index 62ff19699..18a86d13e 100644 --- a/prog/gameLibs/render/daBfg/multiplexingInternal.h +++ b/prog/gameLibs/render/daBfg/frontend/multiplexingInternal.h @@ -1,7 +1,7 @@ #pragma once #include -#include "intermediateRepresentation.h" +#include namespace dabfg diff --git a/prog/gameLibs/render/daBfg/nameResolver.cpp b/prog/gameLibs/render/daBfg/frontend/nameResolver.cpp similarity index 99% rename from prog/gameLibs/render/daBfg/nameResolver.cpp rename to prog/gameLibs/render/daBfg/frontend/nameResolver.cpp index 8f0cf8a3a..dc6d56fdb 100644 --- a/prog/gameLibs/render/daBfg/nameResolver.cpp +++ b/prog/gameLibs/render/daBfg/frontend/nameResolver.cpp @@ -1,4 +1,4 @@ -#include +#include "nameResolver.h" #include diff --git a/prog/gameLibs/render/daBfg/nameResolver.h b/prog/gameLibs/render/daBfg/frontend/nameResolver.h similarity index 97% rename from prog/gameLibs/render/daBfg/nameResolver.h rename to prog/gameLibs/render/daBfg/frontend/nameResolver.h index 4005a5a43..f4ab18c3c 100644 --- a/prog/gameLibs/render/daBfg/nameResolver.h +++ b/prog/gameLibs/render/daBfg/frontend/nameResolver.h @@ 
-1,6 +1,6 @@ #pragma once -#include +#include #include diff --git a/prog/gameLibs/render/daBfg/nodes/nodeTracker.cpp b/prog/gameLibs/render/daBfg/frontend/nodeTracker.cpp similarity index 93% rename from prog/gameLibs/render/daBfg/nodes/nodeTracker.cpp rename to prog/gameLibs/render/daBfg/frontend/nodeTracker.cpp index 6d5768b30..5fdc8831a 100644 --- a/prog/gameLibs/render/daBfg/nodes/nodeTracker.cpp +++ b/prog/gameLibs/render/daBfg/frontend/nodeTracker.cpp @@ -7,8 +7,8 @@ #include #include -#include -#include +#include +#include #include @@ -120,14 +120,18 @@ void NodeTracker::recalculateResourceLifetimes() // resource, we ignore the modification. This happens when a // node creates a rendertarget and then immediately renders to it, // or renames it after rendering. - for (auto resId : nodeData.modifiedResources) + for (auto unresolvedResId : nodeData.modifiedResources) + { + const auto resId = nameResolver.resolve(unresolvedResId); if (resourceLifetimes[resId].introducedBy != nodeId && resourceLifetimes[resId].consumedBy != nodeId) resourceLifetimes[resId].modificationChain.emplace_back(nodeId); + } for (auto [produced, unresolvedConsumed] : nodeData.renamedResources) { // Sanity check (it's an invariant) G_ASSERT(produced != unresolvedConsumed); + G_ASSERT(produced == nameResolver.resolve(produced)); const auto consumed = nameResolver.resolve(unresolvedConsumed); auto &consumedLifetime = resourceLifetimes[consumed]; @@ -146,39 +150,78 @@ void NodeTracker::recalculateResourceLifetimes() } } -void NodeTracker::validateLifetimes() const +void NodeTracker::validateLifetimes(ValidityInfo &validity) const { -#if DAGOR_DBGLEVEL > 0 for (auto [resId, lifetime] : resourceLifetimes.enumerate()) { - eastl::fixed_vector sortedReaders(lifetime.readers.begin(), lifetime.readers.end()); + if (registry.resources.isMapped(resId) && registry.resources[resId].history != History::No && + lifetime.consumedBy != NodeNameId::Invalid) + { + logerr("Resource '%s' was consumed by '%s' on 
it's frame but had it's " + "history requested on next frame! " + "This is impossible to satisfy, disabling the resource! ", + registry.knownNames.getName(resId), registry.knownNames.getName(lifetime.consumedBy)); + + validity.resourceValid.set(resId, false); + } + + if (lifetime.introducedBy != NodeNameId::Invalid && lifetime.introducedBy == lifetime.consumedBy) + { + logerr("Resource '%s' was both created and consumed by node '%s'! " + "This is impossible to satisfy, disabling the resource!", + registry.knownNames.getName(resId), registry.knownNames.getName(lifetime.introducedBy)); + + validity.resourceValid.set(resId, false); + } + + dag::RelocatableFixedVector sortedReaders; + sortedReaders.assign(lifetime.readers.begin(), lifetime.readers.end()); eastl::sort(sortedReaders.begin(), sortedReaders.end()); - eastl::fixed_vector sortedModifiers(lifetime.modificationChain.begin(), lifetime.modificationChain.end()); + if (eastl::binary_search(sortedReaders.begin(), sortedReaders.end(), lifetime.introducedBy)) + { + logerr("Resource '%s' was both read and introduced by node '%s'! " + "This is impossible to satisfy, disabling the resource! ", + registry.knownNames.getName(resId), registry.knownNames.getName(lifetime.introducedBy)); + + validity.resourceValid.set(resId, false); + } + + if (eastl::binary_search(sortedReaders.begin(), sortedReaders.end(), lifetime.consumedBy)) + { + logerr("Resource '%s' was both read and consumed by node '%s'! " + "This is impossible to satisfy, disabling the node! 
", + registry.knownNames.getName(resId), registry.knownNames.getName(lifetime.consumedBy)); + + validity.nodeValid.set(lifetime.consumedBy, false); + } + + dag::RelocatableFixedVector sortedModifiers; + sortedModifiers.assign(lifetime.modificationChain.begin(), lifetime.modificationChain.end()); eastl::sort(sortedModifiers.begin(), sortedModifiers.end()); - dag::Vector conflicts; + dag::RelocatableFixedVector conflicts; eastl::set_intersection(sortedReaders.begin(), sortedReaders.end(), sortedModifiers.begin(), sortedModifiers.end(), eastl::back_insert_iterator(conflicts)); - eastl::string list; - for (auto nodeId : conflicts) + if (!conflicts.empty()) { - list += "'"; - list += registry.knownNames.getName(nodeId); - list += "' "; - } + eastl::string list; + for (const auto nodeId : conflicts) + { + list += "'"; + list += registry.knownNames.getName(nodeId); + list += "' "; + } - G_ASSERT_DO_AND_LOG(conflicts.empty(), dumpRawUserGraph(), "Found nodes that both modify and read resource '%s'! Offender(s): %s", - registry.knownNames.getName(resId), list.c_str()); + logerr("Found nodes that both modify and read resource '%s'! Offender(s): %s. 
" + "This is impossible to satisfy, disabling these nodes!", + registry.knownNames.getName(resId), list.c_str()); - if (registry.resources.isMapped(resId) && registry.resources[resId].history != History::No) - G_ASSERT_DO_AND_LOG(lifetime.consumedBy == NodeNameId::Invalid, dumpRawUserGraph(), - "Resource '%s' was consumed by '%s' on it's frame but had it's " - "history requested on next frame!", - registry.knownNames.getName(resId), registry.knownNames.getName(lifetime.consumedBy)); + for (const auto nodeId : conflicts) + validity.nodeValid.set(nodeId, false); + } } -#endif } void NodeTracker::declareNodes() @@ -201,7 +244,6 @@ void NodeTracker::declareNodes() void NodeTracker::gatherNodeData() { recalculateResourceLifetimes(); - validateLifetimes(); resolveRenaming(); updateRenamedResourceProperties(); } @@ -416,6 +458,8 @@ auto NodeTracker::findValidResourcesAndNodes() const -> ValidityInfo for (const auto nodeId : IdRange(registry.nodes.size())) nodeValid.set(nodeId, validateNode(nodeId)); + validateLifetimes(result); + // We have to do an initial pass of marking resources created by // broken nodes as invalid. for (auto [nodeId, valid] : nodeValid.enumerate()) @@ -645,7 +689,8 @@ eastl::pair NodeTracker::createDiscr const auto &resData = registry.resources[resId]; - const auto scanResourceUsages = [this](ResNameId res_id, const auto &process_readers, const auto &process_modifiers) { + const auto scanResourceUsages = [this, &validity](ResNameId res_id, const auto &process_readers, + const auto &process_modifiers) { FRAMEMEM_VALIDATE; // Save scanned resources to prevent hanging if there is cycle in renaming chains dag::VectorSet, framemem_allocator> knownCandidates; @@ -656,14 +701,20 @@ eastl::pair NodeTracker::createDiscr auto resIdCandidate = res_id; while (true) { + // We don't want to consider broken/invalid nodes when looking + // for an appropriate usage. 
for (const auto modifier : resourceLifetimes[resIdCandidate].modificationChain) - scanNodeSlotsOrResource(modifier, resIdCandidate, process_modifiers); + if (validity.nodeValid[modifier]) + scanNodeSlotsOrResource(modifier, resIdCandidate, process_modifiers); for (const auto reader : resourceLifetimes[resIdCandidate].readers) - scanNodeSlotsOrResource(reader, resIdCandidate, process_readers); + if (validity.nodeValid[reader]) + scanNodeSlotsOrResource(reader, resIdCandidate, process_readers); - if (!renamingChains.isMapped(resIdCandidate) || renamingChains[resIdCandidate] != resIdCandidate) + // The renaming chain might've been "torn" by an invalid node. + if (!renamingChains.isMapped(resIdCandidate) || !validity.resourceValid[renamingChains[resIdCandidate]]) break; + resIdCandidate = renamingChains[resIdCandidate]; if (knownCandidates.find(resIdCandidate) != knownCandidates.end()) break; diff --git a/prog/gameLibs/render/daBfg/nodes/nodeTracker.h b/prog/gameLibs/render/daBfg/frontend/nodeTracker.h similarity index 95% rename from prog/gameLibs/render/daBfg/nodes/nodeTracker.h rename to prog/gameLibs/render/daBfg/frontend/nodeTracker.h index 46b6f6452..cebc43d95 100644 --- a/prog/gameLibs/render/daBfg/nodes/nodeTracker.h +++ b/prog/gameLibs/render/daBfg/frontend/nodeTracker.h @@ -10,11 +10,11 @@ #include #include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include #include #include @@ -109,20 +109,19 @@ class NodeTracker final : public IGraphDumper IdIndexedMapping renamingChains; void recalculateResourceLifetimes(); - void validateLifetimes() const; void resolveRenaming(); void updateRenamedResourceProperties(); void fixupFalseHistoryFlags(intermediate::Graph &graph) const; - bool validateResource(ResNameId resId) const; - bool validateNode(NodeNameId resId) const; - struct ValidityInfo { IdIndexedFlags resourceValid; IdIndexedFlags nodeValid; }; + bool validateResource(ResNameId resId) const; + bool 
validateNode(NodeNameId resId) const; + void validateLifetimes(ValidityInfo &validity) const; ValidityInfo findValidResourcesAndNodes() const; // Result contains no edges yet diff --git a/prog/gameLibs/render/daBfg/api/resourceProvider.h b/prog/gameLibs/render/daBfg/frontend/resourceProvider.h similarity index 94% rename from prog/gameLibs/render/daBfg/api/resourceProvider.h rename to prog/gameLibs/render/daBfg/frontend/resourceProvider.h index 34e736cb9..ea53f34de 100644 --- a/prog/gameLibs/render/daBfg/api/resourceProvider.h +++ b/prog/gameLibs/render/daBfg/frontend/resourceProvider.h @@ -3,6 +3,8 @@ #include #include #include <3d/dag_resPtr.h> +#include + #include #include #include diff --git a/prog/gameLibs/render/daBfg/jamfile b/prog/gameLibs/render/daBfg/jamfile index 74acdda89..6179b86a3 100644 --- a/prog/gameLibs/render/daBfg/jamfile +++ b/prog/gameLibs/render/daBfg/jamfile @@ -27,28 +27,31 @@ AddIncludes = ; Sources = - nodes/nodeTracker.cpp - nodes/nodeExecutor.cpp - nodes/nodeStateDeltas.cpp - nodeScheduling/nodeScheduler.cpp - resourceScheduling/resourceScheduler.cpp - resourceScheduling/nativeResourceScheduler.cpp - resourceScheduling/poolResourceScheduler.cpp - resourceUsage.cpp - backend.cpp - intermediateRepresentation.cpp - multiplexingInternal.cpp - nameResolver.cpp + api/cpp/detail/virtualResourceHandleBase.cpp + api/cpp/detail/virtualResourceRequestBase.cpp + api/cpp/bfg.cpp + api/cpp/registry.cpp + api/cpp/stateRequest.cpp + api/cpp/virtualPassRequest.cpp + api/cpp/autoResolutionRequest.cpp + api/cpp/nameSpace.cpp + api/cpp/nameSpaceRequest.cpp - api/detail/virtualResourceHandleBase.cpp - api/detail/virtualResourceRequestBase.cpp - api/bfg.cpp - api/registry.cpp - api/stateRequest.cpp - api/virtualPassRequest.cpp - api/autoResolutionRequest.cpp - api/nameSpace.cpp - api/nameSpaceRequest.cpp + common/resourceUsage.cpp + + frontend/nodeTracker.cpp + frontend/nameResolver.cpp + frontend/multiplexingInternal.cpp + + 
backend/intermediateRepresentation.cpp + backend/nodeStateDeltas.cpp + backend/nodeScheduler.cpp + backend/resourceScheduling/resourceScheduler.cpp + backend/resourceScheduling/nativeResourceScheduler.cpp + backend/resourceScheduling/poolResourceScheduler.cpp + + runtime/nodeExecutor.cpp + runtime/backend.cpp ; if $(DABFG_ENABLE_DAS_INTEGRATION) = yes { @@ -56,31 +59,31 @@ if $(DABFG_ENABLE_DAS_INTEGRATION) = yes { $(Root)/prog/1stPartyLibs/daScript/include ; Sources += - api/dasModules/enumerations.cpp - api/dasModules/frameGraphModule.cpp - api/dasModules/nodeDataAnnotation.cpp - api/dasModules/structureAnnotations.cpp + api/das/enumerations.cpp + api/das/frameGraphModule.cpp + api/das/nodeDataAnnotation.cpp + api/das/structureAnnotations.cpp ; SourceDAS = - api/dasModules/frameGraphModule.das + api/das/frameGraphModule.das ; if $(DABFG_ENABLE_DAECS_INTEGRATION) = yes { - Sources += api/dasModules/nodeEcsRegistration.cpp ; + Sources += api/das/nodeEcsRegistration.cpp ; } else { - Sources += api/dasModules/nodeEcsRegistrationStub.cpp ; + Sources += api/das/nodeEcsRegistrationStub.cpp ; } UseProgLibs += 1stPartyLibs/daScript ; for s in $(SourceDAS) { StringifySourceFile $(s) : $(s).inl : $(s:S=.cpp) ; } } if $(DABFG_ENABLE_DAECS_INTEGRATION) = yes { SourceES = - ecs/frameGraphNodeES.cpp.inl + api/ecs/frameGraphNodeES.cpp.inl ; for s in $(SourceES) { GenESSourceFile $(s) ; } } AllSrcFolder_CPP = - resourceScheduling/packers + backend/resourceScheduling/packers ; diff --git a/prog/gameLibs/render/daBfg/nodes/nodeStateInternal.h b/prog/gameLibs/render/daBfg/nodes/nodeStateInternal.h deleted file mode 100644 index dcb6560d0..000000000 --- a/prog/gameLibs/render/daBfg/nodes/nodeStateInternal.h +++ /dev/null @@ -1,133 +0,0 @@ -#pragma once - -#include -#include - -#include - -#include -#include -#include -#include - -#include -#include <3d/dag_drv3dConsts.h> - -#include -#include -#include - - -namespace dabfg -{ - -struct ShaderBlockLayersInfo -{ - int objectLayer = -1; 
- int frameLayer = -1; - int sceneLayer = -1; -}; - -struct VirtualSubresourceRef -{ - ResNameId nameId = ResNameId::Invalid; - uint32_t mipLevel = 0; - uint32_t layer = 0; -}; - -struct VirtualPassRequirements -{ - // Also known as "multiple render targets" -- correspond to pixel - // shader outputs in the specified order. - dag::RelocatableFixedVector colorAttachments; - VirtualSubresourceRef depthAttachment; - bool depthReadOnly = true; -}; - -struct VrsStateRequirements -{ - uint32_t rateX; - uint32_t rateY; - ResNameId rateTextureResId; - VariableRateShadingCombiner vertexCombiner; - VariableRateShadingCombiner pixelCombiner; -}; - -struct NodeStateRequirements -{ - // Toggles whether d3d::setwire should be turned on for this node - // when wirefrime debug mode is enabled. - bool supportsWireframe = false; - - // Toggles VRS - VrsStateRequirements vrsState = {}; - - // Toggles C++ overrides for rendering pipeline state - // (which is usually configured in dagor shader code) - eastl::optional pipelineStateOverride = {}; - - void setOverride(const shaders::OverrideState &state) { pipelineStateOverride = state; } -}; - -enum class BindingType -{ - ShaderVar, - ViewMatrix, - ProjMatrix, - Invalid, -}; - -struct BindingInfo -{ - BindingType type = BindingType::Invalid; - ResNameId nameId = ResNameId::Invalid; - uint32_t multiplexingIdx = ~0u; // Workaround. TODO: reimplement binding logic with explicit public and internal binding info. 
- bool history = false; - bool optional = false; - Stage stage = Stage::UNKNOWN; - ResourceSubtypeTag projectedTag = ResourceSubtypeTag::Unknown; - detail::TypeErasedProjector projector = [](void *data) { return data; }; -}; - -using BindingsMap = dag::FixedVectorMap; - -struct VrsStateInternal -{ - uint32_t rateX = 1; - uint32_t rateY = 1; - ResNameId rateTexture = ResNameId::Invalid; - VariableRateShadingCombiner vertexCombiner = VariableRateShadingCombiner::VRS_PASSTHROUGH; - VariableRateShadingCombiner pixelCombiner = VariableRateShadingCombiner::VRS_PASSTHROUGH; -}; - -struct NodeStateInternal -{ - bool wire{}; - VrsStateInternal vrs{}; - eastl::optional rendering; - eastl::optional shaderOverrides; - BindingsMap bindings; - ShaderBlockLayersInfo shaderBlockLayers; -}; - -struct ShaderBlockLayersInfoDelta -{ - eastl::optional objectLayer; - eastl::optional frameLayer; - eastl::optional sceneLayer; -}; - -// nullopt here means "no change" -struct NodeStateDelta -{ - eastl::optional wire; - eastl::optional vrs; - eastl::optional rendering; - eastl::optional shaderOverrides; - BindingsMap bindings; - ShaderBlockLayersInfoDelta shaderBlockLayers; -}; - -dag::Vector calculate_per_node_state_deltas(eastl::span nodeStates); - -} // namespace dabfg diff --git a/prog/gameLibs/render/daBfg/backend.cpp b/prog/gameLibs/render/daBfg/runtime/backend.cpp similarity index 94% rename from prog/gameLibs/render/daBfg/backend.cpp rename to prog/gameLibs/render/daBfg/runtime/backend.cpp index 514d7c28a..e7bea334d 100644 --- a/prog/gameLibs/render/daBfg/backend.cpp +++ b/prog/gameLibs/render/daBfg/runtime/backend.cpp @@ -1,5 +1,4 @@ -#include "memory/dag_framemem.h" -#include +#include "backend.h" #include #include // std::lock_guard @@ -8,16 +7,16 @@ #include #include +#include #include #include -#include -#include -#include - -#include -#include +#include +#include +#include +#include +#include #include @@ -112,7 +111,14 @@ void Backend::scheduleNodes() 
update_graph_visualization(&nodeTracker, demultiplexedNodeExecutionOrder); } - perNodeStateDeltas = calculate_per_node_state_deltas(intermediateGraph); + currentStage = CompilationStage::REQUIRES_STATE_DELTA_RECALCULATION; +} + +void Backend::recalculateStateDeltas() +{ + TIME_PROFILE(recalculateStateDeltas); + + perNodeStateDeltas = sd::calculate_per_node_state_deltas(intermediateGraph); currentStage = CompilationStage::REQUIRES_RESOURCE_SCHEDULING; } @@ -314,6 +320,8 @@ void Backend::runNodes() case CompilationStage::REQUIRES_NODE_SCHEDULING: scheduleNodes(); [[fallthrough]]; + case CompilationStage::REQUIRES_STATE_DELTA_RECALCULATION: recalculateStateDeltas(); [[fallthrough]]; + case CompilationStage::REQUIRES_RESOURCE_SCHEDULING: scheduleResources(); [[fallthrough]]; case CompilationStage::REQUIRES_FIRST_FRAME_HISTORY_HANDLING: handleFirstFrameHistory(); [[fallthrough]]; diff --git a/prog/gameLibs/render/daBfg/backend.h b/prog/gameLibs/render/daBfg/runtime/backend.h similarity index 87% rename from prog/gameLibs/render/daBfg/backend.h rename to prog/gameLibs/render/daBfg/runtime/backend.h index 29a1b3dd0..8d184537b 100644 --- a/prog/gameLibs/render/daBfg/backend.h +++ b/prog/gameLibs/render/daBfg/runtime/backend.h @@ -7,14 +7,14 @@ #include <3d/dag_drv3d.h> #include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include -#include +#include +#include template @@ -59,7 +59,7 @@ class Backend intermediate::Graph intermediateGraph; intermediate::Mapping irMapping; - NodeStateDeltas perNodeStateDeltas; + sd::NodeStateDeltas perNodeStateDeltas; ResourceScheduler::EventsCollectionRef allResourceEvents; // Deferred init @@ -76,6 +76,7 @@ class Backend void gatherNodeData(); void gatherGraphIR(); void scheduleNodes(); + void recalculateStateDeltas(); void scheduleResources(); void handleFirstFrameHistory(); diff --git a/prog/gameLibs/render/daBfg/compilationStage.h 
b/prog/gameLibs/render/daBfg/runtime/compilationStage.h similarity index 93% rename from prog/gameLibs/render/daBfg/compilationStage.h rename to prog/gameLibs/render/daBfg/runtime/compilationStage.h index cb832ae63..e04a9204a 100644 --- a/prog/gameLibs/render/daBfg/compilationStage.h +++ b/prog/gameLibs/render/daBfg/runtime/compilationStage.h @@ -11,6 +11,7 @@ enum class CompilationStage REQUIRES_NODE_DATA_GATHERING, REQUIRES_IR_GENERATION, REQUIRES_NODE_SCHEDULING, + REQUIRES_STATE_DELTA_RECALCULATION, REQUIRES_RESOURCE_SCHEDULING, REQUIRES_FIRST_FRAME_HISTORY_HANDLING, UP_TO_DATE, diff --git a/prog/gameLibs/render/daBfg/nodes/nodeExecutor.cpp b/prog/gameLibs/render/daBfg/runtime/nodeExecutor.cpp similarity index 98% rename from prog/gameLibs/render/daBfg/nodes/nodeExecutor.cpp rename to prog/gameLibs/render/daBfg/runtime/nodeExecutor.cpp index 5ccadded5..09ab3c739 100644 --- a/prog/gameLibs/render/daBfg/nodes/nodeExecutor.cpp +++ b/prog/gameLibs/render/daBfg/runtime/nodeExecutor.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include @@ -59,7 +59,7 @@ void populate_resource_provider(ResourceProvider &provider, // passed by ref to } void NodeExecutor::execute(int prev_frame, int curr_frame, multiplexing::Extents multiplexing_extents, - const ResourceScheduler::FrameEventsRef &events, const NodeStateDeltas &state_deltas) + const ResourceScheduler::FrameEventsRef &events, const sd::NodeStateDeltas &state_deltas) { currentlyProvidedResources.resolutions.resize(registry.knownNames.nameCount()); for (auto [unresolvedId, resolution] : currentlyProvidedResources.resolutions.enumerate()) @@ -231,7 +231,7 @@ void NodeExecutor::processEvents(ResourceScheduler::NodeEventsRef events) const } } -void NodeExecutor::applyState(const NodeStateDelta &state, int frame, int prev_frame) const +void NodeExecutor::applyState(const sd::NodeStateDelta &state, int frame, int prev_frame) const { if (externalState.wireframeModeEnabled && state.wire) 
d3d::setwire(*state.wire); @@ -409,6 +409,7 @@ void NodeExecutor::bindShaderVar(int bind_idx, const intermediate::Binding &bind ShaderGlobal::set_buffer(bind_idx, bufId); break; } + case SHVT_SAMPLER: logerr("set_sampler is not implemented yet"); break; } } diff --git a/prog/gameLibs/render/daBfg/nodes/nodeExecutor.h b/prog/gameLibs/render/daBfg/runtime/nodeExecutor.h similarity index 84% rename from prog/gameLibs/render/daBfg/nodes/nodeExecutor.h rename to prog/gameLibs/render/daBfg/runtime/nodeExecutor.h index 8984b85e6..6bed0c54c 100644 --- a/prog/gameLibs/render/daBfg/nodes/nodeExecutor.h +++ b/prog/gameLibs/render/daBfg/runtime/nodeExecutor.h @@ -3,11 +3,11 @@ #include #include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include namespace dabfg @@ -22,7 +22,7 @@ class NodeExecutor {} void execute(int prev_frame, int curr_frame, multiplexing::Extents multiplexing_extents, - const ResourceScheduler::FrameEventsRef &events, const NodeStateDeltas &state_deltas); + const ResourceScheduler::FrameEventsRef &events, const sd::NodeStateDeltas &state_deltas); ExternalState externalState; @@ -31,7 +31,7 @@ class NodeExecutor IdIndexedMapping &resources); void processEvents(ResourceScheduler::NodeEventsRef events) const; - void applyState(const NodeStateDelta &state, int frame, int prev_frame) const; + void applyState(const sd::NodeStateDelta &state, int frame, int prev_frame) const; void applyBindings(const intermediate::BindingsMap &bindings, int frame, int prev_frame) const; void bindShaderVar(int bind_idx, const intermediate::Binding &binding, int frame, int prev_frame) const; diff --git a/prog/gameLibs/render/debug3dSolid/debug3dSolid.cpp b/prog/gameLibs/render/debug3dSolid/debug3dSolid.cpp index 35577a9d5..b105e3a75 100644 --- a/prog/gameLibs/render/debug3dSolid/debug3dSolid.cpp +++ b/prog/gameLibs/render/debug3dSolid/debug3dSolid.cpp @@ -157,8 +157,8 @@ static void create_shape_relem(dynrender::RElem &relem, 
SHAPE_TYPE shape) uint32_t ibSize = faceCount * sizeof(uint16_t) * 3; // Create the mesh - Vbuffer *vb = d3d::create_vb(vbSize, SBCF_MAYBELOST, "solid_sphere"); - Ibuffer *ib = d3d::create_ib(ibSize, SBCF_MAYBELOST, "solid_sphere_ib"); + Vbuffer *vb = d3d::create_vb(vbSize, 0, "solid_sphere"); + Ibuffer *ib = d3d::create_ib(ibSize, 0, "solid_sphere_ib"); uint8_t *vertices = NULL; uint8_t *indices = NULL; diff --git a/prog/gameLibs/render/debug3dSolid/debug3dSolidBuffered.cpp b/prog/gameLibs/render/debug3dSolid/debug3dSolidBuffered.cpp index d3565e011..3b753b81d 100644 --- a/prog/gameLibs/render/debug3dSolid/debug3dSolidBuffered.cpp +++ b/prog/gameLibs/render/debug3dSolid/debug3dSolidBuffered.cpp @@ -25,6 +25,7 @@ struct BufferedSphere static Tab buffered_mesh_list(midmem_ptr()); // Note: sorted by deadline static Tab buffered_sphere_list(midmem_ptr()); // Note: sorted by deadline static size_t current_frame = 0; +static bool last_frame_game_was_paused = false; void clear_buffered_debug_solids() { @@ -33,8 +34,9 @@ void clear_buffered_debug_solids() } static BufferedMesh get_mesh(const uint16_t *indices, int faces_count, const float *xyz_pos, int vertex_size, int vertices_count, - const TMatrix &tm, Color4 color, size_t frames) + const TMatrix &tm, Color4 color, size_t requested_frames) { + size_t frames = last_frame_game_was_paused ? 1 : requested_frames; BufferedMesh mesh = {current_frame + frames, frames, color, tm}; mesh.indices = Tab(); @@ -51,10 +53,11 @@ static BufferedMesh get_mesh(const uint16_t *indices, int faces_count, const flo } void draw_debug_solid_mesh_buffered(const uint16_t *indices, int faces_count, const float *xyz_pos, int vertex_size, - int vertices_count, const TMatrix &tm, Color4 color, size_t frames) + int vertices_count, const TMatrix &tm, Color4 color, size_t requested_frames) { if (faces_count <= 0 || vertex_size <= 0 || vertices_count <= 0) return; + size_t frames = last_frame_game_was_paused ? 
1 : requested_frames; size_t deadlineFrame = current_frame + frames; for (int last = buffered_mesh_list.size() - 1, i = last; i >= 0; --i) // lookup place to insert into (according to deadline) if (deadlineFrame >= buffered_mesh_list[i].deadlineFrame) @@ -71,8 +74,9 @@ void draw_debug_solid_mesh_buffered(const uint16_t *indices, int faces_count, co get_mesh(indices, faces_count, xyz_pos, vertex_size, vertices_count, tm, color, frames)); // insert first (oldest) } -void draw_debug_ball_buffered(const Point3 &sphere_c, float sphere_r, const Color4 &color, size_t frames) +void draw_debug_ball_buffered(const Point3 &sphere_c, float sphere_r, const Color4 &color, size_t requested_frames) { + size_t frames = last_frame_game_was_paused ? 1 : requested_frames; size_t deadlineFrame = current_frame + frames; for (int last = buffered_sphere_list.size() - 1, i = last; i >= 0; --i) // lookup place to insert into (according to deadline) if (deadlineFrame >= buffered_sphere_list[i].deadlineFrame) @@ -246,12 +250,13 @@ static int draw_buffered_spheres(dag::ConstSpan spheres, size_t return eraseNum; } -void flush_buffered_debug_meshes(bool decriment_buffer_frames) +void flush_buffered_debug_meshes(bool game_is_paused) { + last_frame_game_was_paused = game_is_paused; if (buffered_mesh_list.empty() && buffered_sphere_list.empty()) return; - if (!decriment_buffer_frames) + if (game_is_paused) { for (auto &mesh : buffered_mesh_list) if (mesh.bufferFrames > 1) diff --git a/prog/gameLibs/render/debugMesh/stub/debugMesh.cpp b/prog/gameLibs/render/debugMesh/stub/debugMesh.cpp index 0582eab33..7e5358762 100644 --- a/prog/gameLibs/render/debugMesh/stub/debugMesh.cpp +++ b/prog/gameLibs/render/debugMesh/stub/debugMesh.cpp @@ -11,4 +11,5 @@ void reset_debug_value() {} void activate_mesh_coloring_master_override() {} void deactivate_mesh_coloring_master_override() {} +Type debug_gbuffer_mode_to_type(DebugGbufferMode) { return Type::NONE; } } // namespace debug_mesh diff --git 
a/prog/gameLibs/render/deferredRT.cpp b/prog/gameLibs/render/deferredRT.cpp index b31ca3400..e32137443 100644 --- a/prog/gameLibs/render/deferredRT.cpp +++ b/prog/gameLibs/render/deferredRT.cpp @@ -59,8 +59,6 @@ void DeferredRT::setVar() uint32_t DeferredRT::recreateDepthInternal(uint32_t targetFmt) { - if (targetFmt & TEXCF_MULTISAMPLED) - targetFmt |= TEXCF_MSAATARGET; if (!(d3d::get_texformat_usage(targetFmt) & d3d::USAGE_DEPTH)) { debug("not supported depth format 0x%08x, fallback to TEXFMT_DEPTH24", targetFmt); @@ -71,7 +69,7 @@ uint32_t DeferredRT::recreateDepthInternal(uint32_t targetFmt) { TextureInfo tinfo; depth.getTex2D()->getinfo(tinfo, 0); - currentFmt = tinfo.cflg & (TEXFMT_MASK | TEXCF_MSAATARGET | TEXCF_MULTISAMPLED | TEXCF_TC_COMPATIBLE); + currentFmt = tinfo.cflg & (TEXFMT_MASK | TEXCF_SAMPLECOUNT_MASK | TEXCF_TC_COMPATIBLE); targetFmt |= currentFmt & (~TEXFMT_MASK); } if (currentFmt == targetFmt) diff --git a/prog/gameLibs/render/depthAOAboveRenderer.cpp b/prog/gameLibs/render/depthAOAboveRenderer.cpp index 0841b983d..9c9be640e 100644 --- a/prog/gameLibs/render/depthAOAboveRenderer.cpp +++ b/prog/gameLibs/render/depthAOAboveRenderer.cpp @@ -510,6 +510,7 @@ void DepthAOAboveRenderer::setInvalidVars() void DepthAOAboveRenderer::setVars() { worldAODepth.setVar(); + blurredDepth.setVar(); if (renderTransparent) { blurredDepthWithTransparency.setVar(); diff --git a/prog/gameLibs/render/distortion.cpp b/prog/gameLibs/render/distortion.cpp index afbc1ae7f..799bace55 100644 --- a/prog/gameLibs/render/distortion.cpp +++ b/prog/gameLibs/render/distortion.cpp @@ -68,7 +68,6 @@ void DistortionRenderer::startRenderDistortion(Texture *distortionOffsetTex) d3d::get_render_target(prevRt); d3d::set_render_target(distortionOffsetTex, 0); - d3d::set_backbuf_depth(); d3d::clearview(CLEAR_TARGET, E3DCOLOR_MAKE(0x80, 0x80, 0, 0), 0.f, 0); /*float prevHdrOverbright = ShaderGlobal::get_real_fast(hdrOverbrightGlobVarId); diff --git a/prog/gameLibs/render/dof/dof_ps.cpp 
b/prog/gameLibs/render/dof/dof_ps.cpp index ea3c006f4..3152ab262 100644 --- a/prog/gameLibs/render/dof/dof_ps.cpp +++ b/prog/gameLibs/render/dof/dof_ps.cpp @@ -121,7 +121,7 @@ void DepthOfFieldPS::initNear() if (dof_coc[i].getTex2D()) dof_coc[i].getTex2D()->texaddr(TEXADDR_CLAMP); } - if (useCoCAccumulation) + if (useCoCAccumulation && !useSimplifiedRendering) { dof_coc_history = dag::create_tex(NULL, originalWidth, originalHeight, flg | TEXFMT_R8, 1, "dof_coc_history"); if (dof_coc_history.getTex2D()) @@ -359,7 +359,8 @@ void DepthOfFieldPS::perform(const TextureIDPair &frame, const TextureIDPair &cl ShaderGlobal::set_texture(dof_far_layerVarId, hasFarDof ? dof_far_layer[0].getTexId() : BAD_TEXTUREID); ShaderGlobal::set_texture(dof_near_layerVarId, hasNearDof ? dof_near_layer[0].getTexId() : BAD_TEXTUREID); - ShaderGlobal::set_texture(dof_coc_historyVarId, useCoCAccumulation ? dof_coc_history.getTexId() : BAD_TEXTUREID); + ShaderGlobal::set_texture(dof_coc_historyVarId, + (useCoCAccumulation && !useSimplifiedRendering) ? 
dof_coc_history.getTexId() : BAD_TEXTUREID); // 1st downscale stage TextureInfo info; @@ -509,7 +510,7 @@ void DepthOfFieldPS::perform(const TextureIDPair &frame, const TextureIDPair &cl d3d::resource_barrier({dof_far_layer[0].getTex2D(), RB_RO_SRV | RB_STAGE_PIXEL, 0, 0}); } - if (useCoCAccumulation && dof_coc_history && dof_coc[0]) + if (useCoCAccumulation && !useSimplifiedRendering && dof_coc_history && dof_coc[0]) { eastl::swap(dof_coc_history, dof_coc[0]); dof_coc_history->texfilter(TEXFILTER_SMOOTH); diff --git a/prog/gameLibs/render/dynamicCube.cpp b/prog/gameLibs/render/dynamicCube.cpp deleted file mode 100644 index 122090a69..000000000 --- a/prog/gameLibs/render/dynamicCube.cpp +++ /dev/null @@ -1,203 +0,0 @@ -#include -#include -#include -#include -#include -#include <3d/dag_tex3d.h> -#include <3d/dag_drv3d.h> -#include <3d/dag_drv3d_platform.h> -#include -#include <3d/dag_drv3dCmd.h> - - -#define NUM_CUBE_FACES 6 - - -DynamicCube::DynamicCube(unsigned int num_mips, unsigned int size, float blur, unsigned format) : - dynamicCubeTex1(NULL), - dynamicCubeTex1Id(BAD_TEXTUREID), - dynamicCubeTex1VarId(-1), - dynamicCubeTex2(NULL), - dynamicCubeTex2Id(BAD_TEXTUREID), - dynamicCubeTex2VarId(-1), - dynamicCubeTex(NULL), - dynamicCubeTexId(BAD_TEXTUREID), - dynamicCubeTexVarId(-1), - dynamicCubeTexBlendVarId(-1), - dynamicCubeDepthTex(NULL), - numDynamicCubeTexMips(num_mips), - dynamicCubeSize(size), - dynamicCubeBlur(blur), - dynamicCubeFaceNo(-1), - blendCubesStage(-1) -{ - - blendCubesRenderer = create_postfx_renderer("blend_cubes"); - blurCubesRenderer = create_postfx_renderer("blur_cubes"); - blendCubesParamsVarId = get_shader_variable_id("blend_cubes_params"); - if (!blurCubesRenderer && num_mips != 1) - { - numDynamicCubeTexMips = 0; - logerr("we can't blur mips of dynamic Cube, use generate mipmap instead"); - } - - - uint32_t cubeFlags = format | TEXCF_RTARGET; - - dynamicCubeTex1 = d3d::create_cubetex(dynamicCubeSize, cubeFlags, 1, "dynamicCubeTex1"); - 
G_ASSERT(dynamicCubeTex1); - dynamicCubeTex1Id = register_managed_tex("dynamicCubeTex1", dynamicCubeTex1); - dynamicCubeTex1VarId = ::get_shader_variable_id("dynamic_cube_tex_1", true); - ShaderGlobal::set_texture(dynamicCubeTex1VarId, dynamicCubeTex1Id); - - dynamicCubeTex2 = d3d::create_cubetex(dynamicCubeSize, cubeFlags, 1, "dynamicCubeTex2"); - G_ASSERT(dynamicCubeTex2); - dynamicCubeTex2Id = register_managed_tex("dynamicCubeTex2", dynamicCubeTex2); - dynamicCubeTex2VarId = ::get_shader_variable_id("dynamic_cube_tex_2", true); - ShaderGlobal::set_texture(dynamicCubeTex2VarId, dynamicCubeTex2Id); - - dynamicCubeTex = d3d::create_cubetex(dynamicCubeSize, - cubeFlags | ((numDynamicCubeTexMips > 1 && !blurCubesRenderer) ? TEXCF_GENERATEMIPS : 0), numDynamicCubeTexMips, "dynamicCubeTex"); - G_ASSERT(dynamicCubeTex); - dynamicCubeTexId = register_managed_tex("dynamicCubeTex", dynamicCubeTex); - dynamicCubeTexVarId = ::get_shader_variable_id("dynamic_cube_tex", true); - ShaderGlobal::set_texture(dynamicCubeTexVarId, dynamicCubeTexId); - - dynamicCubeTexBlendVarId = ::get_shader_variable_id("dynamic_cube_tex_blend", true); - - ShaderGlobal::set_color4(get_shader_variable_id("dynamic_cube_params", true), - numDynamicCubeTexMips ? 
numDynamicCubeTexMips - 1 : num_mips, 0.f, 0.f, 0.f); -} - - -DynamicCube::~DynamicCube() -{ - ShaderGlobal::reset_from_vars_and_release_managed_tex_verified(dynamicCubeTex1Id, dynamicCubeTex1); - ShaderGlobal::reset_from_vars_and_release_managed_tex_verified(dynamicCubeTex2Id, dynamicCubeTex2); - ShaderGlobal::reset_from_vars_and_release_managed_tex_verified(dynamicCubeTexId, dynamicCubeTex); - - del_d3dres(dynamicCubeDepthTex); - - del_it(blendCubesRenderer); - del_it(blurCubesRenderer); -} - - -bool DynamicCube::refresh() -{ - if (dynamicCubeFaceNo != -1) - return false; - - eastl::swap(dynamicCubeTex1Id, dynamicCubeTex2Id); - eastl::swap(dynamicCubeTex1, dynamicCubeTex2); - ShaderGlobal::set_texture(dynamicCubeTex1VarId, dynamicCubeTex1Id); - - dynamicCubeFaceNo = 5; - - return true; -} - - -void DynamicCube::beforeRender(float blend_to_next, IRenderDynamicCubeFace *render) -{ - - - if (dynamicCubeFaceNo >= 0) - { - G_ASSERT(render); - - render->renderDynamicCubeFace(dynamicCubeTex2VarId, dynamicCubeTex2Id, dynamicCubeTex2, dynamicCubeFaceNo); - - dynamicCubeFaceNo--; - } - - float blend = 0.f; - if (dynamicCubeFaceNo < 0) - blend = min(1.f, blend_to_next); - - ShaderGlobal::set_real(dynamicCubeTexBlendVarId, blend); - - - // Blend and blur cubes. 
- - Driver3dRenderTarget prevRt; - d3d::get_render_target(prevRt); - ShaderGlobal::set_texture(dynamicCubeTexVarId, BAD_TEXTUREID); - - unsigned int fromMip = 0; - unsigned int toMip = max(1, (int)numDynamicCubeTexMips); - if (blendCubesStage >= 0) - { - if (blendCubesStage < NUM_CUBE_FACES) - { - fromMip = 0; - toMip = 1; - } - else - { - fromMip = 1; - toMip = max(1, (int)numDynamicCubeTexMips); - } - } - - for (unsigned int mipNo = fromMip; mipNo < toMip; mipNo++) - { - unsigned int fromFace = 0; - unsigned int toFace = NUM_CUBE_FACES; - if (blendCubesStage >= 0) - { - fromFace = blendCubesStage % NUM_CUBE_FACES; - toFace = fromFace + 1; - } - if (!blurCubesRenderer && mipNo) - { - // rely on generateMips - dynamicCubeTex->generateMips(); - continue; - } - for (unsigned int faceNo = fromFace; faceNo < toFace; faceNo++) - { - if (mipNo == 0) - { - d3d::set_render_target(prevRt); - d3d::set_render_target(dynamicCubeTex, faceNo, 0); - - ShaderGlobal::set_color4(blendCubesParamsVarId, 1.f / dynamicCubeSize, -1.f / dynamicCubeSize, faceNo, 0.f); - - blendCubesRenderer->render(); - } - else - { - d3d::set_render_target(prevRt); - d3d::set_render_target(dynamicCubeTex, faceNo, (int)mipNo); - - ShaderGlobal::set_color4(blendCubesParamsVarId, 0.f, 0.f, faceNo, dynamicCubeBlur * mipNo); - - blurCubesRenderer->render(); - } - } - } - blendCubesStage = (blendCubesStage + 1) % (2 * NUM_CUBE_FACES); - - d3d::set_render_target(prevRt); - ShaderGlobal::set_texture(dynamicCubeTexVarId, dynamicCubeTexId); -} - - -void DynamicCube::reset(IRenderDynamicCubeFace *render) -{ - dynamicCubeFaceNo = -1; - blendCubesStage = -1; - - for (unsigned int faceNo = 0; faceNo < 6; faceNo++) - { - render->renderDynamicCubeFace(dynamicCubeTex1VarId, dynamicCubeTex1Id, dynamicCubeTex1, faceNo); - } - - for (unsigned int faceNo = 0; faceNo < 6; faceNo++) - { - render->renderDynamicCubeFace(dynamicCubeTex2VarId, dynamicCubeTex2Id, dynamicCubeTex2, faceNo); - } - - beforeRender(0.f, render); -} diff 
--git a/prog/gameLibs/render/fx/demonPostFx.cpp b/prog/gameLibs/render/fx/demonPostFx.cpp index 2fe061f95..b3eea155e 100644 --- a/prog/gameLibs/render/fx/demonPostFx.cpp +++ b/prog/gameLibs/render/fx/demonPostFx.cpp @@ -262,9 +262,9 @@ void DemonPostFx::initHistogramVb() ShaderGlobal::set_int(adaptation_use_center_weightedVarId, centerWeightedAdaptation ? 1 : 0); #if HIST_USE_INSTANCING - histogramVb = dag::create_vb(sizeof(HistogramVertex), SBCF_MAYBELOST, "histogramInst"); + histogramVb = dag::create_vb(sizeof(HistogramVertex), 0, "histogramInst"); #else - histogramVb = dag::create_vb(vbSize.x * vbSize.y * sizeof(HistogramVertex), SBCF_MAYBELOST, "histogram"); + histogramVb = dag::create_vb(vbSize.x * vbSize.y * sizeof(HistogramVertex), 0, "histogram"); #endif d3d_err(histogramVb.getBuf()); histBufferFiller.vbSize = vbSize; diff --git a/prog/gameLibs/render/gpuBenchmark/gpuBenchmark.cpp b/prog/gameLibs/render/gpuBenchmark/gpuBenchmark.cpp index 47f4fd9bf..c19f88802 100644 --- a/prog/gameLibs/render/gpuBenchmark/gpuBenchmark.cpp +++ b/prog/gameLibs/render/gpuBenchmark/gpuBenchmark.cpp @@ -47,7 +47,7 @@ GpuBenchmark::GpuBenchmark() index_buffer::init_quads_32bit(QUADS_COUNT); int vbSize = sizeof(Point4) * 2 * VERT_COUNT; - vb.set(d3d::create_vb(vbSize, SBCF_MAYBELOST | SBCF_CPU_ACCESS_WRITE, "gpu_benchmark_vb"), "gpu_benchmark"); + vb.set(d3d::create_vb(vbSize, SBCF_CPU_ACCESS_WRITE, "gpu_benchmark_vb"), "gpu_benchmark"); G_ASSERT(vb.getBuf()); struct VertexData diff --git a/prog/gameLibs/render/gpuGrass.cpp b/prog/gameLibs/render/gpuGrass.cpp index f927e63ac..ff0c27429 100644 --- a/prog/gameLibs/render/gpuGrass.cpp +++ b/prog/gameLibs/render/gpuGrass.cpp @@ -360,8 +360,6 @@ void GPUGrassBase::init(const DataBlock &grassSettings) grassColorsVSCB = dag::buffers::create_persistent_cb(dag::buffers::cb_array_reg_count(GRASS_MAX_TYPES), "grass_colors_buf"); - // without SBCF_MAYBELOST buffers not recreated correctly - // 
grassInstancesCountConst.reset(d3d::create_sbuffer(16, 1, SBCF_MAYBELOST|SBCF_BIND_CONSTANT, 0, "grassInstancesCount"));// indirectSrv = get_shader_variable_id("grass_instances_indirect", true) >= 0; grassInstancesIndirect = dag::create_sbuffer(4, 5 + ((GRASS_LOD_COUNT + 3) & ~3), (indirectSrv ? SBCF_BIND_SHADER_RES : 0) | SBCF_UA_INDIRECT, 0, "grass_instances_indirect"); diff --git a/prog/gameLibs/render/hdrRender/hdrRender.cpp b/prog/gameLibs/render/hdrRender/hdrRender.cpp index 7e93d21f7..1456e1db8 100644 --- a/prog/gameLibs/render/hdrRender/hdrRender.cpp +++ b/prog/gameLibs/render/hdrRender/hdrRender.cpp @@ -146,10 +146,7 @@ const ManagedTex &hdrrender::get_render_target() { return float_rt_tex; } void hdrrender::set_render_target() { if (is_hdr_enabled()) - { d3d::set_render_target(float_rt_tex.getTex2D(), 0); - d3d::set_backbuf_depth(); - } else d3d::set_render_target(); } diff --git a/prog/gameLibs/render/heatHaze/heatHazeRenderer.cpp b/prog/gameLibs/render/heatHaze/heatHazeRenderer.cpp index 40e93665e..777696d74 100644 --- a/prog/gameLibs/render/heatHaze/heatHazeRenderer.cpp +++ b/prog/gameLibs/render/heatHaze/heatHazeRenderer.cpp @@ -233,7 +233,7 @@ void HeatHazeRenderer::applyHaze(double total_time, Texture *back_buffer, const void HeatHazeRenderer::render(double total_time, const RenderTargets &targets, const IPoint2 &back_buffer_resolution, int depth_tex_lod, RenderHazeParticlesCallback render_haze_particles, RenderCustomHazeCallback render_custom_haze, - RenderCustomHazeCallback render_ri_haze) + RenderCustomHazeCallback render_ri_haze, BeforeApplyHazeCallback before_apply_haze, AfterApplyHazeCallback after_apply_haze) { if (!areShadersValid()) return; @@ -254,9 +254,15 @@ void HeatHazeRenderer::render(double total_time, const RenderTargets &targets, c d3d::resource_barrier({targets.hazeColor, RB_RO_SRV | RB_STAGE_PIXEL, 0, 0}); } + if (before_apply_haze) + before_apply_haze(); + applyHaze(total_time, targets.backBuffer, targets.backBufferArea, 
targets.backBufferId, targets.resolvedDepthId, targets.hazeTemp, targets.hazeTempId, back_buffer_resolution); + if (after_apply_haze) + after_apply_haze(); + if (hazeFxRenderer) clearTargets(targets.hazeColor, targets.hazeOffset, targets.hazeDepth); } diff --git a/prog/gameLibs/render/heroWetness.cpp b/prog/gameLibs/render/heroWetness.cpp index 5c5b5a517..394c7ece6 100644 --- a/prog/gameLibs/render/heroWetness.cpp +++ b/prog/gameLibs/render/heroWetness.cpp @@ -123,7 +123,7 @@ void HeroWetness::init() int vbSize = getVbSize(); G_ASSERT(!waterHeightRendererVb); - waterHeightRendererVb = d3d::create_vb(sizeof(float) * vbSize, SBCF_MAYBELOST, "wetnessCalculationPostfx"); + waterHeightRendererVb = d3d::create_vb(sizeof(float) * vbSize, 0, "wetnessCalculationPostfx"); d3d_err(waterHeightRendererVb); fillVertexBuffer(vbSize); diff --git a/prog/gameLibs/render/jamfile b/prog/gameLibs/render/jamfile index 79b47a8d9..eb654fb89 100644 --- a/prog/gameLibs/render/jamfile +++ b/prog/gameLibs/render/jamfile @@ -16,6 +16,7 @@ AddIncludes = Sources = # bentCones.cpp + cur_view.cpp bigLightsShadows.cpp tdrGpu.cpp toroidalStaticShadows/toroidalStaticShadowCascade.cpp @@ -47,7 +48,6 @@ Sources = distortion.cpp antialiasing.cpp partialDxtRender.cpp - dynamicCube.cpp dynamicLightProbe.cpp randomGrass.cpp landMask.cpp diff --git a/prog/gameLibs/render/lightCube.cpp b/prog/gameLibs/render/lightCube.cpp index b64e9fcea..5e9bfe629 100644 --- a/prog/gameLibs/render/lightCube.cpp +++ b/prog/gameLibs/render/lightCube.cpp @@ -1,5 +1,4 @@ #include <3d/dag_drv3d.h> -#include #include <3d/dag_resPtr.h> #include #include @@ -100,6 +99,7 @@ class Cube d3d::set_render_target(i, nullptr, 0); } } + d3d::settex(7, nullptr); if (cubTex == tex.getCubeTex()) cubTex->texmiplevel(-1, -1); } diff --git a/prog/gameLibs/render/lruCollision/lruCollision.cpp b/prog/gameLibs/render/lruCollision/lruCollision.cpp index 09f4d7eba..348032c84 100644 --- a/prog/gameLibs/render/lruCollision/lruCollision.cpp +++ 
b/prog/gameLibs/render/lruCollision/lruCollision.cpp @@ -43,15 +43,15 @@ uint32_t LRURendinstCollision::getMaxBatchSize() const { return MAX_VOXELIZATION static constexpr uint32_t compute_vb_flags = (can_voxelize_in_compute ? SBCF_BIND_SHADER_RES | SBCF_MISC_ALLOW_RAW : 0); LRURendinstCollision::LRURendinstCollision() : - vbAllocator(SbufferHeapManager("vb_collision_", 4, compute_vb_flags | SBCF_MAYBELOST | SBCF_BIND_VERTEX)), - ibAllocator(SbufferHeapManager("ib_collision_", 4, compute_vb_flags | SBCF_MAYBELOST | SBCF_BIND_INDEX)) + vbAllocator(SbufferHeapManager("vb_collision_", 4, compute_vb_flags | SBCF_BIND_VERTEX)), + ibAllocator(SbufferHeapManager("ib_collision_", 4, compute_vb_flags | SBCF_BIND_INDEX)) { create_cubic_indices(make_span((uint8_t *)boxIndices.data(), COLLISION_BOX_INDICES_NUM * sizeof(uint16_t)), 1, false); supportNoOverwrite = d3d::get_driver_desc().caps.hasNoOverwriteOnShaderResourceBuffers; instanceTms = dag::create_sbuffer(sizeof(Point4), getMaxBatchSize() * 3, - (supportNoOverwrite ? (SBCF_DYNAMIC | SBCF_CPU_ACCESS_WRITE) : 0) | SBCF_MAYBELOST | SBCF_BIND_SHADER_RES | SBCF_MISC_STRUCTURED, - 0, "collision_voxelization_tm"); + (supportNoOverwrite ? 
(SBCF_DYNAMIC | SBCF_CPU_ACCESS_WRITE) : 0) | SBCF_BIND_SHADER_RES | SBCF_MISC_STRUCTURED, 0, + "collision_voxelization_tm"); // if we don't support nooverwrite do not use SBCF_DYNAMIC | SBCF_CPU_ACCESS_WRITE voxelizeCollisionMat.reset(new_shader_material_by_name_optional("voxelize_collision")); if (voxelizeCollisionMat) diff --git a/prog/gameLibs/render/nodeBasedShader/nodeBasedShader.cpp b/prog/gameLibs/render/nodeBasedShader/nodeBasedShader.cpp index bd73fd81c..06f4ed190 100644 --- a/prog/gameLibs/render/nodeBasedShader/nodeBasedShader.cpp +++ b/prog/gameLibs/render/nodeBasedShader/nodeBasedShader.cpp @@ -66,7 +66,7 @@ void NodeBasedShader::createShaders() shadersCache.reserve(shaderBins.size()); for (const auto &shaderBin : shaderBins) { - PROGRAM updatedProgram = d3d::create_program_cs(shaderBin.data()); + PROGRAM updatedProgram = d3d::create_program_cs(shaderBin.data(), CSPreloaded::No); G_ASSERTF(updatedProgram != BAD_PROGRAM, "Can't create compute shader"); shadersCache.emplace_back(new PROGRAM{updatedProgram}); // -V1023 } diff --git a/prog/gameLibs/render/partialDxtRender.cpp b/prog/gameLibs/render/partialDxtRender.cpp index 0b19f287a..70f0fd2c5 100644 --- a/prog/gameLibs/render/partialDxtRender.cpp +++ b/prog/gameLibs/render/partialDxtRender.cpp @@ -113,6 +113,7 @@ void PartialDxtRender(Texture *rt, Texture *rtn, int linesPerPart, int picWidth, { \ d3d::setwire(0); \ d3d::set_render_target(); \ + d3d::set_backbuf_depth(); \ dagor_work_cycle_flush_pending_frame(); \ dagor_draw_scene_and_gui(true, true); \ d3d::update_screen(); \ diff --git a/prog/gameLibs/render/rainX7/rainX7.cpp b/prog/gameLibs/render/rainX7/rainX7.cpp index 7f62961a1..6365ff37e 100644 --- a/prog/gameLibs/render/rainX7/rainX7.cpp +++ b/prog/gameLibs/render/rainX7/rainX7.cpp @@ -112,11 +112,11 @@ RainX7::RainX7(const DataBlock &blk) : // VB and IB. 
rendElem.numVert = 4 * numParticles; - vb = d3d::create_vb(rendElem.numVert * rendElem.stride, SBCF_MAYBELOST, "rainx7_vb"); + vb = d3d::create_vb(rendElem.numVert * rendElem.stride, 0, "rainx7_vb"); G_ASSERT(vb); rendElem.numPrim = 2 * numParticles; - ib = d3d::create_ib(3 * rendElem.numPrim * sizeof(unsigned short int), SBCF_MAYBELOST, "rainx7_ib"); + ib = d3d::create_ib(3 * rendElem.numPrim * sizeof(unsigned short int), 0, "rainx7_ib"); G_ASSERT(ib); diff --git a/prog/gameLibs/render/randomGrass.cpp b/prog/gameLibs/render/randomGrass.cpp index 689486497..1dbbe4b45 100644 --- a/prog/gameLibs/render/randomGrass.cpp +++ b/prog/gameLibs/render/randomGrass.cpp @@ -19,6 +19,7 @@ #include <3d/dag_drv3dCmd.h> #include <3d/dag_drv3d_platform.h> #include <3d/dag_dynAtlas.h> +#include #include #include #include @@ -715,9 +716,9 @@ void RandomGrass::resetLayersVB() String indexBufferName, vertexBufferName; indexBufferName.printf(0, "randomGrass::combinedLodIb%d", lodIdx); vertexBufferName.printf(0, "randomGrass::combinedLodVb%d", lodIdx); - combinedLod.lodIb = dag::create_ib(indexCount * sizeof(uint16_t), SBCF_MAYBELOST, indexBufferName); + combinedLod.lodIb = dag::create_ib(indexCount * sizeof(uint16_t), 0, indexBufferName); d3d_err(combinedLod.lodIb.getBuf()); - combinedLod.lodVb = dag::create_vb(vertexCount * sizeof(CellVertex), SBCF_MAYBELOST, vertexBufferName); + combinedLod.lodVb = dag::create_vb(vertexCount * sizeof(CellVertex), 0, vertexBufferName); d3d_err(combinedLod.lodVb.getBuf()); static CompiledShaderChannelId chan[] = {{SCTYPE_UINT1, SCUSAGE_POS, 0, 0}, {SCTYPE_UINT1, SCUSAGE_TC, 0, 0}}; diff --git a/prog/gameLibs/render/resourceSlot/detail/nodeDeclaration.h b/prog/gameLibs/render/resourceSlot/detail/nodeDeclaration.h index faf94b8ae..2d0fea3ce 100644 --- a/prog/gameLibs/render/resourceSlot/detail/nodeDeclaration.h +++ b/prog/gameLibs/render/resourceSlot/detail/nodeDeclaration.h @@ -41,5 +41,5 @@ struct NodeDeclaration NodeDeclaration(const NodeDeclaration &) 
= delete; NodeDeclaration &operator=(const NodeDeclaration &) = delete; }; - -} // namespace resource_slot::detail \ No newline at end of file +} // namespace resource_slot::detail +DAG_DECLARE_RELOCATABLE(resource_slot::detail::NodeDeclaration); \ No newline at end of file diff --git a/prog/gameLibs/render/shaderCacheWarmup/jamfile b/prog/gameLibs/render/shaderCacheWarmup/jamfile index d778600a7..d24ee19b5 100644 --- a/prog/gameLibs/render/shaderCacheWarmup/jamfile +++ b/prog/gameLibs/render/shaderCacheWarmup/jamfile @@ -7,7 +7,6 @@ TargetType = lib ; Target = gameLibs/render/shaderCacheWarmup.lib ; AddIncludes = - $(Root)/prog/daNetGame $(Root)/prog/engine $(Root)/prog/engine/sharedInclude $(Root)/prog/gameLibs/publicInclude diff --git a/prog/gameLibs/render/shaderCacheWarmup/shaderCacheWarmup.cpp b/prog/gameLibs/render/shaderCacheWarmup/shaderCacheWarmup.cpp index 1cda5c581..3e6d010dd 100644 --- a/prog/gameLibs/render/shaderCacheWarmup/shaderCacheWarmup.cpp +++ b/prog/gameLibs/render/shaderCacheWarmup/shaderCacheWarmup.cpp @@ -90,8 +90,10 @@ eastl::vector gather_shader_classes(const Ta #define MS(us) (int)((us)*1000) static const int COMPILE_TIME_LIMIT_DEFAULT = 100; +static const int TAIL_WAIT_TIME = 100; static bool is_loading_thread = false; static int compileTimeLimit = 0; +static int flushEveryNPipelines = 0; static int maxFlushPeriodMs = 0; enum { @@ -142,18 +144,42 @@ class DynamicD3DFlusher G_ASSERT(gpuLocked); compiledPipelinesCount += 1; - if (compiledPipelinesCount == flushEveryNPipelines) + size_t queued = d3d::driver_command(DRV3D_COMMAND_GET_PIPELINE_COMPILATION_QUEUE_LENGTH, nullptr, nullptr, nullptr); + bool perPipeFlush = compiledPipelinesCount == flushEveryNPipelines; + bool perQueueFlush = queued >= (flushEveryNPipelines * 2); + bool doFlush = perPipeFlush | perQueueFlush; + + int64_t timeus = 0; +// detailed profiling +#if 0 + if (perPipeFlush) + { + TIME_PROFILE_NAME(pp_flush, String(32, "per_pipe-%u-%u", compiledPipelinesCount, 
flushEveryNPipelines)); + timeus = flushCommands(); + } + else if (perQueueFlush) + { + TIME_PROFILE_NAME(pq_flush, String(32, "per_queued-%u-%u", queued, flushEveryNPipelines)); + timeus = flushCommands(); + } +#else + if (doFlush) + timeus = flushCommands(); +#endif + + if (perPipeFlush) { - const int64_t timeus = flushCommands(); if (timeus < compileTimeLimit) flushEveryNPipelines += 1; else flushEveryNPipelines = (flushEveryNPipelines > 2) ? flushEveryNPipelines - 2 : 1; + } + if (doFlush) + { int timems = timeus / 1000; if (timems > maxFlushPeriodMs) maxFlushPeriodMs = timems; - compiledPipelinesCount = 0; } } @@ -170,6 +196,7 @@ class DynamicD3DFlusher private: int64_t flushCommands() { + TIME_PROFILE(warmup_shaders_flush_cmds); if (is_loading_thread) { int64_t timeus = profile_ref_ticks(); @@ -204,12 +231,13 @@ class DynamicD3DFlusher private: bool gpuLocked = false; size_t compiledPipelinesCount = 0; - int flushEveryNPipelines = 1; }; class ShadersWarmup { public: + ShadersWarmup(DynamicD3DFlusher &flusher) : d3dFlusher(flusher) {} + void warmupShaders(const eastl::vector &shader_classes) { for (const auto &shaderClass : shader_classes) @@ -219,6 +247,7 @@ class ShadersWarmup private: PtrTab materials; PtrTab elemetns; + DynamicD3DFlusher &d3dFlusher; virtual ScriptedShaderMaterial *initMaterial(const shaderbindump::ShaderClass &sc) = 0; @@ -240,8 +269,6 @@ class ShadersWarmup } materials.push_back(ssm); - DynamicD3DFlusher d3dFlusher; - for (size_t staticVariantId = 0; staticVariantId < shaderClass.code.size(); ++staticVariantId) { const shaderbindump::ShaderCode &code = shaderClass.code[staticVariantId]; @@ -271,7 +298,7 @@ class ShadersWarmup class GraphicsShadersWarmup final : public ShadersWarmup { public: - GraphicsShadersWarmup() { initResources(); } + GraphicsShadersWarmup(DynamicD3DFlusher &flusher) : ShadersWarmup(flusher) { initResources(); } ~GraphicsShadersWarmup() { @@ -330,6 +357,9 @@ class GraphicsShadersWarmup final : public ShadersWarmup 
class ComputeShadersWarmup final : public ShadersWarmup { +public: + ComputeShadersWarmup(DynamicD3DFlusher &flusher) : ShadersWarmup(flusher) {} + private: virtual ScriptedShaderMaterial *initMaterial(const shaderbindump::ShaderClass &sc) override { @@ -369,6 +399,7 @@ void shadercache::warmup_shaders(const Tab &graphics_shader_names, compileTimeLimit = MS(::dgs_get_settings()->getBlockByNameEx("shadersWarmup")->getInt("compileTimeLimitMs", COMPILE_TIME_LIMIT_DEFAULT)); + flushEveryNPipelines = ::dgs_get_settings()->getBlockByNameEx("shadersWarmup")->getInt("flushEveryNPipelines", 1000); is_loading_thread = is_loading_thrd; @@ -376,24 +407,43 @@ void shadercache::warmup_shaders(const Tab &graphics_shader_names, if (is_loading_thread) replace_on_swap_cb(); - const auto graphicsShaderClasses = gather_shader_classes(graphics_shader_names); - if (!graphicsShaderClasses.empty()) { - TIME_PROFILE(warmup_shaders_gr); - GraphicsShadersWarmup graphics; - graphics.warmupShaders(graphicsShaderClasses); + DynamicD3DFlusher d3dFlusher; + d3d::driver_command(DRV3D_COMMAND_SET_PIPELINE_COMPILATION_TIME_BUDGET, (void *)0, nullptr, nullptr); + + const auto graphicsShaderClasses = gather_shader_classes(graphics_shader_names); + if (!graphicsShaderClasses.empty()) + { + TIME_PROFILE(warmup_shaders_gr); + GraphicsShadersWarmup graphics(d3dFlusher); + graphics.warmupShaders(graphicsShaderClasses); + } + + const auto computeShaderClasses = gather_shader_classes(compute_shader_names); + if (!computeShaderClasses.empty()) + { + TIME_PROFILE(warmup_shaders_cs); + ComputeShadersWarmup compute(d3dFlusher); + compute.warmupShaders(computeShaderClasses); + } + + d3d::driver_command(DRV3D_COMMAND_SET_PIPELINE_COMPILATION_TIME_BUDGET, (void *)-1, nullptr, nullptr); } - const auto computeShaderClasses = gather_shader_classes(compute_shader_names); - if (!computeShaderClasses.empty()) { - TIME_PROFILE(warmup_shaders_cs); - ComputeShadersWarmup compute; - 
compute.warmupShaders(computeShaderClasses); + TIME_PROFILE(warmup_shaders_async_complete_wait); + while (d3d::driver_command(DRV3D_COMMAND_GET_PIPELINE_COMPILATION_QUEUE_LENGTH, nullptr, nullptr, nullptr) > 0) + sleep_msec(TAIL_WAIT_TIME); } if (is_loading_thread) restore_on_swap_cb(); + + { + TIME_PROFILE(warmup_shaders_save); + d3d::driver_command(DRV3D_COMMAND_SAVE_PIPELINE_CACHE, nullptr, nullptr, nullptr); + } + time = profile_time_usec(time); // restrict precision to ms @@ -404,7 +454,4 @@ void shadercache::warmup_shaders(const Tab &graphics_shader_names, statsd::histogram("render.shader_cache_warmup.max_flush_s", maxFlushS); debug("shaders warmup took %f sec, max flush time %f sec", dltS, maxFlushS); - - TIME_PROFILE(warmup_shaders_save); - d3d::driver_command(DRV3D_COMMAND_SAVE_PIPELINE_CACHE, nullptr, nullptr, nullptr); } diff --git a/prog/gameLibs/render/shaders/bc1_compression.sh b/prog/gameLibs/render/shaders/bc1_compression.sh new file mode 100644 index 000000000..dc7acc931 --- /dev/null +++ b/prog/gameLibs/render/shaders/bc1_compression.sh @@ -0,0 +1,27 @@ +include "land_block_inc.sh" +include "bc_compression_inc.sh" + +texture src_tex; +float src_mip = 0; +float dst_mip = 0; + +int src_face = -1; +interval src_face : src_single_face < 0, src_cube; + + +shader bc1_srgbwrite_compressor, bc1_compressor +{ + USE_BC1_COMPRESSION(ps) + hlsl(ps) { + uint4 compress(half4 texels[16], half4 min_color, half4 max_color); + } + COMMON_BC_SHADER(shader == bc1_compressor, shader == bc1_srgbwrite_compressor) + hlsl(ps) { + uint4 compress(half4 texels[16], half4 min_color, half4 max_color) + { + find_base_colors( texels, min_color, max_color ); + refine_rgb_base_colors( texels, min_color, max_color ); + return pack_bc1_block( texels, min_color, max_color ); + } + } +} diff --git a/prog/gameLibs/render/shaders/bc3_compression.sh b/prog/gameLibs/render/shaders/bc3_compression.sh new file mode 100644 index 000000000..c28927842 --- /dev/null +++ 
b/prog/gameLibs/render/shaders/bc3_compression.sh @@ -0,0 +1,27 @@ +include "land_block_inc.sh" +include "bc_compression_inc.sh" + +texture src_tex; +float src_mip = 0; +float dst_mip = 0; + +int src_face = -1; +interval src_face : src_single_face < 0, src_cube; + + +shader bc3_compressor, bc3_srgbwrite_compressor +{ + USE_BC3_COMPRESSION(ps) + hlsl(ps) { + uint4 compress(half4 texels[16], half4 min_color, half4 max_color); + } + COMMON_BC_SHADER(false, shader == bc3_srgbwrite_compressor) + hlsl(ps) { + uint4 compress(half4 texels[16], half4 min_color, half4 max_color) + { + find_base_colors( texels, min_color, max_color ); + refine_rgb_base_colors( texels, min_color, max_color ); + return pack_bc3_block( texels, min_color, max_color ); + } + } +} diff --git a/prog/gameLibs/render/shaders/bc4_compression.sh b/prog/gameLibs/render/shaders/bc4_compression.sh new file mode 100644 index 000000000..fb7976285 --- /dev/null +++ b/prog/gameLibs/render/shaders/bc4_compression.sh @@ -0,0 +1,28 @@ +include "land_block_inc.sh" +include "bc_compression_inc.sh" + +texture src_tex; +float src_mip = 0; +float dst_mip = 0; + +int src_face = -1; +interval src_face : src_single_face < 0, src_cube; + + +shader bc4_compressor +{ + supports land_mesh_prepare_clipmap; + USE_BC4_COMPRESSION(ps) + hlsl(ps) { + uint4 compress(half4 texels[16], half4 min_color, half4 max_color); + } + COMMON_BC_SHADER(true, false) + hlsl(ps) { + uint4 compress(half4 texels[16], half4 min_color, half4 max_color) + { + find_base_colors( texels, min_color, max_color ); + refine_rgb_base_colors( texels, min_color, max_color ); + return pack_bc4_block( texels, min_color, max_color ); + } + } +} diff --git a/prog/gameLibs/render/shaders/bc5_compression.sh b/prog/gameLibs/render/shaders/bc5_compression.sh new file mode 100644 index 000000000..d12f00379 --- /dev/null +++ b/prog/gameLibs/render/shaders/bc5_compression.sh @@ -0,0 +1,27 @@ +include "land_block_inc.sh" +include "bc_compression_inc.sh" + +texture 
src_tex; +float src_mip = 0; +float dst_mip = 0; + +int src_face = -1; +interval src_face : src_single_face < 0, src_cube; + + +shader bc5_compressor +{ + USE_BC5_COMPRESSION(ps) + hlsl(ps) { + uint4 compress(half4 texels[16], half4 min_color, half4 max_color); + } + COMMON_BC_SHADER(false, false) + hlsl(ps) { + uint4 compress(half4 texels[16], half4 min_color, half4 max_color) + { + find_base_colors( texels, min_color, max_color ); + refine_rgb_base_colors( texels, min_color, max_color ); + return pack_bc5_block( texels, min_color, max_color ); + } + } +} diff --git a/prog/gameLibs/render/shaders/bc6h_compression.sh b/prog/gameLibs/render/shaders/bc6h_compression.sh index 1912c28cb..7d879bdf1 100644 --- a/prog/gameLibs/render/shaders/bc6h_compression.sh +++ b/prog/gameLibs/render/shaders/bc6h_compression.sh @@ -1,523 +1,29 @@ -int bc6h_compression_mode = 1; -interval bc6h_compression_mode : bc6h_compression_fast < 1, bc6h_compression_quality; +include "land_block_inc.sh" +include "bc_compression_inc.sh" +include "bc6h_compression_inc.sh" -macro USE_BC6H_COMPRESSION(code) +texture src_tex; +float src_mip = 0; +float dst_mip = 0; -hlsl { - #define HALF_MAX 65504.0f - #define BC6H_PATTERN_NUM 32 - - float CalcMSLE(float3 a, float3 b) - { - float3 err = log2((b + 1.0f) / (a + 1.0f));; - err = err * err; - return err.x + err.y + err.z; - } - - uint PatternFixupID(uint i) - { - uint ret = 15; - ret = ((3441033216U >> i) & 0x1) ? 2 : ret; - ret = ((845414400U >> i) & 0x1) ? 8 : ret; - return ret; - } - - uint Pattern(uint p, uint i) - { - uint p2 = p / 2; - uint p3 = p - p2 * 2; - - uint enc = 0; - enc = p2 == 0 ? 2290666700 : enc; - enc = p2 == 1 ? 3972591342 : enc; - enc = p2 == 2 ? 4276930688 : enc; - enc = p2 == 3 ? 3967876808 : enc; - enc = p2 == 4 ? 4293707776 : enc; - enc = p2 == 5 ? 3892379264 : enc; - enc = p2 == 6 ? 4278255592 : enc; - enc = p2 == 7 ? 4026597360 : enc; - enc = p2 == 8 ? 9369360 : enc; - enc = p2 == 9 ? 147747072 : enc; - enc = p2 == 10 ? 
1930428556 : enc; - enc = p2 == 11 ? 2362323200 : enc; - enc = p2 == 12 ? 823134348 : enc; - enc = p2 == 13 ? 913073766 : enc; - enc = p2 == 14 ? 267393000 : enc; - enc = p2 == 15 ? 966553998 : enc; - - enc = p3 ? enc >> 16 : enc; - uint ret = (enc >> i) & 0x1; - return ret; - } - - float3 Quantize7(float3 x) - { - return (f32tof16(x) * 128.0f) / (0x7bff + 1.0f); - } - - float3 Quantize9(float3 x) - { - return (f32tof16(x) * 512.0f) / (0x7bff + 1.0f); - } - - float3 Quantize10(float3 x) - { - return (f32tof16(x) * 1024.0f) / (0x7bff + 1.0f); - } - - float3 Unquantize7(float3 x) - { - return (x * 65536.0f + 0x8000) / 128.0f; - } - - float3 Unquantize9(float3 x) - { - return (x * 65536.0f + 0x8000) / 512.0f; - } - - float3 Unquantize10(float3 x) - { - return (x * 65536.0f + 0x8000) / 1024.0f; - } - - float3 FinishUnquantize(float3 endpoint0Unq, float3 endpoint1Unq, float weight) - { - float3 comp = (endpoint0Unq * (64.0f - weight) + endpoint1Unq * weight + 32.0f) * (31.0f / 4096.0f); - return f16tof32(uint3(comp)); - } - - void Swap(inout float3 a, inout float3 b) - { - float3 tmp = a; - a = b; - b = tmp; - } - - void Swap(inout float a, inout float b) - { - float tmp = a; - a = b; - b = tmp; - } - - uint ComputeIndex3(float texelPos, float endPoint0Pos, float endPoint1Pos) - { - float r = (texelPos - endPoint0Pos) / (endPoint1Pos - endPoint0Pos); - return (uint)clamp(r * 6.98182f + 0.00909f + 0.5f, 0.0f, 7.0f); - } - - uint ComputeIndex4(float texelPos, float endPoint0Pos, float endPoint1Pos) - { - float r = (texelPos - endPoint0Pos) / (endPoint1Pos - endPoint0Pos); - return (uint)clamp(r * 14.93333f + 0.03333f + 0.5f, 0.0f, 15.0f); - } - - void SignExtend(inout float3 v1, uint mask, uint signFlag) - { - int3 v = (int3)v1; - v.x = (v.x & mask) | (v.x < 0 ? signFlag : 0); - v.y = (v.y & mask) | (v.y < 0 ? signFlag : 0); - v.z = (v.z & mask) | (v.z < 0 ? 
signFlag : 0); - v1 = v; - } - - void EncodeP1(inout uint4 block, inout float blockMSLE, float3 texels[16]) - { - // compute endpoints (min/max RGB bbox) - float3 blockMin = texels[0]; - float3 blockMax = texels[0]; - uint i; - for (i = 1; i < 16; ++i) - { - blockMin = min(blockMin, texels[i]); - blockMax = max(blockMax, texels[i]); - } - - - // refine endpoints in log2 RGB space - float3 refinedBlockMin = blockMax; - float3 refinedBlockMax = blockMin; - for (i = 0; i < 16; ++i) - { - refinedBlockMin = min(refinedBlockMin, texels[i] == blockMin ? refinedBlockMin : texels[i]); - refinedBlockMax = max(refinedBlockMax, texels[i] == blockMax ? refinedBlockMax : texels[i]); - } - - float3 logBlockMax = log2(blockMax + 1.0f); - float3 logBlockMin = log2(blockMin + 1.0f); - float3 logRefinedBlockMax = log2(refinedBlockMax + 1.0f); - float3 logRefinedBlockMin = log2(refinedBlockMin + 1.0f); - float3 logBlockMaxExt = (logBlockMax - logBlockMin) * (1.0f / 32.0f); - logBlockMin += min(logRefinedBlockMin - logBlockMin, logBlockMaxExt); - logBlockMax -= min(logBlockMax - logRefinedBlockMax, logBlockMaxExt); - blockMin = exp2(logBlockMin) - 1.0f; - blockMax = exp2(logBlockMax) - 1.0f; - - float3 blockDir = blockMax - blockMin; - blockDir = blockDir / (blockDir.x + blockDir.y + blockDir.z); - - float3 endpoint0 = Quantize10(blockMin); - float3 endpoint1 = Quantize10(blockMax); - float endPoint0Pos = f32tof16(dot(blockMin, blockDir)); - float endPoint1Pos = f32tof16(dot(blockMax, blockDir)); - - - // check if endpoint swap is required - float fixupTexelPos = f32tof16(dot(texels[0], blockDir)); - uint fixupIndex = ComputeIndex4(fixupTexelPos, endPoint0Pos, endPoint1Pos); - if (fixupIndex > 7) - { - Swap(endPoint0Pos, endPoint1Pos); - Swap(endpoint0, endpoint1); - } - - // compute indices - uint indices[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - for (i = 0; i < 16; ++i) - { - float texelPos = f32tof16(dot(texels[i], blockDir)); - indices[i] = ComputeIndex4(texelPos, 
endPoint0Pos, endPoint1Pos); - } - - // compute compression error (MSLE) - float3 endpoint0Unq = Unquantize10(endpoint0); - float3 endpoint1Unq = Unquantize10(endpoint1); - float msle = 0.0f; - for (i = 0; i < 16; ++i) - { - float weight = floor((indices[i] * 64.0f) / 15.0f + 0.5f); - float3 texelUnc = FinishUnquantize(endpoint0Unq, endpoint1Unq, weight); - - msle += CalcMSLE(texels[i], texelUnc); - } +int src_face = -1; +interval src_face : src_single_face < 0, src_cube; - // encode block for mode 11 - blockMSLE = msle; - block.x = 0x03; - - // endpoints - block.x |= (uint)endpoint0.x << 5; - block.x |= (uint)endpoint0.y << 15; - block.x |= (uint)endpoint0.z << 25; - block.y |= (uint)endpoint0.z >> 7; - block.y |= (uint)endpoint1.x << 3; - block.y |= (uint)endpoint1.y << 13; - block.y |= (uint)endpoint1.z << 23; - block.z |= (uint)endpoint1.z >> 9; - - // indices - block.z |= indices[0] << 1; - block.z |= indices[1] << 4; - block.z |= indices[2] << 8; - block.z |= indices[3] << 12; - block.z |= indices[4] << 16; - block.z |= indices[5] << 20; - block.z |= indices[6] << 24; - block.z |= indices[7] << 28; - block.w |= indices[8] << 0; - block.w |= indices[9] << 4; - block.w |= indices[10] << 8; - block.w |= indices[11] << 12; - block.w |= indices[12] << 16; - block.w |= indices[13] << 20; - block.w |= indices[14] << 24; - block.w |= indices[15] << 28; +shader bc6h_compressor +{ + USE_BC6H_COMPRESSION(ps) + hlsl(ps) { + uint4 compress(half4 texels[16], half4 min_color, half4 max_color); } - - void EncodeP2Pattern(inout uint4 block, inout float blockMSLE, int pattern, float3 texels[16]) - { - float3 p0BlockMin = float3(HALF_MAX, HALF_MAX, HALF_MAX); - float3 p0BlockMax = float3(0.0f, 0.0f, 0.0f); - float3 p1BlockMin = float3(HALF_MAX, HALF_MAX, HALF_MAX); - float3 p1BlockMax = float3(0.0f, 0.0f, 0.0f); - - uint i; - for (i = 0; i < 16; ++i) + COMMON_BC_SHADER(false, false) + hlsl(ps) { + uint4 compress(half4 texels[16], half4 min_color, half4 max_color) { - uint 
paletteID = Pattern(pattern, i); - if (paletteID == 0) - { - p0BlockMin = min(p0BlockMin, texels[i]); - p0BlockMax = max(p0BlockMax, texels[i]); - } - else - { - p1BlockMin = min(p1BlockMin, texels[i]); - p1BlockMax = max(p1BlockMax, texels[i]); - } + //DXGI_FORMAT_BC6H_UF16 fmt is used, so limit negative values + for ( int i = 0; i < 16; ++i ) + texels[i].rgb = max( 0., texels[i].rgb ); + return pack_bc6h_block(texels); } - - float3 p0BlockDir = p0BlockMax - p0BlockMin; - float3 p1BlockDir = p1BlockMax - p1BlockMin; - p0BlockDir = p0BlockDir / (p0BlockDir.x + p0BlockDir.y + p0BlockDir.z); - p1BlockDir = p1BlockDir / (p1BlockDir.x + p1BlockDir.y + p1BlockDir.z); - - - float p0Endpoint0Pos = f32tof16(dot(p0BlockMin, p0BlockDir)); - float p0Endpoint1Pos = f32tof16(dot(p0BlockMax, p0BlockDir)); - float p1Endpoint0Pos = f32tof16(dot(p1BlockMin, p1BlockDir)); - float p1Endpoint1Pos = f32tof16(dot(p1BlockMax, p1BlockDir)); - - - uint fixupID = PatternFixupID(pattern); - float p0FixupTexelPos = f32tof16(dot(texels[0], p0BlockDir)); - float p1FixupTexelPos = f32tof16(dot(texels[fixupID], p1BlockDir)); - uint p0FixupIndex = ComputeIndex3(p0FixupTexelPos, p0Endpoint0Pos, p0Endpoint1Pos); - uint p1FixupIndex = ComputeIndex3(p1FixupTexelPos, p1Endpoint0Pos, p1Endpoint1Pos); - if (p0FixupIndex > 3) - { - Swap(p0Endpoint0Pos, p0Endpoint1Pos); - Swap(p0BlockMin, p0BlockMax); - } - if (p1FixupIndex > 3) - { - Swap(p1Endpoint0Pos, p1Endpoint1Pos); - Swap(p1BlockMin, p1BlockMax); - } - - uint indices[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - for (i = 0; i < 16; ++i) - { - float p0TexelPos = f32tof16(dot(texels[i], p0BlockDir)); - float p1TexelPos = f32tof16(dot(texels[i], p1BlockDir)); - uint p0Index = ComputeIndex3(p0TexelPos, p0Endpoint0Pos, p0Endpoint1Pos); - uint p1Index = ComputeIndex3(p1TexelPos, p1Endpoint0Pos, p1Endpoint1Pos); - - uint paletteID = Pattern(pattern, i); - indices[i] = paletteID == 0 ? 
p0Index : p1Index; - } - - float3 endpoint760 = floor(Quantize7(p0BlockMin)); - float3 endpoint761 = floor(Quantize7(p0BlockMax)); - float3 endpoint762 = floor(Quantize7(p1BlockMin)); - float3 endpoint763 = floor(Quantize7(p1BlockMax)); - - float3 endpoint950 = floor(Quantize9(p0BlockMin)); - float3 endpoint951 = floor(Quantize9(p0BlockMax)); - float3 endpoint952 = floor(Quantize9(p1BlockMin)); - float3 endpoint953 = floor(Quantize9(p1BlockMax)); - - endpoint761 = endpoint761 - endpoint760; - endpoint762 = endpoint762 - endpoint760; - endpoint763 = endpoint763 - endpoint760; - - endpoint951 = endpoint951 - endpoint950; - endpoint952 = endpoint952 - endpoint950; - endpoint953 = endpoint953 - endpoint950; - - int maxVal76 = 0x1F; - endpoint761 = clamp(endpoint761, -maxVal76, maxVal76); - endpoint762 = clamp(endpoint762, -maxVal76, maxVal76); - endpoint763 = clamp(endpoint763, -maxVal76, maxVal76); - - int maxVal95 = 0xF; - endpoint951 = clamp(endpoint951, -maxVal95, maxVal95); - endpoint952 = clamp(endpoint952, -maxVal95, maxVal95); - endpoint953 = clamp(endpoint953, -maxVal95, maxVal95); - - float3 endpoint760Unq = Unquantize7(endpoint760); - float3 endpoint761Unq = Unquantize7(endpoint760 + endpoint761); - float3 endpoint762Unq = Unquantize7(endpoint760 + endpoint762); - float3 endpoint763Unq = Unquantize7(endpoint760 + endpoint763); - float3 endpoint950Unq = Unquantize9(endpoint950); - float3 endpoint951Unq = Unquantize9(endpoint950 + endpoint951); - float3 endpoint952Unq = Unquantize9(endpoint950 + endpoint952); - float3 endpoint953Unq = Unquantize9(endpoint950 + endpoint953); - - float msle76 = 0.0f; - float msle95 = 0.0f; - for (i = 0; i < 16; ++i) - { - uint paletteID = Pattern(pattern, i); - - float3 tmp760Unq = paletteID == 0 ? endpoint760Unq : endpoint762Unq; - float3 tmp761Unq = paletteID == 0 ? endpoint761Unq : endpoint763Unq; - float3 tmp950Unq = paletteID == 0 ? endpoint950Unq : endpoint952Unq; - float3 tmp951Unq = paletteID == 0 ? 
endpoint951Unq : endpoint953Unq; - - float weight = floor((indices[i] * 64.0f) / 7.0f + 0.5f); - float3 texelUnc76 = FinishUnquantize(tmp760Unq, tmp761Unq, weight); - float3 texelUnc95 = FinishUnquantize(tmp950Unq, tmp951Unq, weight); - - msle76 += CalcMSLE(texels[i], texelUnc76); - msle95 += CalcMSLE(texels[i], texelUnc95); - } - - SignExtend(endpoint761, 0x1F, 0x20); - SignExtend(endpoint762, 0x1F, 0x20); - SignExtend(endpoint763, 0x1F, 0x20); - - SignExtend(endpoint951, 0xF, 0x10); - SignExtend(endpoint952, 0xF, 0x10); - SignExtend(endpoint953, 0xF, 0x10); - - // encode block - float p2MSLE = min(msle76, msle95); - if (p2MSLE < blockMSLE) - { - blockMSLE = p2MSLE; - block = uint4(0, 0, 0, 0); - - if (p2MSLE == msle76) - { - // 7.6 - block.x = 0x1; - block.x |= ((uint)endpoint762.y & 0x20) >> 3; - block.x |= ((uint)endpoint763.y & 0x10) >> 1; - block.x |= ((uint)endpoint763.y & 0x20) >> 1; - block.x |= (uint)endpoint760.x << 5; - block.x |= ((uint)endpoint763.z & 0x01) << 12; - block.x |= ((uint)endpoint763.z & 0x02) << 12; - block.x |= ((uint)endpoint762.z & 0x10) << 10; - block.x |= (uint)endpoint760.y << 15; - block.x |= ((uint)endpoint762.z & 0x20) << 17; - block.x |= ((uint)endpoint763.z & 0x04) << 21; - block.x |= ((uint)endpoint762.y & 0x10) << 20; - block.x |= (uint)endpoint760.z << 25; - block.y |= ((uint)endpoint763.z & 0x08) >> 3; - block.y |= ((uint)endpoint763.z & 0x20) >> 4; - block.y |= ((uint)endpoint763.z & 0x10) >> 2; - block.y |= (uint)endpoint761.x << 3; - block.y |= ((uint)endpoint762.y & 0x0F) << 9; - block.y |= (uint)endpoint761.y << 13; - block.y |= ((uint)endpoint763.y & 0x0F) << 19; - block.y |= (uint)endpoint761.z << 23; - block.y |= ((uint)endpoint762.z & 0x07) << 29; - block.z |= ((uint)endpoint762.z & 0x08) >> 3; - block.z |= (uint)endpoint762.x << 1; - block.z |= (uint)endpoint763.x << 7; - } - else - { - // 9.5 - block.x = 0xE; - block.x |= (uint)endpoint950.x << 5; - block.x |= ((uint)endpoint952.z & 0x10) << 10; - block.x |= 
(uint)endpoint950.y << 15; - block.x |= ((uint)endpoint952.y & 0x10) << 20; - block.x |= (uint)endpoint950.z << 25; - block.y |= (uint)endpoint950.z >> 7; - block.y |= ((uint)endpoint953.z & 0x10) >> 2; - block.y |= (uint)endpoint951.x << 3; - block.y |= ((uint)endpoint953.y & 0x10) << 4; - block.y |= ((uint)endpoint952.y & 0x0F) << 9; - block.y |= (uint)endpoint951.y << 13; - block.y |= ((uint)endpoint953.z & 0x01) << 18; - block.y |= ((uint)endpoint953.y & 0x0F) << 19; - block.y |= (uint)endpoint951.z << 23; - block.y |= ((uint)endpoint953.z & 0x02) << 27; - block.y |= (uint)endpoint952.z << 29; - block.z |= ((uint)endpoint952.z & 0x08) >> 3; - block.z |= (uint)endpoint952.x << 1; - block.z |= ((uint)endpoint953.z & 0x04) << 4; - block.z |= (uint)endpoint953.x << 7; - block.z |= ((uint)endpoint953.z & 0x08) << 9; - } - - block.z |= pattern << 13; - uint blockFixupID = PatternFixupID(pattern); - if (blockFixupID == 15) - { - block.z |= indices[0] << 18; - block.z |= indices[1] << 20; - block.z |= indices[2] << 23; - block.z |= indices[3] << 26; - block.z |= indices[4] << 29; - block.w |= indices[5] << 0; - block.w |= indices[6] << 3; - block.w |= indices[7] << 6; - block.w |= indices[8] << 9; - block.w |= indices[9] << 12; - block.w |= indices[10] << 15; - block.w |= indices[11] << 18; - block.w |= indices[12] << 21; - block.w |= indices[13] << 24; - block.w |= indices[14] << 27; - block.w |= indices[15] << 30; - } - else if (blockFixupID == 2) - { - block.z |= indices[0] << 18; - block.z |= indices[1] << 20; - block.z |= indices[2] << 23; - block.z |= indices[3] << 25; - block.z |= indices[4] << 28; - block.z |= indices[5] << 31; - block.w |= indices[5] >> 1; - block.w |= indices[6] << 2; - block.w |= indices[7] << 5; - block.w |= indices[8] << 8; - block.w |= indices[9] << 11; - block.w |= indices[10] << 14; - block.w |= indices[11] << 17; - block.w |= indices[12] << 20; - block.w |= indices[13] << 23; - block.w |= indices[14] << 26; - block.w |= indices[15] << 
29; - } - else - { - block.z |= indices[0] << 18; - block.z |= indices[1] << 20; - block.z |= indices[2] << 23; - block.z |= indices[3] << 26; - block.z |= indices[4] << 29; - block.w |= indices[5] << 0; - block.w |= indices[6] << 3; - block.w |= indices[7] << 6; - block.w |= indices[8] << 9; - block.w |= indices[9] << 11; - block.w |= indices[10] << 14; - block.w |= indices[11] << 17; - block.w |= indices[12] << 20; - block.w |= indices[13] << 23; - block.w |= indices[14] << 26; - block.w |= indices[15] << 29; - } - } - } - - uint4 pack_bc6h_block(float4 texels[16]) - { - // gather texels for current 4x4 block - // 0 1 2 3 - // 4 5 6 7 - // 8 9 10 11 - // 12 13 14 15 - - float3 rgb_texels[16]; -#define COPY_ITERATION(n) rgb_texels[n] = texels[n].rgb; - COPY_ITERATION(0); - COPY_ITERATION(1); - COPY_ITERATION(2); - COPY_ITERATION(3); - COPY_ITERATION(4); - COPY_ITERATION(5); - COPY_ITERATION(6); - COPY_ITERATION(7); - COPY_ITERATION(8); - COPY_ITERATION(9); - COPY_ITERATION(10); - COPY_ITERATION(11); - COPY_ITERATION(12); - COPY_ITERATION(13); - COPY_ITERATION(14); - COPY_ITERATION(15); -#undef COPY_ITERATION - - uint4 block = uint4(0, 0, 0, 0); - float blockMSLE = 0.0f; - - EncodeP1(block, blockMSLE, rgb_texels); -##if bc6h_compression_mode == bc6h_compression_quality && !hardware.vulkan//HLSLcc or Vulkan generates shit - for (uint i = 0; i < 32; ++i) - { - EncodeP2Pattern(block, blockMSLE, i, rgb_texels); - } -##endif - - return block; } } -endmacro diff --git a/prog/gameLibs/render/shaders/bc6h_compression_inc.sh b/prog/gameLibs/render/shaders/bc6h_compression_inc.sh new file mode 100644 index 000000000..1912c28cb --- /dev/null +++ b/prog/gameLibs/render/shaders/bc6h_compression_inc.sh @@ -0,0 +1,523 @@ +int bc6h_compression_mode = 1; +interval bc6h_compression_mode : bc6h_compression_fast < 1, bc6h_compression_quality; + +macro USE_BC6H_COMPRESSION(code) + +hlsl { + #define HALF_MAX 65504.0f + #define BC6H_PATTERN_NUM 32 + + float CalcMSLE(float3 a, float3 b) + 
{ + float3 err = log2((b + 1.0f) / (a + 1.0f));; + err = err * err; + return err.x + err.y + err.z; + } + + uint PatternFixupID(uint i) + { + uint ret = 15; + ret = ((3441033216U >> i) & 0x1) ? 2 : ret; + ret = ((845414400U >> i) & 0x1) ? 8 : ret; + return ret; + } + + uint Pattern(uint p, uint i) + { + uint p2 = p / 2; + uint p3 = p - p2 * 2; + + uint enc = 0; + enc = p2 == 0 ? 2290666700 : enc; + enc = p2 == 1 ? 3972591342 : enc; + enc = p2 == 2 ? 4276930688 : enc; + enc = p2 == 3 ? 3967876808 : enc; + enc = p2 == 4 ? 4293707776 : enc; + enc = p2 == 5 ? 3892379264 : enc; + enc = p2 == 6 ? 4278255592 : enc; + enc = p2 == 7 ? 4026597360 : enc; + enc = p2 == 8 ? 9369360 : enc; + enc = p2 == 9 ? 147747072 : enc; + enc = p2 == 10 ? 1930428556 : enc; + enc = p2 == 11 ? 2362323200 : enc; + enc = p2 == 12 ? 823134348 : enc; + enc = p2 == 13 ? 913073766 : enc; + enc = p2 == 14 ? 267393000 : enc; + enc = p2 == 15 ? 966553998 : enc; + + enc = p3 ? enc >> 16 : enc; + uint ret = (enc >> i) & 0x1; + return ret; + } + + float3 Quantize7(float3 x) + { + return (f32tof16(x) * 128.0f) / (0x7bff + 1.0f); + } + + float3 Quantize9(float3 x) + { + return (f32tof16(x) * 512.0f) / (0x7bff + 1.0f); + } + + float3 Quantize10(float3 x) + { + return (f32tof16(x) * 1024.0f) / (0x7bff + 1.0f); + } + + float3 Unquantize7(float3 x) + { + return (x * 65536.0f + 0x8000) / 128.0f; + } + + float3 Unquantize9(float3 x) + { + return (x * 65536.0f + 0x8000) / 512.0f; + } + + float3 Unquantize10(float3 x) + { + return (x * 65536.0f + 0x8000) / 1024.0f; + } + + float3 FinishUnquantize(float3 endpoint0Unq, float3 endpoint1Unq, float weight) + { + float3 comp = (endpoint0Unq * (64.0f - weight) + endpoint1Unq * weight + 32.0f) * (31.0f / 4096.0f); + return f16tof32(uint3(comp)); + } + + void Swap(inout float3 a, inout float3 b) + { + float3 tmp = a; + a = b; + b = tmp; + } + + void Swap(inout float a, inout float b) + { + float tmp = a; + a = b; + b = tmp; + } + + uint ComputeIndex3(float texelPos, float 
endPoint0Pos, float endPoint1Pos) + { + float r = (texelPos - endPoint0Pos) / (endPoint1Pos - endPoint0Pos); + return (uint)clamp(r * 6.98182f + 0.00909f + 0.5f, 0.0f, 7.0f); + } + + uint ComputeIndex4(float texelPos, float endPoint0Pos, float endPoint1Pos) + { + float r = (texelPos - endPoint0Pos) / (endPoint1Pos - endPoint0Pos); + return (uint)clamp(r * 14.93333f + 0.03333f + 0.5f, 0.0f, 15.0f); + } + + void SignExtend(inout float3 v1, uint mask, uint signFlag) + { + int3 v = (int3)v1; + v.x = (v.x & mask) | (v.x < 0 ? signFlag : 0); + v.y = (v.y & mask) | (v.y < 0 ? signFlag : 0); + v.z = (v.z & mask) | (v.z < 0 ? signFlag : 0); + v1 = v; + } + + void EncodeP1(inout uint4 block, inout float blockMSLE, float3 texels[16]) + { + // compute endpoints (min/max RGB bbox) + float3 blockMin = texels[0]; + float3 blockMax = texels[0]; + uint i; + for (i = 1; i < 16; ++i) + { + blockMin = min(blockMin, texels[i]); + blockMax = max(blockMax, texels[i]); + } + + + // refine endpoints in log2 RGB space + float3 refinedBlockMin = blockMax; + float3 refinedBlockMax = blockMin; + for (i = 0; i < 16; ++i) + { + refinedBlockMin = min(refinedBlockMin, texels[i] == blockMin ? refinedBlockMin : texels[i]); + refinedBlockMax = max(refinedBlockMax, texels[i] == blockMax ? 
refinedBlockMax : texels[i]); + } + + float3 logBlockMax = log2(blockMax + 1.0f); + float3 logBlockMin = log2(blockMin + 1.0f); + float3 logRefinedBlockMax = log2(refinedBlockMax + 1.0f); + float3 logRefinedBlockMin = log2(refinedBlockMin + 1.0f); + float3 logBlockMaxExt = (logBlockMax - logBlockMin) * (1.0f / 32.0f); + logBlockMin += min(logRefinedBlockMin - logBlockMin, logBlockMaxExt); + logBlockMax -= min(logBlockMax - logRefinedBlockMax, logBlockMaxExt); + blockMin = exp2(logBlockMin) - 1.0f; + blockMax = exp2(logBlockMax) - 1.0f; + + float3 blockDir = blockMax - blockMin; + blockDir = blockDir / (blockDir.x + blockDir.y + blockDir.z); + + float3 endpoint0 = Quantize10(blockMin); + float3 endpoint1 = Quantize10(blockMax); + float endPoint0Pos = f32tof16(dot(blockMin, blockDir)); + float endPoint1Pos = f32tof16(dot(blockMax, blockDir)); + + + // check if endpoint swap is required + float fixupTexelPos = f32tof16(dot(texels[0], blockDir)); + uint fixupIndex = ComputeIndex4(fixupTexelPos, endPoint0Pos, endPoint1Pos); + if (fixupIndex > 7) + { + Swap(endPoint0Pos, endPoint1Pos); + Swap(endpoint0, endpoint1); + } + + // compute indices + uint indices[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + for (i = 0; i < 16; ++i) + { + float texelPos = f32tof16(dot(texels[i], blockDir)); + indices[i] = ComputeIndex4(texelPos, endPoint0Pos, endPoint1Pos); + } + + // compute compression error (MSLE) + float3 endpoint0Unq = Unquantize10(endpoint0); + float3 endpoint1Unq = Unquantize10(endpoint1); + float msle = 0.0f; + for (i = 0; i < 16; ++i) + { + float weight = floor((indices[i] * 64.0f) / 15.0f + 0.5f); + float3 texelUnc = FinishUnquantize(endpoint0Unq, endpoint1Unq, weight); + + msle += CalcMSLE(texels[i], texelUnc); + } + + + // encode block for mode 11 + blockMSLE = msle; + block.x = 0x03; + + // endpoints + block.x |= (uint)endpoint0.x << 5; + block.x |= (uint)endpoint0.y << 15; + block.x |= (uint)endpoint0.z << 25; + block.y |= (uint)endpoint0.z >> 7; + 
block.y |= (uint)endpoint1.x << 3; + block.y |= (uint)endpoint1.y << 13; + block.y |= (uint)endpoint1.z << 23; + block.z |= (uint)endpoint1.z >> 9; + + // indices + block.z |= indices[0] << 1; + block.z |= indices[1] << 4; + block.z |= indices[2] << 8; + block.z |= indices[3] << 12; + block.z |= indices[4] << 16; + block.z |= indices[5] << 20; + block.z |= indices[6] << 24; + block.z |= indices[7] << 28; + block.w |= indices[8] << 0; + block.w |= indices[9] << 4; + block.w |= indices[10] << 8; + block.w |= indices[11] << 12; + block.w |= indices[12] << 16; + block.w |= indices[13] << 20; + block.w |= indices[14] << 24; + block.w |= indices[15] << 28; + } + + void EncodeP2Pattern(inout uint4 block, inout float blockMSLE, int pattern, float3 texels[16]) + { + float3 p0BlockMin = float3(HALF_MAX, HALF_MAX, HALF_MAX); + float3 p0BlockMax = float3(0.0f, 0.0f, 0.0f); + float3 p1BlockMin = float3(HALF_MAX, HALF_MAX, HALF_MAX); + float3 p1BlockMax = float3(0.0f, 0.0f, 0.0f); + + uint i; + for (i = 0; i < 16; ++i) + { + uint paletteID = Pattern(pattern, i); + if (paletteID == 0) + { + p0BlockMin = min(p0BlockMin, texels[i]); + p0BlockMax = max(p0BlockMax, texels[i]); + } + else + { + p1BlockMin = min(p1BlockMin, texels[i]); + p1BlockMax = max(p1BlockMax, texels[i]); + } + } + + float3 p0BlockDir = p0BlockMax - p0BlockMin; + float3 p1BlockDir = p1BlockMax - p1BlockMin; + p0BlockDir = p0BlockDir / (p0BlockDir.x + p0BlockDir.y + p0BlockDir.z); + p1BlockDir = p1BlockDir / (p1BlockDir.x + p1BlockDir.y + p1BlockDir.z); + + + float p0Endpoint0Pos = f32tof16(dot(p0BlockMin, p0BlockDir)); + float p0Endpoint1Pos = f32tof16(dot(p0BlockMax, p0BlockDir)); + float p1Endpoint0Pos = f32tof16(dot(p1BlockMin, p1BlockDir)); + float p1Endpoint1Pos = f32tof16(dot(p1BlockMax, p1BlockDir)); + + + uint fixupID = PatternFixupID(pattern); + float p0FixupTexelPos = f32tof16(dot(texels[0], p0BlockDir)); + float p1FixupTexelPos = f32tof16(dot(texels[fixupID], p1BlockDir)); + uint p0FixupIndex = 
ComputeIndex3(p0FixupTexelPos, p0Endpoint0Pos, p0Endpoint1Pos); + uint p1FixupIndex = ComputeIndex3(p1FixupTexelPos, p1Endpoint0Pos, p1Endpoint1Pos); + if (p0FixupIndex > 3) + { + Swap(p0Endpoint0Pos, p0Endpoint1Pos); + Swap(p0BlockMin, p0BlockMax); + } + if (p1FixupIndex > 3) + { + Swap(p1Endpoint0Pos, p1Endpoint1Pos); + Swap(p1BlockMin, p1BlockMax); + } + + uint indices[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + for (i = 0; i < 16; ++i) + { + float p0TexelPos = f32tof16(dot(texels[i], p0BlockDir)); + float p1TexelPos = f32tof16(dot(texels[i], p1BlockDir)); + uint p0Index = ComputeIndex3(p0TexelPos, p0Endpoint0Pos, p0Endpoint1Pos); + uint p1Index = ComputeIndex3(p1TexelPos, p1Endpoint0Pos, p1Endpoint1Pos); + + uint paletteID = Pattern(pattern, i); + indices[i] = paletteID == 0 ? p0Index : p1Index; + } + + float3 endpoint760 = floor(Quantize7(p0BlockMin)); + float3 endpoint761 = floor(Quantize7(p0BlockMax)); + float3 endpoint762 = floor(Quantize7(p1BlockMin)); + float3 endpoint763 = floor(Quantize7(p1BlockMax)); + + float3 endpoint950 = floor(Quantize9(p0BlockMin)); + float3 endpoint951 = floor(Quantize9(p0BlockMax)); + float3 endpoint952 = floor(Quantize9(p1BlockMin)); + float3 endpoint953 = floor(Quantize9(p1BlockMax)); + + endpoint761 = endpoint761 - endpoint760; + endpoint762 = endpoint762 - endpoint760; + endpoint763 = endpoint763 - endpoint760; + + endpoint951 = endpoint951 - endpoint950; + endpoint952 = endpoint952 - endpoint950; + endpoint953 = endpoint953 - endpoint950; + + int maxVal76 = 0x1F; + endpoint761 = clamp(endpoint761, -maxVal76, maxVal76); + endpoint762 = clamp(endpoint762, -maxVal76, maxVal76); + endpoint763 = clamp(endpoint763, -maxVal76, maxVal76); + + int maxVal95 = 0xF; + endpoint951 = clamp(endpoint951, -maxVal95, maxVal95); + endpoint952 = clamp(endpoint952, -maxVal95, maxVal95); + endpoint953 = clamp(endpoint953, -maxVal95, maxVal95); + + float3 endpoint760Unq = Unquantize7(endpoint760); + float3 endpoint761Unq = 
Unquantize7(endpoint760 + endpoint761); + float3 endpoint762Unq = Unquantize7(endpoint760 + endpoint762); + float3 endpoint763Unq = Unquantize7(endpoint760 + endpoint763); + float3 endpoint950Unq = Unquantize9(endpoint950); + float3 endpoint951Unq = Unquantize9(endpoint950 + endpoint951); + float3 endpoint952Unq = Unquantize9(endpoint950 + endpoint952); + float3 endpoint953Unq = Unquantize9(endpoint950 + endpoint953); + + float msle76 = 0.0f; + float msle95 = 0.0f; + for (i = 0; i < 16; ++i) + { + uint paletteID = Pattern(pattern, i); + + float3 tmp760Unq = paletteID == 0 ? endpoint760Unq : endpoint762Unq; + float3 tmp761Unq = paletteID == 0 ? endpoint761Unq : endpoint763Unq; + float3 tmp950Unq = paletteID == 0 ? endpoint950Unq : endpoint952Unq; + float3 tmp951Unq = paletteID == 0 ? endpoint951Unq : endpoint953Unq; + + float weight = floor((indices[i] * 64.0f) / 7.0f + 0.5f); + float3 texelUnc76 = FinishUnquantize(tmp760Unq, tmp761Unq, weight); + float3 texelUnc95 = FinishUnquantize(tmp950Unq, tmp951Unq, weight); + + msle76 += CalcMSLE(texels[i], texelUnc76); + msle95 += CalcMSLE(texels[i], texelUnc95); + } + + SignExtend(endpoint761, 0x1F, 0x20); + SignExtend(endpoint762, 0x1F, 0x20); + SignExtend(endpoint763, 0x1F, 0x20); + + SignExtend(endpoint951, 0xF, 0x10); + SignExtend(endpoint952, 0xF, 0x10); + SignExtend(endpoint953, 0xF, 0x10); + + // encode block + float p2MSLE = min(msle76, msle95); + if (p2MSLE < blockMSLE) + { + blockMSLE = p2MSLE; + block = uint4(0, 0, 0, 0); + + if (p2MSLE == msle76) + { + // 7.6 + block.x = 0x1; + block.x |= ((uint)endpoint762.y & 0x20) >> 3; + block.x |= ((uint)endpoint763.y & 0x10) >> 1; + block.x |= ((uint)endpoint763.y & 0x20) >> 1; + block.x |= (uint)endpoint760.x << 5; + block.x |= ((uint)endpoint763.z & 0x01) << 12; + block.x |= ((uint)endpoint763.z & 0x02) << 12; + block.x |= ((uint)endpoint762.z & 0x10) << 10; + block.x |= (uint)endpoint760.y << 15; + block.x |= ((uint)endpoint762.z & 0x20) << 17; + block.x |= 
((uint)endpoint763.z & 0x04) << 21; + block.x |= ((uint)endpoint762.y & 0x10) << 20; + block.x |= (uint)endpoint760.z << 25; + block.y |= ((uint)endpoint763.z & 0x08) >> 3; + block.y |= ((uint)endpoint763.z & 0x20) >> 4; + block.y |= ((uint)endpoint763.z & 0x10) >> 2; + block.y |= (uint)endpoint761.x << 3; + block.y |= ((uint)endpoint762.y & 0x0F) << 9; + block.y |= (uint)endpoint761.y << 13; + block.y |= ((uint)endpoint763.y & 0x0F) << 19; + block.y |= (uint)endpoint761.z << 23; + block.y |= ((uint)endpoint762.z & 0x07) << 29; + block.z |= ((uint)endpoint762.z & 0x08) >> 3; + block.z |= (uint)endpoint762.x << 1; + block.z |= (uint)endpoint763.x << 7; + } + else + { + // 9.5 + block.x = 0xE; + block.x |= (uint)endpoint950.x << 5; + block.x |= ((uint)endpoint952.z & 0x10) << 10; + block.x |= (uint)endpoint950.y << 15; + block.x |= ((uint)endpoint952.y & 0x10) << 20; + block.x |= (uint)endpoint950.z << 25; + block.y |= (uint)endpoint950.z >> 7; + block.y |= ((uint)endpoint953.z & 0x10) >> 2; + block.y |= (uint)endpoint951.x << 3; + block.y |= ((uint)endpoint953.y & 0x10) << 4; + block.y |= ((uint)endpoint952.y & 0x0F) << 9; + block.y |= (uint)endpoint951.y << 13; + block.y |= ((uint)endpoint953.z & 0x01) << 18; + block.y |= ((uint)endpoint953.y & 0x0F) << 19; + block.y |= (uint)endpoint951.z << 23; + block.y |= ((uint)endpoint953.z & 0x02) << 27; + block.y |= (uint)endpoint952.z << 29; + block.z |= ((uint)endpoint952.z & 0x08) >> 3; + block.z |= (uint)endpoint952.x << 1; + block.z |= ((uint)endpoint953.z & 0x04) << 4; + block.z |= (uint)endpoint953.x << 7; + block.z |= ((uint)endpoint953.z & 0x08) << 9; + } + + block.z |= pattern << 13; + uint blockFixupID = PatternFixupID(pattern); + if (blockFixupID == 15) + { + block.z |= indices[0] << 18; + block.z |= indices[1] << 20; + block.z |= indices[2] << 23; + block.z |= indices[3] << 26; + block.z |= indices[4] << 29; + block.w |= indices[5] << 0; + block.w |= indices[6] << 3; + block.w |= indices[7] << 6; + block.w |= 
indices[8] << 9; + block.w |= indices[9] << 12; + block.w |= indices[10] << 15; + block.w |= indices[11] << 18; + block.w |= indices[12] << 21; + block.w |= indices[13] << 24; + block.w |= indices[14] << 27; + block.w |= indices[15] << 30; + } + else if (blockFixupID == 2) + { + block.z |= indices[0] << 18; + block.z |= indices[1] << 20; + block.z |= indices[2] << 23; + block.z |= indices[3] << 25; + block.z |= indices[4] << 28; + block.z |= indices[5] << 31; + block.w |= indices[5] >> 1; + block.w |= indices[6] << 2; + block.w |= indices[7] << 5; + block.w |= indices[8] << 8; + block.w |= indices[9] << 11; + block.w |= indices[10] << 14; + block.w |= indices[11] << 17; + block.w |= indices[12] << 20; + block.w |= indices[13] << 23; + block.w |= indices[14] << 26; + block.w |= indices[15] << 29; + } + else + { + block.z |= indices[0] << 18; + block.z |= indices[1] << 20; + block.z |= indices[2] << 23; + block.z |= indices[3] << 26; + block.z |= indices[4] << 29; + block.w |= indices[5] << 0; + block.w |= indices[6] << 3; + block.w |= indices[7] << 6; + block.w |= indices[8] << 9; + block.w |= indices[9] << 11; + block.w |= indices[10] << 14; + block.w |= indices[11] << 17; + block.w |= indices[12] << 20; + block.w |= indices[13] << 23; + block.w |= indices[14] << 26; + block.w |= indices[15] << 29; + } + } + } + + uint4 pack_bc6h_block(float4 texels[16]) + { + // gather texels for current 4x4 block + // 0 1 2 3 + // 4 5 6 7 + // 8 9 10 11 + // 12 13 14 15 + + float3 rgb_texels[16]; +#define COPY_ITERATION(n) rgb_texels[n] = texels[n].rgb; + COPY_ITERATION(0); + COPY_ITERATION(1); + COPY_ITERATION(2); + COPY_ITERATION(3); + COPY_ITERATION(4); + COPY_ITERATION(5); + COPY_ITERATION(6); + COPY_ITERATION(7); + COPY_ITERATION(8); + COPY_ITERATION(9); + COPY_ITERATION(10); + COPY_ITERATION(11); + COPY_ITERATION(12); + COPY_ITERATION(13); + COPY_ITERATION(14); + COPY_ITERATION(15); +#undef COPY_ITERATION + + uint4 block = uint4(0, 0, 0, 0); + float blockMSLE = 0.0f; + + 
EncodeP1(block, blockMSLE, rgb_texels); +##if bc6h_compression_mode == bc6h_compression_quality && !hardware.vulkan//HLSLcc or Vulkan generates shit + for (uint i = 0; i < 32; ++i) + { + EncodeP2Pattern(block, blockMSLE, i, rgb_texels); + } +##endif + + return block; + } +} +endmacro diff --git a/prog/gameLibs/render/shaders/bc_compression.sh b/prog/gameLibs/render/shaders/bc_compression.sh deleted file mode 100644 index 4ee89478f..000000000 --- a/prog/gameLibs/render/shaders/bc_compression.sh +++ /dev/null @@ -1,128 +0,0 @@ -include "land_block_inc.sh" -include "bc_compression_inc.sh" -include "bc6h_compression.sh" - -texture src_tex; -float src_mip = 0; -float dst_mip = 0; - -int src_face = -1; -interval src_face : src_single_face < 0, src_cube; - - -shader bc1_srgbwrite_compressor, bc1_compressor, bc3_compressor, bc3_srgbwrite_compressor, bc4_compressor, bc5_compressor, bc6h_compressor -{ - supports none; - supports global_frame; - supports land_mesh_prepare_clipmap; - - hlsl - { - #define BC_COMPRESSION_USE_MIPS - } - - (vs) { src_dst_mip__src_face@f3 = (src_mip, dst_mip, src_face, 0); } - - if (src_face != src_single_face) - { - (ps) { src_tex@smpCube = src_tex; } - hlsl { - #define BC_COMPRESSION_FOR_CUBE 1 - } - } - else - { - (ps) { src_tex@smp2d = src_tex; } - } - - INIT_BC_COMPRESSION() - - if ( shader == bc1_compressor || shader == bc1_srgbwrite_compressor) - { - USE_BC1_COMPRESSION(ps) - // USE_R5G6B5_2BPP_COMPRESSION_DEBUG() - } - - if ( shader == bc3_compressor || shader == bc3_srgbwrite_compressor ) - { - USE_BC3_COMPRESSION(ps) - } - - if ( shader == bc4_compressor ) - { - USE_BC4_COMPRESSION(ps) - } - - if ( shader == bc5_compressor ) - { - USE_BC5_COMPRESSION(ps) - } - - if ( shader == bc6h_compressor ) - { - USE_BC6H_COMPRESSION(ps) - } - - if ( shader == bc1_compressor || shader == bc4_compressor) - { - PS4_DEF_TARGET_FMT_32_GR() - } - else - { - PS4_DEF_TARGET_FMT_32_ABGR() - } - - hlsl(ps) - { -##if shader == bc3_srgbwrite_compressor || shader 
== bc1_srgbwrite_compressor - #include -##endif - - uint4 bc_compressor_ps( VsOutput input ) : SV_Target - { - half4 texels[16]; - half4 min_color, max_color; - -#ifdef BC_COMPRESSION_FOR_CUBE - get_texels( input.tex, texels, input.src_mip, input.src_face ); -#else - get_texels( input.tex, texels, input.src_mip ); -#endif - -##if shader == bc3_srgbwrite_compressor || shader == bc1_srgbwrite_compressor - for ( int i = 0; i < 16; ++i ) - texels[i].rgb = accurateLinearToSRGB( texels[i].rgb ); -##endif - -##if shader == bc6h_compressor - //DXGI_FORMAT_BC6H_UF16 fmt is used, so limit negative values - for ( int i = 0; i < 16; ++i ) - texels[i].rgb = max( 0., texels[i].rgb ); - return pack_bc6h_block(texels); -##endif - find_base_colors( texels, min_color, max_color ); - -##if shader == bc1_compressor || shader == bc1_srgbwrite_compressor - refine_rgb_base_colors( texels, min_color, max_color ); - return pack_bc1_block( texels, min_color, max_color ); - // return pack_r5g6b5_2bpp_psnr_block( texels, min_color, max_color ); -##endif - -##if shader == bc3_compressor || shader == bc3_srgbwrite_compressor - refine_rgb_base_colors( texels, min_color, max_color ); - return pack_bc3_block( texels, min_color, max_color ); -##endif - -##if shader == bc4_compressor - return pack_bc4_block( texels, min_color, max_color ); -##endif - -##if shader == bc5_compressor - return pack_bc5_block( texels, min_color, max_color ); -##endif - } - } - - compile( "target_vs", "bc_compressor_vs" ); - compile( "target_ps", "bc_compressor_ps" ); -} diff --git a/prog/gameLibs/render/shaders/bc_compression_inc.sh b/prog/gameLibs/render/shaders/bc_compression_inc.sh index cd4f1bcb9..2de8fa34c 100644 --- a/prog/gameLibs/render/shaders/bc_compression_inc.sh +++ b/prog/gameLibs/render/shaders/bc_compression_inc.sh @@ -445,3 +445,68 @@ macro USE_BC5_COMPRESSION(code) } endmacro // USE_BC5_COMPRESSION + +macro COMMON_BC_SHADER(rg_channels, srgb) + supports none; + supports global_frame; + + hlsl + { + 
#define BC_COMPRESSION_USE_MIPS + } + + (vs) { src_dst_mip__src_face@f3 = (src_mip, dst_mip, src_face, 0); } + + if (src_face != src_single_face) + { + (ps) { src_tex@smpCube = src_tex; } + hlsl { + #define BC_COMPRESSION_FOR_CUBE 1 + } + } + else + { + (ps) { src_tex@smp2d = src_tex; } + } + + INIT_BC_COMPRESSION() + + + if (rg_channels) + { + PS4_DEF_TARGET_FMT_32_GR() + } + else + { + PS4_DEF_TARGET_FMT_32_ABGR() + } + + hlsl(ps) + { +##if srgb + #include +##endif + + uint4 bc_compressor_ps( VsOutput input ) : SV_Target + { + half4 texels[16]; + half4 min_color, max_color; + +#ifdef BC_COMPRESSION_FOR_CUBE + get_texels( input.tex, texels, input.src_mip, input.src_face ); +#else + get_texels( input.tex, texels, input.src_mip ); +#endif + +##if srgb + for ( int i = 0; i < 16; ++i ) + texels[i].rgb = accurateLinearToSRGB( texels[i].rgb ); +##endif + + return compress( texels, min_color, max_color ); + } + } + + compile( "target_vs", "bc_compressor_vs" ); + compile( "target_ps", "bc_compressor_ps" ); +endmacro diff --git a/prog/gameLibs/render/shaders/contactShadows.hlsl b/prog/gameLibs/render/shaders/contactShadows.hlsl index c7b480d20..5e02d513b 100644 --- a/prog/gameLibs/render/shaders/contactShadows.hlsl +++ b/prog/gameLibs/render/shaders/contactShadows.hlsl @@ -54,11 +54,19 @@ half contactShadowRayCastWithScale( { float3 sampleUVz = rayStartUVz + rayStepUVz * sampleT; #if USE_LINEAR_THRESHOLD - float sampleDepth = linearize_z(tex2Dlod( depth_gbuf, float4(sampleUVz.xy, 0,0) ).r, zn_zfar.zw); + #if FSR_DISTORTION + float sampleDepth = linearize_z(tex2Dlod(depth_gbuf, float4(linearToDistortedTc(sampleUVz.xy), 0, 0)).r, zn_zfar.zw); + #else + float sampleDepth = linearize_z(tex2Dlod(depth_gbuf, float4(sampleUVz.xy, 0, 0)).r, zn_zfar.zw); + #endif float depthDiff = sampleUVz.z - sampleDepth - 0.02 * linearDepth * TraceBias; bool hasHit = (depthDiff > 0.0 && depthDiff < zThickness); #else - float sampleDepth = tex2Dlod( depth_gbuf, float4(sampleUVz.xy, 0,0) ).r; + 
#if FSR_DISTORTION + float sampleDepth = tex2Dlod(depth_gbuf, float4(linearToDistortedTc(sampleUVz.xy), 0, 0)).r; + #else + float sampleDepth = tex2Dlod(depth_gbuf, float4(sampleUVz.xy, 0, 0)).r; + #endif float depthDiff = sampleUVz.z - sampleDepth; bool hasHit = abs( depthDiff + compareTolerance ) < compareTolerance; #endif diff --git a/prog/gameLibs/render/shaders/debugGbuffer.sh b/prog/gameLibs/render/shaders/debugGbuffer.sh index 9384c2d40..baec4f558 100644 --- a/prog/gameLibs/render/shaders/debugGbuffer.sh +++ b/prog/gameLibs/render/shaders/debugGbuffer.sh @@ -266,7 +266,7 @@ shader debug_final_gbuffer float NoL = dot(gbuffer.normal, lightDir); float3 result = diffuse * lerp(max(NoL, 0), 1, LOD_AMBIENT_WEIGHT); - return half4(result,1); + return half4(result, 1); ##elif show_gbuffer == baseColor return half4(accurateLinearToSRGB(gbuf.albedo),1); ##elif show_gbuffer == diffuseColor diff --git a/prog/gameLibs/render/shaders/decals/decals.sh b/prog/gameLibs/render/shaders/decals/decals.sh index 115188c06..76a119132 100644 --- a/prog/gameLibs/render/shaders/decals/decals.sh +++ b/prog/gameLibs/render/shaders/decals/decals.sh @@ -46,8 +46,8 @@ macro USE_PLANAR_DECALS(start_params_no) float4 n0 = DECAL_GET_DYNREND_PARAM(norm_idx + 0, params); float4 n1 = DECAL_GET_DYNREND_PARAM(norm_idx + 1, params); - float influence0 = dot(model_normal, n0.xyz); - float influence1 = dot(model_normal, n1.xyz); + float influence0 = pow(dot(model_normal, n0.xyz), 3.); + float influence1 = pow(dot(model_normal, n1.xyz), 3.); FLATTEN if (dec_mul < 0 && influence1 > influence0) { @@ -69,6 +69,8 @@ macro USE_PLANAR_DECALS(start_params_no) dot(modelPos4, DECAL_GET_DYNREND_PARAM(line_idx + 0, params)), dot(modelPos4, DECAL_GET_DYNREND_PARAM(line_idx + 1, params))); } +#define INFLUENCE_MULT 8.0 + decals_uv_depth_influence.w = saturate(decals_uv_depth_influence.w * INFLUENCE_MULT); //increase influence back, to get full alpha on fading part } void apply_planar_decals_vs( diff --git 
a/prog/gameLibs/render/shaders/dof/dof.sh b/prog/gameLibs/render/shaders/dof/dof.sh index a4c11f68d..67d90ed28 100644 --- a/prog/gameLibs/render/shaders/dof/dof.sh +++ b/prog/gameLibs/render/shaders/dof/dof.sh @@ -221,10 +221,20 @@ shader dof_downsample ); float4 vCocNear4, vCocFar4; Compute4CircleOfConfusion(depths, vCocNear4, vCocFar4, dof_focus_params); - float cocNear = max(max(vCocNear4.x, vCocNear4.y), max(vCocNear4.z, vCocNear4.w)); + float cocNear = dot(vCocNear4, 0.25); float maxCocFar = max(max(vCocFar4.x, vCocFar4.y), max(vCocFar4.z, vCocFar4.w)); float minCocFar = min(min(vCocFar4.x, vCocFar4.y), min(vCocFar4.z, vCocFar4.w)); float depth = dot(depths, 0.25); + ##if dof_coc_history != NULL && simplified_dof == off + float2 tc = IN.tc.xy + getCameraMotion(depth, IN.tc.xy); + float cocNearHistory = dof_coc_history.SampleLevel(dof_coc_history_samplerstate, tc, 0).x; + float nearDepth = max(max(depths.x, depths.y), max(depths.z, depths.w)); + float nearDepthMin = min(min(depths.x, depths.y), min(depths.z, depths.w)); + float cocNearMin = ComputeNearCircleOfConfusion(nearDepthMin, dof_focus_params); + float cocNearMax = ComputeNearCircleOfConfusion(nearDepth, dof_focus_params); + cocNearHistory = clamp(cocNearHistory, cocNearMin * 0.75, cocNearMax * 1.25); + cocNear = lerp(saturate(cocNear), cocNearHistory, 0.9); + ##endif ##endif diff --git a/prog/gameLibs/render/shaders/flow_map.sh b/prog/gameLibs/render/shaders/flow_map.sh index eb0b36f45..ba81cfb4c 100644 --- a/prog/gameLibs/render/shaders/flow_map.sh +++ b/prog/gameLibs/render/shaders/flow_map.sh @@ -25,6 +25,26 @@ float flowmap_damping = 0.5; int flowmap_height = 0; interval flowmap_height : depth_above < 1, heightmap; +shader copy_flowmap_texture +{ + cull_mode=none; + z_test=false; + z_write=false; + + POSTFX_VS(1) + + (ps) { flowmap_temp_tex@tex = flowmap_temp_tex hlsl { Texture2D flowmap_temp_tex@tex; } } + + hlsl(ps) { + float copy_ps(VsOutput input) : SV_Target0 + { + return 
flowmap_temp_tex[input.pos.xy].r; + } + } + + compile("target_ps", "copy_ps"); +} + shader water_flowmap { cull_mode=none; @@ -150,10 +170,13 @@ shader water_flowmap if (any(heightNeighbours > waterHeight)) { float2 heightGradient = float2(heightNeighbours.w - heightNeighbours.z, heightNeighbours.x - heightNeighbours.y); - heightGradient = normalize(heightGradient); - float2 floodfillVec = tex2Dlod(flowmap_floodfill_tex, float4(htc,0,0)).rg * 2 - 1; - heightGradient *= heightGradient.x * floodfillVec.x + heightGradient.y * floodfillVec.y; - f.xy += heightGradient; + if (length(heightGradient) > 0.001) + { + heightGradient = normalize(heightGradient); + float2 floodfillVec = tex2Dlod(flowmap_floodfill_tex, float4(htc,0,0)).rg * 2 - 1; + heightGradient *= heightGradient.x * floodfillVec.x + heightGradient.y * floodfillVec.y; + f.xy += heightGradient; + } } ##endif diff --git a/prog/gameLibs/render/shaders/gpu_occlusion.sh b/prog/gameLibs/render/shaders/gpu_occlusion.sh index 82371fa76..c705c1961 100644 --- a/prog/gameLibs/render/shaders/gpu_occlusion.sh +++ b/prog/gameLibs/render/shaders/gpu_occlusion.sh @@ -50,7 +50,7 @@ macro BASE_GPU_OCCLUSION(code) //sbox.xyXY in screen TC //return closeset raw depth - float check_box_occl_visible_tc_base(float4 sbox, out float lod) + float check_box_occl_visible_tc_base(float4 sbox) { uint2 dim; ##if separate_depth_mips == no @@ -74,8 +74,6 @@ macro BASE_GPU_OCCLUSION(code) if (dims.x <= OCCLUSION_RECT_SZ && dims.y <= OCCLUSION_RECT_SZ) level = level_lower; } - lod = level; - //minTc = (floor(sbox_vp.xy/exp2(level))+0.5)/(dim/exp2(level)); @@ -129,7 +127,7 @@ macro GPU_OCCLUSION(code) return VISIBLE; } - bool check_box_occl_visible_base(float3 minb, float3 maxb, out float4 sbox, out float lod, out float2 minMaxRawDepth) + bool check_box_occl_visible_base(float3 minb, float3 maxb, out float4 sbox, out float2 minMaxRawDepth) { //todo: we can speed it up float4 screenPos[8]; @@ -161,15 +159,15 @@ macro GPU_OCCLUSION(code) 
minMaxRawDepth.y = maxScreen.z; sbox = saturate(minMaxTc.xwzy); - if (check_box_occl_visible_tc_base(sbox, lod) > maxScreen.z) + if (check_box_occl_visible_tc_base(sbox) > maxScreen.z) return false; return true; } bool check_box_occl_visible(float3 minb, float3 maxb) { - float4 sbox; float lod; float2 minMaxRawDepth; + float4 sbox; float2 minMaxRawDepth; minMaxRawDepth = 0; - return check_box_occl_visible_base(minb, maxb, sbox, lod, minMaxRawDepth); + return check_box_occl_visible_base(minb, maxb, sbox, minMaxRawDepth); } } endmacro diff --git a/prog/gameLibs/render/shaders/hardware_defines.sh b/prog/gameLibs/render/shaders/hardware_defines.sh index 0f25eedab..737a16329 100644 --- a/prog/gameLibs/render/shaders/hardware_defines.sh +++ b/prog/gameLibs/render/shaders/hardware_defines.sh @@ -252,13 +252,16 @@ hlsl(vs) { #define CLIPPLANE(v) ##endif -##if hardware.vulkan || hardware.metal +##if hardware.vulkan //vulkan compiler uses internal definitions for HW_VERTEX_ID, based on internal compiler used #if SHADER_COMPILER_DXC #define USE_VERTEX_ID_WITHOUT_BASE_OFFSET(input_struct) input_struct.vertexId -= input_struct.baseVertexId; #else #define USE_VERTEX_ID_WITHOUT_BASE_OFFSET(input_struct) #endif +##elif hardware.metal + // we cannot use BASE_VERTEX on some Apple devies (old A8 iPads) + #define USE_VERTEX_ID_WITHOUT_BASE_OFFSET(input_struct) ##else #define HW_VERTEX_ID uint vertexId: SV_VertexID; #define HW_BASE_VERTEX_ID error! 
not supported on this compiler/API @@ -411,6 +414,13 @@ hlsl(ps) { return tex.Sample(smp, uv); } + float4 tex2DBindless(TextureArraySampler a, float3 uv) + { + Texture2DArray tex = a.tex; + SamplerState smp = a.smp; + return tex.Sample(smp, uv); + } + float4 tex2DBindlessSampler(TextureSampler a, TextureSampler b, float2 uv) { Texture2D tex = a.tex; @@ -823,7 +833,7 @@ hlsl { ##else #if HALF_PRECISION return min(val, 65504.h); // clamp to half max - #elif SHADER_COMPILER_HLSL2021 + #elif __HLSL_VERSION >= 2021 return select(isfinite(dot(val, val)).xxx, val, half3(0, 0, 0)); #else return isfinite(dot(val, val)).xxx ? val : half3(0, 0, 0); diff --git a/prog/gameLibs/render/shaders/light_probe.sh b/prog/gameLibs/render/shaders/light_probe.sh index 225cba147..16cc37353 100644 --- a/prog/gameLibs/render/shaders/light_probe.sh +++ b/prog/gameLibs/render/shaders/light_probe.sh @@ -129,136 +129,3 @@ shader specular_cube compile("target_vs", "light_probe_vs"); compile("target_ps", "light_probe_ps"); } - -texture dynamic_cube_tex_1; -texture dynamic_cube_tex_2; -float dynamic_cube_tex_level = 0; -float dynamic_cube_tex_blend = 0; -int blend_faces = 0; -interval blend_faces : first<1, second; - -shader blend_light_probes -{ - SUPPORT_GLOBAL_FRAME() - - cull_mode = none; - z_test = false; - z_write = false; - no_ablend; - - (ps) { - dynamic_cube_tex_1@smpCube = dynamic_cube_tex_1; - dynamic_cube_tex_2@smpCube = dynamic_cube_tex_2; - dynamic_cube_tex_blend_level@f2 = (dynamic_cube_tex_blend, dynamic_cube_tex_level,0,0); - } - USE_HDR_SH() - USE_POSTFX_VERTEX_POSITIONS() - - hlsl { - struct VsOutput - { - VS_OUT_POSITION(pos) - float2 tc : TEXCOORD0; - }; - } - - - hlsl(vs) { - VsOutput blend_cubes_vs(uint vertex_id : SV_VertexID) - { - VsOutput output; - float2 pos = getPostfxVertexPositionById(vertex_id); - output.pos = float4(pos.x, pos.y, 1, 1); - output.tc = pos.xy; - return output; - } - } - - - hlsl(ps) { - struct MRT_OUTPUT - { - half4 color0:SV_Target0; - half4 
color1:SV_Target1; - half4 color2:SV_Target2; - #if !JUST_THREE_FACES - half4 color3:SV_Target3; - half4 color4:SV_Target4; - half4 color5:SV_Target5; - #define FACES - #endif - }; - - MRT_OUTPUT blend_cubes_ps(VsOutput input) - { - half4 outColor[6]; - #define BLEND_FACE(faceNo, target)\ - {\ - float3 tc = GetCubemapVector(input.tc, faceNo);\ - fixed4 cube1 = texCUBElod(dynamic_cube_tex_1, float4(tc, dynamic_cube_tex_blend_level.y));\ - fixed4 cube2 = texCUBElod(dynamic_cube_tex_2, float4(tc, dynamic_cube_tex_blend_level.y));\ - target.rgb = lerp(cube1.rgb, cube2.rgb, dynamic_cube_tex_blend_level.x);\ - target.a = 1;\ - } - - MRT_OUTPUT res; - #if !JUST_THREE_FACES - BLEND_FACE(0, res.color0); - BLEND_FACE(1, res.color1); - BLEND_FACE(2, res.color2); - BLEND_FACE(3, res.color3); - BLEND_FACE(4, res.color4); - BLEND_FACE(5, res.color5); - #else - ##if blend_faces == first - BLEND_FACE(0, res.color0); - BLEND_FACE(1, res.color1); - BLEND_FACE(2, res.color2); - ##else - BLEND_FACE(3, res.color0); - BLEND_FACE(4, res.color1); - BLEND_FACE(5, res.color2); - ##endif - #endif - return res; - } - } - compile("target_vs", "blend_cubes_vs"); - compile("target_ps", "blend_cubes_ps"); -} - -texture tex; -float4x4 probetm; -int blend_face_no = 0; - -shader blend_light_probe_face -{ - SUPPORT_GLOBAL_FRAME() - - cull_mode = none; - z_test = false; - z_write = false; - no_ablend; - - (ps) { - dynamic_cube_tex_1@smpCube = dynamic_cube_tex_1; - tex@smp2d = tex; - probetm@f44 = probetm; - blend_face_no@f1 = (blend_face_no); - } - - POSTFX_VS_TEXCOORD(1, texcoord) - - hlsl(ps) { - half4 blend_cube_face_ps(VsOutput input) : SV_Target - { - float3 tc = GetCubemapVector(input.texcoord, int(blend_face_no+0.01)); - tc = mul(float4(tc, 0), probetm).xyz; - half3 cube = texCUBElod(dynamic_cube_tex_1, float4(tc, 0)).rgb; - half4 face = tex2D(tex, input.texcoord); - return half4(lerp(cube, face.rgb, face.a), 1); - } - } - - compile("target_ps", "blend_cube_face_ps"); -} diff --git 
a/prog/gameLibs/render/shaders/light_probe_blend.sh b/prog/gameLibs/render/shaders/light_probe_blend.sh new file mode 100644 index 000000000..d56de7632 --- /dev/null +++ b/prog/gameLibs/render/shaders/light_probe_blend.sh @@ -0,0 +1,138 @@ +include "shader_global.sh" + +texture dynamic_cube_tex_1; +texture dynamic_cube_tex_2; +float dynamic_cube_tex_level = 0; +float dynamic_cube_tex_blend = 0; +int blend_faces = 0; +interval blend_faces : first<1, second; + +hlsl { + #include +} + +shader blend_light_probes +{ + SUPPORT_GLOBAL_FRAME() + + cull_mode = none; + z_test = false; + z_write = false; + no_ablend; + + (ps) { + dynamic_cube_tex_1@smpCube = dynamic_cube_tex_1; + dynamic_cube_tex_2@smpCube = dynamic_cube_tex_2; + dynamic_cube_tex_blend_level@f2 = (dynamic_cube_tex_blend, dynamic_cube_tex_level,0,0); + } + USE_HDR_SH() + USE_POSTFX_VERTEX_POSITIONS() + + hlsl { + struct VsOutput + { + VS_OUT_POSITION(pos) + float2 tc : TEXCOORD0; + }; + } + + + hlsl(vs) { + VsOutput blend_cubes_vs(uint vertex_id : SV_VertexID) + { + VsOutput output; + float2 pos = getPostfxVertexPositionById(vertex_id); + output.pos = float4(pos.x, pos.y, 1, 1); + output.tc = pos.xy; + return output; + } + } + + + hlsl(ps) { + struct MRT_OUTPUT + { + half4 color0:SV_Target0; + half4 color1:SV_Target1; + half4 color2:SV_Target2; + #if !JUST_THREE_FACES + half4 color3:SV_Target3; + half4 color4:SV_Target4; + half4 color5:SV_Target5; + #define FACES + #endif + }; + + MRT_OUTPUT blend_cubes_ps(VsOutput input) + { + half4 outColor[6]; + #define BLEND_FACE(faceNo, target)\ + {\ + float3 tc = GetCubemapVector(input.tc, faceNo);\ + fixed4 cube1 = texCUBElod(dynamic_cube_tex_1, float4(tc, dynamic_cube_tex_blend_level.y));\ + fixed4 cube2 = texCUBElod(dynamic_cube_tex_2, float4(tc, dynamic_cube_tex_blend_level.y));\ + target.rgb = lerp(cube1.rgb, cube2.rgb, dynamic_cube_tex_blend_level.x);\ + target.a = 1;\ + } + + MRT_OUTPUT res; + #if !JUST_THREE_FACES + BLEND_FACE(0, res.color0); + BLEND_FACE(1, 
res.color1); + BLEND_FACE(2, res.color2); + BLEND_FACE(3, res.color3); + BLEND_FACE(4, res.color4); + BLEND_FACE(5, res.color5); + #else + ##if blend_faces == first + BLEND_FACE(0, res.color0); + BLEND_FACE(1, res.color1); + BLEND_FACE(2, res.color2); + ##else + BLEND_FACE(3, res.color0); + BLEND_FACE(4, res.color1); + BLEND_FACE(5, res.color2); + ##endif + #endif + return res; + } + } + compile("target_vs", "blend_cubes_vs"); + compile("target_ps", "blend_cubes_ps"); +} + +texture tex; +float4x4 probetm; +int blend_face_no = 0; + +shader blend_light_probe_face +{ + SUPPORT_GLOBAL_FRAME() + + cull_mode = none; + z_test = false; + z_write = false; + no_ablend; + + (ps) { + dynamic_cube_tex_1@smpCube = dynamic_cube_tex_1; + tex@smp2d = tex; + probetm@f44 = probetm; + blend_face_no@f1 = (blend_face_no); + } + + POSTFX_VS_TEXCOORD(1, texcoord) + + hlsl(ps) { + half4 blend_cube_face_ps(VsOutput input) : SV_Target + { + float3 tc = GetCubemapVector(input.texcoord, int(blend_face_no+0.01)); + tc = mul(float4(tc, 0), probetm).xyz; + half3 cube = texCUBElod(dynamic_cube_tex_1, float4(tc, 0)).rgb; + half4 face = tex2D(tex, input.texcoord); + return half4(lerp(cube, face.rgb, face.a), 1); + } + } + + compile("target_ps", "blend_cube_face_ps"); +} diff --git a/prog/gameLibs/render/shaders/loading/gradients.glsl b/prog/gameLibs/render/shaders/loading/gradients.glsl new file mode 100644 index 000000000..a1636e615 --- /dev/null +++ b/prog/gameLibs/render/shaders/loading/gradients.glsl @@ -0,0 +1,19 @@ +void mainImage( out vec4 fragColor, in vec2 fragCoord ) +{ + float t = iTime * 0.2; + float param = exp(cos(t)) - 2.0 * cos(4.0 * t) - pow5(sin(t / 12.0)); + vec2 uv = fragCoord/iResolution.xy - 0.5 + vec2(sin(t), cos(t)) * param * 0.3; + + vec3 colors[4] = { + vec3(0x60, 0x6c, 0x38) / 255.0, + vec3(0x28, 0x36, 0x18) / 255.0, + vec3(0xdd, 0xa1, 0x5e) / 255.0, + vec3(0xbc, 0x6c, 0x25) / 255.0 + }; + vec3 col = mix( + mix(colors[0], colors[1], uv.x), + mix(colors[2], colors[3], 
uv.x), + uv.y); + + fragColor = vec4(col,1.0); +} diff --git a/prog/gameLibs/render/shaders/rendinst_impostor_inc.sh b/prog/gameLibs/render/shaders/rendinst_impostor_inc.sh index 8e0eba74c..ee57ded36 100644 --- a/prog/gameLibs/render/shaders/rendinst_impostor_inc.sh +++ b/prog/gameLibs/render/shaders/rendinst_impostor_inc.sh @@ -12,11 +12,13 @@ macro INIT_BAKED_IMPOSTOR_STATIC_TEX() static texture impostor_albedo_alpha = 0; static texture impostor_normal_translucency = 0; static texture impostor_ao_smoothness = 0; + static texture impostor_preshadow = 0; (ps) { impostor_albedo_alpha@static = impostor_albedo_alpha; impostor_normal_translucency@static = impostor_normal_translucency; impostor_ao_smoothness@static = impostor_ao_smoothness; + impostor_preshadow@staticTexArray = impostor_preshadow; } endmacro @@ -318,7 +320,7 @@ macro USE_BAKED_IMPOSTORS() #define impostor_tex_slice(tx, tc) \ tex2DBindless(tx, tc) #define impostor_tex_slice_shadow(tx, tc, z) \ - tex3D(tx, float3(tc, z)) + tex2DBindless(tx, float3(tc, z)) #if USE_MULTISLICED @@ -564,11 +566,11 @@ macro INIT_AND_USE_BAKED_IMPOSTOR_PROPERTIES() ##elif baked_impostor_preshadows == preshadow_dynamic BRANCH if (hasPreshadow) - ret.shadowing = (sample_impostor_tex_shadow(preshadow_tex, tc1, tc2, paletteId, ds)).r; + ret.shadowing = (sample_impostor_tex_shadow(get_impostor_preshadow(), tc1, tc2, paletteId, ds)).r; else ret.shadowing = 1; ##else - ret.shadowing = (sample_impostor_tex_shadow(preshadow_tex, tc1, tc2, paletteId, ds)).r; + ret.shadowing = (sample_impostor_tex_shadow(get_impostor_preshadow(), tc1, tc2, paletteId, ds)).r; ##endif ret.depthSq = 0; return ret; diff --git a/prog/gameLibs/render/shaders/terraform/terraform_inc.sh b/prog/gameLibs/render/shaders/terraform/terraform_inc.sh index 61a9d07af..4a861f76a 100644 --- a/prog/gameLibs/render/shaders/terraform/terraform_inc.sh +++ b/prog/gameLibs/render/shaders/terraform/terraform_inc.sh @@ -61,6 +61,7 @@ macro USE_TFORM_COMMON(code) tform_height = 
lerp(0, tform_height, weight); return lerp(0, deltaHt, weight); #else + tform_height = 0; return 0; #endif } diff --git a/prog/gameLibs/render/shaders/water_ssr_common.hlsl b/prog/gameLibs/render/shaders/water_ssr_common.hlsl index e1735c917..d96af404b 100644 --- a/prog/gameLibs/render/shaders/water_ssr_common.hlsl +++ b/prog/gameLibs/render/shaders/water_ssr_common.hlsl @@ -114,6 +114,10 @@ float4 hierarchRayMarch(float3 rayStart_uv_z, float3 R, float linear_roughness, } hitRawDepth = result.z; + + // hack: offset texture 1 pixel to raymarch direction for prevention of "leaking" of close objects to reflections of far objects + result.xy += normalize(rayStepUVz.xy) * ssr_inv_target_size.xy;//rcp(ssr_target_size.xy); + result.z = linearize_z(result.z, zn_zfar.zw); return result; @@ -196,4 +200,4 @@ half4 performSSR(uint2 pixelPos, float2 UV, float linear_roughness, float3 N, fl depth = hit_uv_z_fade.z; return float4(result.rgb*brdf_G_GGX(N, originToPoint, linear_roughness), result.a); -} \ No newline at end of file +} diff --git a/prog/gameLibs/render/shaders/wetness_inc.sh b/prog/gameLibs/render/shaders/wetness_inc.sh index 160713233..943840d25 100644 --- a/prog/gameLibs/render/shaders/wetness_inc.sh +++ b/prog/gameLibs/render/shaders/wetness_inc.sh @@ -335,20 +335,32 @@ if (compatibility_mode == compatibility_mode_off) #endif } - void get_rain_ripples(float4 ripples_tc, float3 world_pos, float3 vertical_norm, float smoothness, float strength, out float3 result_norm, out float ripples_power CUBE_RAIN_DROPLETS_GRAD_SIG) + void get_rain_ripples_impl(float4 ripples_tc, float3 world_pos, float3 vertical_norm, float smoothness, float strength, out float3 result_norm CUBE_RAIN_DROPLETS_GRAD_SIG , out float dropletsFade, out float finalStrength) { half3 dropletsNorm, ripplesNorm; get_rain_normals(float3(ripples_tc.xy, world_pos.y * rain_ripple_params2.y), dropletsNorm, ripplesNorm CUBE_RAIN_DROPLETS_GRAD_ARG); - float dropletsFade = ClampRange(vertical_norm.y, 
-MAX_RAIN_FALL_ANGLE, 0); - float finalStrength = droplets_scale * strength; + dropletsFade = ClampRange(vertical_norm.y, -MAX_RAIN_FALL_ANGLE, 0); + finalStrength = droplets_scale * strength; result_norm = lerp(half3(0, 0, 1), dropletsNorm, dropletsFade); result_norm = lerp(result_norm, ripplesNorm, ripples_tc.w); result_norm = normalize(float3(result_norm.xy * rain_ripple_params2.z * finalStrength, result_norm.z)).xzy; + } + + void get_rain_ripples(float4 ripples_tc, float3 world_pos, float3 vertical_norm, float smoothness, float strength, out float3 result_norm CUBE_RAIN_DROPLETS_GRAD_SIG) + { + float dropletsFade, finalStrength; + get_rain_ripples_impl(ripples_tc, world_pos, vertical_norm, smoothness, strength, result_norm CUBE_RAIN_DROPLETS_GRAD_ARG , dropletsFade, finalStrength); + } + + void get_rain_ripples(float4 ripples_tc, float3 world_pos, float3 vertical_norm, float smoothness, float strength, out float3 result_norm, out float ripples_power CUBE_RAIN_DROPLETS_GRAD_SIG) + { + float dropletsFade, finalStrength; + get_rain_ripples_impl(ripples_tc, world_pos, vertical_norm, smoothness, strength, result_norm CUBE_RAIN_DROPLETS_GRAD_ARG , dropletsFade, finalStrength); ripples_power = dropletsFade * finalStrength * saturate((1.0 - result_norm.y) * rain_ripple_params2.w); } - void apply_rain_ripples(float3 world_pos, float3 vertical_norm, float smoothness, float strength, inout float3 world_normal, out float ripples_power) + void apply_rain_ripples(float3 world_pos, float3 vertical_norm, float smoothness, float strength, inout float3 world_normal) { #if use_grad float4 tc; @@ -360,11 +372,11 @@ if (compatibility_mode == compatibility_mode_off) { float3 ripplesNorm; #if use_grad - get_rain_ripples(tc, world_pos, vertical_norm, smoothness, strength, ripplesNorm, ripples_power, tcDx, tcDy); + get_rain_ripples(tc, world_pos, vertical_norm, smoothness, strength, ripplesNorm, tcDx, tcDy); #else float4 tc; get_rain_ripples_tc(world_pos, vertical_norm, smoothness, 
tc); - get_rain_ripples(tc, world_pos, vertical_norm, smoothness, strength, ripplesNorm, ripples_power); + get_rain_ripples(tc, world_pos, vertical_norm, smoothness, strength, ripplesNorm); #endif world_normal = RNM_ndetail_normalized(world_normal.xzy, ripplesNorm.xzy).xzy; } @@ -372,15 +384,14 @@ if (compatibility_mode == compatibility_mode_off) void apply_rain_ripples_water(float3 world_pos, float dist, inout float3 world_normal) { - float ripplesPower; - apply_rain_ripples(world_pos * 0.5, float3(0.0, 1.0, 0.0), 1.0, rain_ripple_params.x * saturate(12 - dist / (rain_ripple_params2.x * 48.0)), world_normal, ripplesPower); + apply_rain_ripples(world_pos * 0.5, float3(0.0, 1.0, 0.0), 1.0, rain_ripple_params.x * saturate(12 - dist / (rain_ripple_params2.x * 48.0)), world_normal); } } } else { hlsl(ps) { - #define apply_rain_ripples(world_pos, vertical_norm, smoothness, strength, world_normal, ripples_power) + #define apply_rain_ripples(world_pos, vertical_norm, smoothness, strength, world_normal) #define apply_rain_ripples_water(world_pos, dist, world_normal) } } diff --git a/prog/gameLibs/render/temporalAA.cpp b/prog/gameLibs/render/temporalAA.cpp index 1ddb9cae9..293a810ef 100644 --- a/prog/gameLibs/render/temporalAA.cpp +++ b/prog/gameLibs/render/temporalAA.cpp @@ -225,18 +225,21 @@ void TemporalAA::loadParamsFromBlk(const DataBlock *taaBlk) static float gaussianKernel(const Point2 &uv) { return expf(-2.29f * lengthSq(uv)); } +extern Point2 get_halton_jitter(int counter, int subsamples, float subsample_scale) +{ + // +1 is because halton sequence returns 0 when index is 0. This causes first jitter to be (-0.5,-0.5). + // Halton sequence is correct when the index starts from 1. 
+ int index = counter % subsamples + 1; + return Point2(halton_sequence(index, 2) - 0.5f, halton_sequence(index, 3) - 0.5f) * subsample_scale; +} + Point2 get_taa_jitter(int counter, const TemporalAAParams &p) { static const Point2 SSAA8x[8] = {Point2(0.0625, -0.1875), Point2(-0.0625, 0.1875), Point2(0.3125, 0.0625), Point2(-0.1875, -0.3125), Point2(-0.3125, 0.3125), Point2(-0.4375, -0.0625), Point2(0.1875, 0.4375), Point2(0.4375, -0.4375)}; if (p.useHalton) - { - // +1 is because halton sequence returns 0 when index is 0. This causes first jitter to be (-0.5,-0.5). - // Halton sequence is correct when the index starts from 1. - int index = counter % p.subsamples + 1; - return Point2(halton_sequence(index, 2) - 0.5f, halton_sequence(index, 3) - 0.5f) * p.subsampleScale; - } + return get_halton_jitter(counter, p.subsamples, p.subsampleScale); else { int max_count = min(p.subsamples, 8); diff --git a/prog/gameLibs/render/texcompressors/bcCompressor.cpp b/prog/gameLibs/render/texcompressors/bcCompressor.cpp index 85c707e96..9847e8317 100644 --- a/prog/gameLibs/render/texcompressors/bcCompressor.cpp +++ b/prog/gameLibs/render/texcompressors/bcCompressor.cpp @@ -192,7 +192,7 @@ bool BcCompressor::resetBuffer(unsigned int mips, unsigned int width, unsigned i } #if !(_TARGET_C1 | _TARGET_C2) - vb = d3d::create_vb(data_size(verts), SBCF_MAYBELOST, "bcCompr"); + vb = d3d::create_vb(data_size(verts), 0, "bcCompr"); d3d_err(vb); if (!vb) return false; diff --git a/prog/gameLibs/render/tireTracks.cpp b/prog/gameLibs/render/tireTracks.cpp index 6cedaa38d..18b059273 100644 --- a/prog/gameLibs/render/tireTracks.cpp +++ b/prog/gameLibs/render/tireTracks.cpp @@ -794,7 +794,7 @@ void init(const char *blk_file, bool has_normalmap, bool has_vertex_norm, bool i // create shader buffer tire_tracks::has_vertex_normal = has_vertex_norm; int vsize = has_vertex_norm ? 
sizeof(TireTrackVertexNorm) : sizeof(TireTrackVertexNoNorm); - vbuffer = d3d::create_vb(vertexCount * vsize, SBCF_DYNAMIC | SBCF_MAYBELOST, "tires"); + vbuffer = d3d::create_vb(vertexCount * vsize, SBCF_DYNAMIC, "tires"); // create material & shader Ptr matNull = new MaterialData; diff --git a/prog/gameLibs/render/tracer/tracer.cpp b/prog/gameLibs/render/tracer/tracer.cpp index 2510837c1..79c0fa687 100644 --- a/prog/gameLibs/render/tracer/tracer.cpp +++ b/prog/gameLibs/render/tracer/tracer.cpp @@ -759,8 +759,8 @@ void TracerManager::initTrails() const float noisePeriodScale = tailLengthMeters / (float)tailNoisePeriodMeters; const float numTailParticlesInv = 1.0f / numTailParticles; - tailParticles.create(instancingSBufSupported, sizeof(TailParticle), 0, numTailParticles, - SBCF_BIND_SHADER_RES | SBCF_MISC_STRUCTURED | SBCF_MAYBELOST, 0, "tailBuffer"); + tailParticles.create(instancingSBufSupported, sizeof(TailParticle), 0, numTailParticles, SBCF_BIND_SHADER_RES | SBCF_MISC_STRUCTURED, + 0, "tailBuffer"); TailParticle *particleData; tailParticles.lock(0, 0, (void **)&particleData, VBLOCK_WRITEONLY); G_ASSERT(particleData); @@ -775,7 +775,7 @@ void TracerManager::initTrails() tailParticles.unlock(); uint32_t vbTotalParticles = numTailParticles * (1 - pow(0.5f, (float)numTailLods)) / (1 - 0.5f); - tailVb = dag::create_vb(vbTotalParticles * FX_VERTICES_PER_PARTICLE * vertexSize, SBCF_MAYBELOST, "tailVb"); + tailVb = dag::create_vb(vbTotalParticles * FX_VERTICES_PER_PARTICLE * vertexSize, 0, "tailVb"); G_ASSERT(tailVb); uint8_t *tailVbData; @@ -805,7 +805,7 @@ void TracerManager::initTrails() } tailVb->unlock(); - tailIb = dag::create_ib(numTailParticles * FX_INDICES_PER_PARTICLE * sizeof(uint16_t), SBCF_MAYBELOST, "tailIb"); + tailIb = dag::create_ib(numTailParticles * FX_INDICES_PER_PARTICLE * sizeof(uint16_t), 0, "tailIb"); G_ASSERT(tailIb); uint16_t *tailIbData; @@ -830,7 +830,7 @@ void TracerManager::initTrails() VSD_REG(VSDR_TEXC0, VSDT_SHORT2), VSD_END}; 
tailInstancingVdecl = d3d::create_vdecl(vsdInstancing); tailRendElem.vDecl = tailInstancingVdecl; - tailInstancesIds = dag::create_vb(MAX_FX_TRACERS * MAX_FX_SEGMENTS * sizeof(uint32_t), SBCF_MAYBELOST, "tailInstancesId"); + tailInstancesIds = dag::create_vb(MAX_FX_TRACERS * MAX_FX_SEGMENTS * sizeof(uint32_t), 0, "tailInstancesId"); d3d_err(!!tailInstancesIds); short *data; tailInstancesIds->lock(0, 0, (void **)&data, VBLOCK_WRITEONLY); @@ -843,7 +843,7 @@ void TracerManager::initTrails() else tailParticles.close(); - const uint32_t bufferFlags = SBCF_BIND_SHADER_RES | SBCF_MISC_STRUCTURED | SBCF_DYNAMIC | SBCF_CPU_ACCESS_WRITE | SBCF_MAYBELOST; + const uint32_t bufferFlags = SBCF_BIND_SHADER_RES | SBCF_MISC_STRUCTURED | SBCF_DYNAMIC | SBCF_CPU_ACCESS_WRITE; tracerBuffer.create(instancingSBufSupported, sizeof(GPUFxTracer), sizeof(GPUFxTracerCreate), MAX_FX_TRACERS, createCmdCs ? SBCF_UA_SR_STRUCTURED : bufferFlags, 0, "tracerBuffer"); if (instancingSBufSupported && !createCmdCs) @@ -965,7 +965,7 @@ void TracerManager::initHeads() headRendElem.startIndex = 0; headRendElem.numPrim = FX_PRIMITIVES_PER_PARTICLE; - headVb = dag::create_vb(MAX_FX_TRACERS * FX_HEAD_VERTICES_PER_PARTICLE * sizeof(HeadVertex), SBCF_MAYBELOST, "headVb"); + headVb = dag::create_vb(MAX_FX_TRACERS * FX_HEAD_VERTICES_PER_PARTICLE * sizeof(HeadVertex), 0, "headVb"); G_ASSERT(headVb); uint8_t *headVbData; @@ -983,7 +983,7 @@ void TracerManager::initHeads() headVb->unlock(); } - headIb = dag::create_ib(MAX_FX_TRACERS * FX_HEAD_INDICES_PER_PARTICLE * sizeof(uint16_t), SBCF_MAYBELOST, "headIb"); + headIb = dag::create_ib(MAX_FX_TRACERS * FX_HEAD_INDICES_PER_PARTICLE * sizeof(uint16_t), 0, "headIb"); G_ASSERT(headIb); uint16_t *headIbData; @@ -1005,7 +1005,7 @@ void TracerManager::initHeads() bool instancingSBufSupported = computeSupported; tracerTypeBuffer.create(instancingSBufSupported, sizeof(GPUFXTracerType), 0, tracerTypes.size(), - SBCF_BIND_SHADER_RES | SBCF_MISC_STRUCTURED | SBCF_MAYBELOST, 0, 
"tracerTypeBuffer"); + SBCF_BIND_SHADER_RES | SBCF_MISC_STRUCTURED, 0, "tracerTypeBuffer"); GPUFXTracerType *tracerTypeData = NULL; tracerTypeBuffer.lock(0, 0, (void **)&tracerTypeData, VBLOCK_WRITEONLY); if (tracerTypeData) diff --git a/prog/gameLibs/render/treesAbove/treesAbove.cpp b/prog/gameLibs/render/treesAbove/treesAbove.cpp index a5849bf90..82730ee98 100644 --- a/prog/gameLibs/render/treesAbove/treesAbove.cpp +++ b/prog/gameLibs/render/treesAbove/treesAbove.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -14,7 +15,7 @@ #include #include -#include +#include #include @@ -70,9 +71,9 @@ void TreesAbove::renderInvalidBboxes(const TMatrix &view_itm, float minZ, float const float fullDistance = 2 * trees2dDist; const float texelSize = (fullDistance / trees2dHelper.texSize); const uint32_t updatesNum = min(invalidBoxes.size(), MAX_INVALID_TREES_PER_FRAME); - for (int i = 0; i < updatesNum; ++i) + dag::Span bboxesToUpdate = make_span(invalidBoxes).last(updatesNum); + for (bbox3f bbox : bboxesToUpdate) { - bbox3f bbox = invalidBoxes[i]; Point4 bMin = as_point4(&bbox.bmax); Point4 bMax = as_point4(&bbox.bmin); const BBox2 box2d = {Point2(bMax.x, bMax.z), Point2(bMin.x, bMin.z)}; @@ -89,7 +90,7 @@ void TreesAbove::renderInvalidBboxes(const TMatrix &view_itm, float minZ, float d3d::resource_barrier({trees2d.getTex2D(), RB_RO_SRV | RB_STAGE_PIXEL, 0, 0}); d3d::resource_barrier({trees2dDepth.getTex2D(), RB_RO_SRV | RB_STAGE_PIXEL, 0, 0}); - invalidBoxes.erase(invalidBoxes.begin(), invalidBoxes.begin() + updatesNum); + invalidBoxes.resize(invalidBoxes.size() - updatesNum); if (invalidBoxes.empty()) invalidBoxes.shrink_to_fit(); } diff --git a/prog/gameLibs/render/volumetricLights/shaders/volume_light.sh b/prog/gameLibs/render/volumetricLights/shaders/volume_light.sh index 0cadf27e0..f461200d2 100644 --- a/prog/gameLibs/render/volumetricLights/shaders/volume_light.sh +++ b/prog/gameLibs/render/volumetricLights/shaders/volume_light.sh 
@@ -354,8 +354,7 @@ shader volfog_occlusion_cs hlsl(cs) { float checkVolFogBox(float2 coord) { - float lod = 0; - float closestRawDepth = check_box_occl_visible_tc_base(saturate(coord.xyxy*inv_occlusion_resolution.xyxy + bbox_offset), lod); + float closestRawDepth = check_box_occl_visible_tc_base(saturate(coord.xyxy*inv_occlusion_resolution.xyxy + bbox_offset)); return linearize_z(closestRawDepth, zn_zfar.zw); } diff --git a/prog/gameLibs/render/volumetricLights/shaders/volume_lights_raymarch.hlsl b/prog/gameLibs/render/volumetricLights/shaders/volume_lights_raymarch.hlsl index 30ef31f4f..281abd58b 100644 --- a/prog/gameLibs/render/volumetricLights/shaders/volume_lights_raymarch.hlsl +++ b/prog/gameLibs/render/volumetricLights/shaders/volume_lights_raymarch.hlsl @@ -380,7 +380,11 @@ float4 accumulateFog_impl(uint2 dtId, float4 transformed_znzfar, float2 screen_t out_debug.ra = 1; #endif - float3 sunColor = sun_color_0.rgb * calc_sun_phase(viewVecN, from_sun_direction.rgb); + const float DF_NIGHT_SUN_COS = 0.1; + float3 sunColor = sun_color_0.rgb; + sunColor *= saturate(-from_sun_direction.y/DF_NIGHT_SUN_COS); // fix low angle (underground) sun color + sunColor *= calc_sun_phase(viewVecN, from_sun_direction.xyz); + float3 ambientColor = get_base_ambient_color() * phaseFunctionConst(); uint noiseIndex = calc_raymarch_noise_index(dtId); // TODO: maybe use reconstructionId as index (needs more testing) diff --git a/prog/gameLibs/render/wakePs/wakePs.cpp b/prog/gameLibs/render/wakePs/wakePs.cpp index a20cd6e73..af222719d 100644 --- a/prog/gameLibs/render/wakePs/wakePs.cpp +++ b/prog/gameLibs/render/wakePs/wakePs.cpp @@ -434,7 +434,7 @@ void ParticleSystem::emit(float dt) continue; float maxLifetime = emitter.params.spawn.lifeTime + emitter.params.spawn.lifeSpread; - float maxAllowedEmissionFrequency = MAX_PARTICLES_PER_EMITTER / maxLifetime * 0.9f; + float maxAllowedEmissionFrequency = maxLifetime > 0.f ? 
MAX_PARTICLES_PER_EMITTER / maxLifetime * 0.9f : 0.f; float distance = length(emitter.lastPos - emitter.params.pose.pos); float emitPerSecond = min(emitter.params.spawn.emitPerSecond, maxAllowedEmissionFrequency); float emitPerMeter = emitter.params.spawn.emitPerMeter; diff --git a/prog/gameLibs/render/waterFoamTrail.cpp b/prog/gameLibs/render/waterFoamTrail.cpp index c2b435709..997826521 100644 --- a/prog/gameLibs/render/waterFoamTrail.cpp +++ b/prog/gameLibs/render/waterFoamTrail.cpp @@ -311,9 +311,8 @@ struct Renderer shElem = shMat->make_elem(); - activeBuffer = d3d::create_vb(g_settings.activeVertexCount * sizeof(Vertex), SBCF_DYNAMIC | SBCF_MAYBELOST, "water_foam_trail_a"); - finalizedBuffer = - d3d::create_vb(g_settings.finalizedVertexCount * sizeof(Vertex), SBCF_DYNAMIC | SBCF_MAYBELOST, "water_foam_trail_f"); + activeBuffer = d3d::create_vb(g_settings.activeVertexCount * sizeof(Vertex), SBCF_DYNAMIC, "water_foam_trail_a"); + finalizedBuffer = d3d::create_vb(g_settings.finalizedVertexCount * sizeof(Vertex), SBCF_DYNAMIC, "water_foam_trail_f"); SharedTex maskTexRes = dag::get_tex_gameres(g_settings.texName); G_ASSERT(maskTexRes); diff --git a/prog/gameLibs/render/waterProjFx.cpp b/prog/gameLibs/render/waterProjFx.cpp index 63eb7963e..22e7e1fe5 100644 --- a/prog/gameLibs/render/waterProjFx.cpp +++ b/prog/gameLibs/render/waterProjFx.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include <3d/dag_drv3d.h> #include <3d/dag_drv3dCmd.h> @@ -75,7 +76,7 @@ void WaterProjectedFx::setTextures() internalTargets[i].setVar(); } -void WaterProjectedFx::setView() { setView(newViewTM, newProjTM, newViewItm); } +void WaterProjectedFx::setView() { setView(newViewTM, newProjTM); } bool WaterProjectedFx::getView(TMatrix4 &view_tm, TMatrix4 &proj_tm, Point3 &camera_pos) { @@ -90,122 +91,129 @@ bool WaterProjectedFx::getView(TMatrix4 &view_tm, TMatrix4 &proj_tm, Point3 &cam bool WaterProjectedFx::isValidView() const { return numIntersections > 0; } -void 
WaterProjectedFx::prepare(const TMatrix &view_tm, const TMatrix4 &proj_tm, const TMatrix4 &glob_tm, float water_level, - float significant_wave_height, int frame_no) +void WaterProjectedFx::prepare(const TMatrix &view_tm, const TMatrix &view_itm, const TMatrix4 &proj_tm, const TMatrix4 &glob_tm, + float water_level, float significant_wave_height, int frame_no, bool change_projection) { waterLevel = water_level; numIntersections = 0; newViewTM = view_tm; newProjTM = proj_tm; - newViewItm = ::grs_cur_view.itm; - savedCamPos = ::grs_cur_view.pos; - - float wavesDeltaH = max(significant_wave_height * 2.2f, MIN_WAVE_HEIGHT); - float waterHeightTop = water_level + wavesDeltaH; - float waterHeightBottom = water_level - wavesDeltaH; - - Point3 cameraDir = newViewItm.getcol(2); - Point3 cameraPos = newViewItm.getcol(3); - Point4 bottomPlane; - v_stu(&bottomPlane.x, Frustum(glob_tm).camPlanes[Frustum::BOTTOM]); - float cosA = min(bottomPlane.y, CAMERA_PLANE_BOTTOM_MIN_ANGLE); - cameraPos += -normalize(Point3(cameraDir.x, 0.0f, cameraDir.z)) * - max(waterHeightTop + CAMERA_PLANE_ELEVATION - abs(cameraPos.y), 0.0f) * safediv(cosA, safe_sqrt(1.0f - SQR(cosA))); - cameraPos.y = max(abs(cameraPos.y), waterHeightTop + CAMERA_PLANE_ELEVATION) * (cameraPos.y > 0 ? 
1.0f : -1.0f); - newViewItm.setcol(3, cameraPos); - - newViewTM = orthonormalized_inverse(newViewItm); - TMatrix4 currentGlobTm = TMatrix4(newViewTM) * newProjTM; - - // We have current camera frustum - find intersection with water bounds (upper & lower water planes) - int i, j; - vec4f points[8]; - Point3 frustumPoints[8]; - Frustum frustum = Frustum(currentGlobTm); - // Points order (it is important) - for edges_vid - // -1 1 1 - // -1 1 0 - // -1 -1 1 - // -1 -1 0 - // 1 1 1 - // 1 1 0 - // 1 -1 1 - // 1 -1 0 - frustum.generateAllPointFrustm(points); - - Point4 p; - for (i = 0; i < 8; i++) - { - v_stu(&p.x, points[i]); - frustumPoints[i] = Point3::xyz(p); - } - int edges_vid[] = {0, 1, 2, 3, 4, 5, 6, 7, 0, 4, 4, 6, 6, 2, 2, 0, 1, 5, 5, 7, 7, 3, 3, 1}; - - // Total 12 frustum edges in frustum * 2 planes - carray intersectionPoints; + newViewItm = view_itm; + savedCamPos = view_itm.getcol(3); - for (j = 0; j < 2; j++) + if (change_projection) { - float waterPlaneHeight = j == 0 ? waterHeightTop : waterHeightBottom; - for (i = 0; i < 24; i += 2) + float wavesDeltaH = max(significant_wave_height * 2.2f, MIN_WAVE_HEIGHT); + float waterHeightTop = water_level + wavesDeltaH; + float waterHeightBottom = water_level - wavesDeltaH; + + Point3 cameraDir = newViewItm.getcol(2); + Point3 cameraPos = newViewItm.getcol(3); + Point4 bottomPlane; + v_stu(&bottomPlane.x, Frustum(glob_tm).camPlanes[Frustum::BOTTOM]); + float cosA = min(bottomPlane.y, CAMERA_PLANE_BOTTOM_MIN_ANGLE); + cameraPos += -normalize(Point3(cameraDir.x, 0.0f, cameraDir.z)) * + max(waterHeightTop + CAMERA_PLANE_ELEVATION - abs(cameraPos.y), 0.0f) * safediv(cosA, safe_sqrt(1.0f - SQR(cosA))); + cameraPos.y = max(abs(cameraPos.y), waterHeightTop + CAMERA_PLANE_ELEVATION) * (cameraPos.y > 0 ? 
1.0f : -1.0f); + newViewItm.setcol(3, cameraPos); + + newViewTM = orthonormalized_inverse(newViewItm); + TMatrix4 currentGlobTm = TMatrix4(newViewTM) * newProjTM; + + // We have current camera frustum - find intersection with water bounds (upper & lower water planes) + int i, j; + vec4f points[8]; + Point3 frustumPoints[8]; + Frustum frustum = Frustum(currentGlobTm); + // Points order (it is important) - for edges_vid + // -1 1 1 + // -1 1 0 + // -1 -1 1 + // -1 -1 0 + // 1 1 1 + // 1 1 0 + // 1 -1 1 + // 1 -1 0 + frustum.generateAllPointFrustm(points); + + Point4 p; + for (i = 0; i < 8; i++) { - // Frustum edge - Point3 &p1 = frustumPoints[edges_vid[i]]; - Point3 &p2 = frustumPoints[edges_vid[i + 1]]; + v_stu(&p.x, points[i]); + frustumPoints[i] = Point3::xyz(p); + } + int edges_vid[] = {0, 1, 2, 3, 4, 5, 6, 7, 0, 4, 4, 6, 6, 2, 2, 0, 1, 5, 5, 7, 7, 3, 3, 1}; - float planeDist1 = p1.y - waterPlaneHeight; - float planeDist2 = p2.y - waterPlaneHeight; + // Total 12 frustum edges in frustum * 2 planes + carray intersectionPoints; - if (planeDist1 * planeDist2 < 0.f) // Points are opposite side of the plane? - then edge intersects the plane + for (j = 0; j < 2; j++) + { + float waterPlaneHeight = j == 0 ? waterHeightTop : waterHeightBottom; + for (i = 0; i < 24; i += 2) { - float k = safediv(planeDist1, planeDist1 - planeDist2); // Should be positive - Point3 intersectionPoint = p1 + (p2 - p1) * k; // Frustum-edge intersection point with water plane - intersectionPoint.y = water_level; // Project point exactly on plane which we use for rendering - intersectionPoints[numIntersections++] = intersectionPoint; + // Frustum edge + Point3 &p1 = frustumPoints[edges_vid[i]]; + Point3 &p2 = frustumPoints[edges_vid[i + 1]]; + + float planeDist1 = p1.y - waterPlaneHeight; + float planeDist2 = p2.y - waterPlaneHeight; + + if (planeDist1 * planeDist2 < 0.f) // Points are opposite side of the plane? 
- then edge intersects the plane + { + float k = safediv(planeDist1, planeDist1 - planeDist2); // Should be positive + Point3 intersectionPoint = p1 + (p2 - p1) * k; // Frustum-edge intersection point with water plane + intersectionPoint.y = water_level; // Project point exactly on plane which we use for rendering + intersectionPoints[numIntersections++] = intersectionPoint; + } } } - } - if (numIntersections > 0) - { - // Project all points on screen & calculate x&y bounds - Point2 boxMin = Point2(10000.f, 10000.f); - Point2 boxMax = Point2(-10000.f, -10000.f); - for (i = 0; i < numIntersections; i++) + if (numIntersections > 0) { - currentGlobTm.transform(intersectionPoints[i], p); - boxMin.x = min(boxMin.x, safediv(p.x, p.w)); - boxMin.y = min(boxMin.y, safediv(p.y, p.w)); + // Project all points on screen & calculate x&y bounds + Point2 boxMin = Point2(10000.f, 10000.f); + Point2 boxMax = Point2(-10000.f, -10000.f); + for (i = 0; i < numIntersections; i++) + { + currentGlobTm.transform(intersectionPoints[i], p); + boxMin.x = min(boxMin.x, safediv(p.x, p.w)); + boxMin.y = min(boxMin.y, safediv(p.y, p.w)); - boxMax.x = max(boxMax.x, safediv(p.x, p.w)); - boxMax.y = max(boxMax.y, safediv(p.y, p.w)); - } - boxMin.x = clamp(boxMin.x, -2.0f, 2.0f); - boxMin.y = clamp(boxMin.y, -2.0f, 2.0f); - boxMax.x = clamp(boxMax.x, -2.0f, 2.0f) + 0.001f; - boxMax.y = clamp(boxMax.y, -2.0f, 2.0f) + 0.001f; - TMatrix4 cropMatrix = matrix_perspective_crop(boxMin.x, boxMax.x, boxMin.y, boxMax.y, 0.0f, 1.0f); - newProjTM = newProjTM * cropMatrix; - - if (taaEnabled) - { - const float watJitterx = 0.2f / ((float)frameWidth); - const float watJittery = 0.2f / ((float)frameHeight); - - float xShift = (float)(frame_no % 2); - float yShift = (float)((frame_no % 4) / 2); - curJitter = Point2((-3.0f + 4.0f * xShift + 2.0f * yShift) * watJitterx, (-1.0f - 2.0f * xShift + 4.0f * yShift) * watJittery); - TMatrix4 jitterMatrix; - jitterMatrix.setcol(0, 1.f, 0.f, 0.f, 0.f); - 
jitterMatrix.setcol(1, 0.f, 1.f, 0.f, 0.f); - jitterMatrix.setcol(2, 0.f, 0.f, 1.f, 0.f); - jitterMatrix.setcol(3, curJitter.x, curJitter.y, 0.f, 1.f); - newProjTMJittered = newProjTM * jitterMatrix.transpose(); + boxMax.x = max(boxMax.x, safediv(p.x, p.w)); + boxMax.y = max(boxMax.y, safediv(p.y, p.w)); + } + boxMin.x = clamp(boxMin.x, -2.0f, 2.0f); + boxMin.y = clamp(boxMin.y, -2.0f, 2.0f); + boxMax.x = clamp(boxMax.x, -2.0f, 2.0f) + 0.001f; + boxMax.y = clamp(boxMax.y, -2.0f, 2.0f) + 0.001f; + TMatrix4 cropMatrix = matrix_perspective_crop(boxMin.x, boxMax.x, boxMin.y, boxMax.y, 0.0f, 1.0f); + newProjTM = newProjTM * cropMatrix; } } + else + { + numIntersections = 1; + } - currentGlobTm = TMatrix4(newViewTM) * newProjTM; + if (isValidView() && taaEnabled) + { + const float watJitterx = 0.2f / ((float)frameWidth); + const float watJittery = 0.2f / ((float)frameHeight); + + float xShift = (float)(frame_no % 2); + float yShift = (float)((frame_no % 4) / 2); + curJitter = Point2((-3.0f + 4.0f * xShift + 2.0f * yShift) * watJitterx, (-1.0f - 2.0f * xShift + 4.0f * yShift) * watJittery); + TMatrix4 jitterMatrix; + jitterMatrix.setcol(0, 1.f, 0.f, 0.f, 0.f); + jitterMatrix.setcol(1, 0.f, 1.f, 0.f, 0.f); + jitterMatrix.setcol(2, 0.f, 0.f, 1.f, 0.f); + jitterMatrix.setcol(3, curJitter.x, curJitter.y, 0.f, 1.f); + newProjTMJittered = newProjTM * jitterMatrix.transpose(); + } + + TMatrix4 currentGlobTm = TMatrix4(newViewTM) * newProjTM; process_tm_for_drv_consts(currentGlobTm); setWaterMatrix(currentGlobTm); } @@ -289,9 +297,7 @@ bool WaterProjectedFx::render(IWwaterProjFxRenderHelper *render_helper, dag::Spa bool taaEnabledForThisFrame = taaEnabled && !targetsCleared; SCOPE_VIEW_PROJ_MATRIX; - DagorCurView savedView = ::grs_cur_view; - - setView(newViewTM, taaEnabledForThisFrame ? newProjTMJittered : newProjTM, newViewItm); + setView(newViewTM, taaEnabledForThisFrame ? 
newProjTMJittered : newProjTM); SCOPE_RENDER_TARGET; if (!targetsCleared) @@ -317,7 +323,6 @@ bool WaterProjectedFx::render(IWwaterProjFxRenderHelper *render_helper, dag::Spa renderedAnything |= render_helper->render_geometry_without_aa(); ShaderGlobal::set_int(renderWaterProjectibleDecalsVarId, 0); - ::grs_cur_view = savedView; if (taaEnabledForThisFrame) { @@ -384,7 +389,7 @@ bool WaterProjectedFx::render(IWwaterProjFxRenderHelper *render_helper, dag::Spa for (int i = 0; i < nTargets; ++i) d3d::set_render_target(i, targets[i].getTex2D(), 0); - setView(newViewTM, newProjTM, newViewItm); + setView(newViewTM, newProjTM); renderedAnything |= render_helper->render_geometry_without_aa(); ShaderGlobal::setBlock(globalFrameId, ShaderGlobal::LAYER_FRAME); @@ -395,13 +400,10 @@ bool WaterProjectedFx::render(IWwaterProjFxRenderHelper *render_helper, dag::Spa return renderedAnything; } -void WaterProjectedFx::setView(const TMatrix &view_tm, const TMatrix4 &proj_tm, const TMatrix &view_itm) +void WaterProjectedFx::setView(const TMatrix &view_tm, const TMatrix4 &proj_tm) { d3d::settm(TM_VIEW, view_tm); d3d::settm(TM_PROJ, &proj_tm); - ::grs_cur_view.itm = view_itm; - ::grs_cur_view.tm = view_tm; - ::grs_cur_view.pos = view_itm.getcol(3); } void WaterProjectedFx::setWaterMatrix(const TMatrix4 &glob_tm) diff --git a/prog/gameLibs/render/waterRipples/dropSplashes.cpp b/prog/gameLibs/render/waterRipples/dropSplashes.cpp index 2f7f1164f..b73d7f99a 100644 --- a/prog/gameLibs/render/waterRipples/dropSplashes.cpp +++ b/prog/gameLibs/render/waterRipples/dropSplashes.cpp @@ -160,10 +160,10 @@ DropSplashes::DropSplashes(const DataBlock &blk) : splashRendElem(nullptr), spri initSplashShader(); initSpriteShader(); - splashVb = dag::create_vb(VERTICES_PER_SPLASH * sizeof(float) * 3, SBCF_MAYBELOST, "splashVb"); + splashVb = dag::create_vb(VERTICES_PER_SPLASH * sizeof(float) * 3, 0, "splashVb"); G_ASSERT(splashVb); - splashIb = dag::create_ib(INDICES_PER_SPLASH * sizeof(uint16_t), 
SBCF_MAYBELOST, "splashIb"); + splashIb = dag::create_ib(INDICES_PER_SPLASH * sizeof(uint16_t), 0, "splashIb"); G_ASSERT(splashIb); fillBuffers(); diff --git a/prog/gameLibs/render/wind/clusterWindRenderer.cpp b/prog/gameLibs/render/wind/clusterWindRenderer.cpp index 78902dc8f..ec3102bd4 100644 --- a/prog/gameLibs/render/wind/clusterWindRenderer.cpp +++ b/prog/gameLibs/render/wind/clusterWindRenderer.cpp @@ -187,7 +187,7 @@ ClusterWindRenderer::ClusterWindRenderer(bool need_historical_buffer) cluster_buf.set(d3d::buffers::create_persistent_cb(bufferSize, "cluster_buf"), "cluster_buf"); if (need_historical_buffer) - cluster_buf_prev.set(d3d::create_sbuffer(sizeof(uint4), bufferSize, SBCF_BIND_CONSTANT | SBCF_MAYBELOST, 0, "cluster_buf_prev"), + cluster_buf_prev.set(d3d::create_sbuffer(sizeof(uint4), bufferSize, SBCF_BIND_CONSTANT, 0, "cluster_buf_prev"), "cluster_buf_prev"); treeBendingMultVarId = get_shader_variable_id("cluster_wind_tree_bending_mult"); diff --git a/prog/gameLibs/render/wind/fluidWind.cpp b/prog/gameLibs/render/wind/fluidWind.cpp index 9d569c8a9..1725f838a 100644 --- a/prog/gameLibs/render/wind/fluidWind.cpp +++ b/prog/gameLibs/render/wind/fluidWind.cpp @@ -99,14 +99,27 @@ void FluidWind::update(float dt, const Point3 &origin) { // return -cos(time / duration * 2 * PI) * 0.5 + 0.5; // return 1.0 - abs(cos(time / duration * PI)); + if (motor.phaseAttack.enabled) + { + float attackTime = min(motor.phaseAttack.maxAttackTime, motor.duration); + phase *= attackTime > 0 ? saturate(motor.time / attackTime) : 1.0f; + } float fade = min(motor.duration * 0.5f, motor.phaseFade.maxFadeTime); phase *= motor.duration > 0.001 ? 
(1.0 /*- SQR(1.0 - saturate(motor.time / fade))*/) - - (1.0 - SQR(1.0 - saturate(motor.time - motor.duration + fade) / fade)) + (1.0 - SQR(1.0 - saturate((motor.time - motor.duration + fade) / fade))) : 1; } if (motor.phaseSin.enabled && motor.duration > 0.001) { - phase *= sin(motor.time / motor.duration * motor.phaseSin.numWaves * PI); + float sinOffset = 0.0f; // we do not use cos here to preserve old behavior when attackTime is not used + if (motor.phaseAttack.enabled) + { + sinOffset = ((float)motor.phaseSin.numWaves) * 0.5 * motor.duration; + float attackTime = min(sinOffset, motor.phaseAttack.maxAttackTime); + sinOffset -= attackTime; + phase *= attackTime > 0 ? saturate(motor.time / attackTime) : 1.0f; + } + phase *= sin(motor.time / motor.duration * motor.phaseSin.numWaves * PI + sinOffset); phase *= SQR(1.0 - motor.time / motor.duration); } @@ -321,6 +334,7 @@ int FluidWind::pushMotor(MotorType type, const MotorBase &base) motor.shake = base.shake; motor.phaseFade = base.phaseFade; motor.phaseSin = base.phaseSin; + motor.phaseAttack = base.phaseAttack; motor.direction = Point3(1, 0, 0); diff --git a/prog/gameLibs/soundSystem/banks.cpp b/prog/gameLibs/soundSystem/banks.cpp index 90a432d31..ae7e43b06 100644 --- a/prog/gameLibs/soundSystem/banks.cpp +++ b/prog/gameLibs/soundSystem/banks.cpp @@ -92,7 +92,9 @@ static Bitset loaded_banks; static Bitset failed_banks; static eastl::fixed_string locale; -static bool is_inited = false; +static eastl::fixed_string master_preset_name; + +static bool g_was_inited = false; static void def_err_cb(const char *sndsys_message, const char *fmod_error_message, const char *bank_path, bool is_mod) { @@ -461,9 +463,11 @@ static void add_bank(const char *name, const DataBlock &blk, const char *banks_f void init(const DataBlock &blk) { + SNDSYS_IS_MAIN_THREAD; G_ASSERT_RETURN(sndsys::is_inited(), ); SNDSYS_BANKS_BLOCK; - G_ASSERT_RETURN(!is_inited, ); + G_ASSERT_RETURN(!g_was_inited, ); + g_was_inited = true; const DataBlock 
&banksBlk = *blk.getBlockByNameEx("banks"); const DataBlock &modBlk = *blk.getBlockByNameEx("mod"); @@ -473,17 +477,18 @@ void init(const DataBlock &blk) const char *folder = banksBlk.getStr("folder", "sound"); const char *extension = banksBlk.getStr("extension", ".bank"); + master_preset_name = banksBlk.getStr("preset", "master"); const bool enableMod = blk.getBool("enableMod", false) && modBlk.getBool("allow", true); g_report_bank_loading_time = blk.getBool("reportBankLoadingTime", false); FrameStr name, path; - const DataBlock *presetsBlk = banksBlk.getBlockByNameEx("presets"); - all_presets.reserve(presetsBlk->blockCount()); - for (int j = 0; j < presetsBlk->blockCount(); ++j) + const DataBlock &presetsBlk = *banksBlk.getBlockByNameEx("presets"); + all_presets.reserve(presetsBlk.blockCount()); + for (int j = 0; j < presetsBlk.blockCount(); ++j) { - const DataBlock *presetBlk = presetsBlk->getBlock(j); + const DataBlock *presetBlk = presetsBlk.getBlock(j); all_presets.emplace_back(presetBlk->getBlockName()); Preset &preset = all_presets.back(); @@ -510,24 +515,9 @@ void init(const DataBlock &blk) const char *fileName = pluginBlk->getStr(PLUGIN_SUBSYSTEM); all_plugins.emplace_back(fileName); } - - is_inited = true; } -static constexpr const char *g_default_preset_name = "master"; -const char *get_default_preset(const DataBlock &blk) -{ - const DataBlock &banksBlk = *blk.getBlockByNameEx("banks"); - const DataBlock &presetsBlk = *banksBlk.getBlockByNameEx("presets", banksBlk.getBlockByNameEx("games")); - - const char *preset = banksBlk.getStr("preset", g_default_preset_name); - if (presetsBlk.blockExists(preset)) - return preset; - - G_ASSERTF(!preset || !*preset || !banksBlk.paramExists("preset"), - "Missing block sound{presets{%s{}}} in '%s'; parameter preset:t=\"%s\" is not valid", preset, blk.resolveFilename(), preset); - return ""; -} +const char *get_master_preset() { return master_preset_name.c_str(); } static void enable_preset_impl(Preset &preset, bool 
enable, const PathTags &path_tags) { @@ -616,12 +606,14 @@ bool is_exist(const char *preset_name) bool is_preset_has_failed_banks(const char *preset_name) { + SNDSYS_BANKS_BLOCK; const Preset *preset = find_preset(preset_name); return preset && (preset->banks & failed_banks).any(); } void get_failed_banks_names(eastl::vector &failed_banks_names) { + SNDSYS_BANKS_BLOCK; for (auto &bank : all_banks) { if (failed_banks.test(bank.id)) @@ -631,6 +623,7 @@ void get_failed_banks_names(eastl::vector &fa void get_loaded_banks_names(eastl::vector &banks_names) { + SNDSYS_BANKS_BLOCK; for (auto &bank : all_banks) { if (loaded_banks.test(bank.id)) @@ -638,16 +631,26 @@ void get_loaded_banks_names(eastl::vector &ba } } -void add_guid_to_prohibited(const FMODGUID &event_id) { g_guid_prohibited.insert(hash_fun(event_id)); } +void add_guid_to_prohibited(const FMODGUID &event_id) +{ + SNDSYS_BANKS_BLOCK; + g_guid_prohibited.insert(hash_fun(event_id)); +} bool is_guid_prohibited(const FMODGUID &event_id) { + SNDSYS_BANKS_BLOCK; return g_guid_prohibited.empty() ? 
false : g_guid_prohibited.has_key(hash_fun(event_id)); } -void clear_prohibited_guids() { g_guid_prohibited.clear(); } +void clear_prohibited_guids() +{ + SNDSYS_BANKS_BLOCK; + g_guid_prohibited.clear(); +} void prohibit_bank_events(const eastl::string &bank_name) { + SNDSYS_BANKS_BLOCK; auto pred = [bank_name](const Bank &bank) { return bank.path == bank_name; }; auto it = eastl::find_if(all_banks.begin(), all_banks.end(), pred); if (it != all_banks.end()) @@ -703,8 +706,10 @@ void unload_banks_sample_data() } } -bool any_banks_pending() { return pending_banks.any(); } - -bool are_inited() { return is_inited; } +bool any_banks_pending() +{ + SNDSYS_BANKS_BLOCK; + return pending_banks.any(); +} } // namespace sndsys::banks diff --git a/prog/gameLibs/soundSystem/debug.cpp b/prog/gameLibs/soundSystem/debug.cpp index 1ed23dc89..4cd514043 100644 --- a/prog/gameLibs/soundSystem/debug.cpp +++ b/prog/gameLibs/soundSystem/debug.cpp @@ -43,7 +43,6 @@ static const E3DCOLOR g_stopped_color = E3DCOLOR_MAKE(0xff, 0x80, 0, 0xff); static const E3DCOLOR g_stopping_color = E3DCOLOR_MAKE(0, 0, 0, 0xff); static const E3DCOLOR g_invalid_color = E3DCOLOR_MAKE(0xff, 0xff, 0, 0xff); static const E3DCOLOR g_snapshot_color = E3DCOLOR_MAKE(0x66, 0, 0xff, 0xff); -static bool g_enable_debug_draw = false; static bool g_draw_audibility = false; enum class TraceLevel : int @@ -515,9 +514,6 @@ void debug_draw(const TMatrix4 &glob_tm) StdGuiRender::flush_data(); } -void set_enable_debug_draw(bool enable) { g_enable_debug_draw = enable; } -bool get_enable_debug_draw() { return g_enable_debug_draw; } - void set_draw_audibility(bool enable) { g_draw_audibility = enable; } void debug_enum_events() diff --git a/prog/gameLibs/soundSystem/delayed.cpp b/prog/gameLibs/soundSystem/delayed.cpp index 33e772d33..894da7e91 100644 --- a/prog/gameLibs/soundSystem/delayed.cpp +++ b/prog/gameLibs/soundSystem/delayed.cpp @@ -118,6 +118,7 @@ void abandon(EventHandle handle, float add_delay) void init(const DataBlock 
&blk) { + SNDSYS_DELAYED_BLOCK; g_enable_distant_delay = blk.getBool("enableDistantDelay", false); g_dist_threshold = blk.getReal("distantDelayThreshold", 50.f); } @@ -132,6 +133,7 @@ void close() void get_debug_info(size_t &events, size_t &actions, size_t &max_events, size_t &max_actions) { + SNDSYS_DELAYED_BLOCK; events = g_debug_num_events; actions = g_debug_num_actions; max_events = g_debug_max_events; @@ -194,10 +196,15 @@ void update(float dt) g_events.erase(it, g_events.end()); } -void enable_distant_delay(bool enable) { g_enable_distant_delay = enable; } +void enable_distant_delay(bool enable) +{ + SNDSYS_DELAYED_BLOCK; + g_enable_distant_delay = enable; +} void release_delayed_events() { + SNDSYS_DELAYED_BLOCK; for (Event &e : g_events) release_immediate(e.handle); g_events.clear(); diff --git a/prog/gameLibs/soundSystem/dsp.cpp b/prog/gameLibs/soundSystem/dsp.cpp index 1214b4aea..43a2f161c 100644 --- a/prog/gameLibs/soundSystem/dsp.cpp +++ b/prog/gameLibs/soundSystem/dsp.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -7,6 +8,9 @@ #include #include "internal/debug.h" +static WinCritSec g_dsp_cs; +#define DSP_BLOCK WinAutoLock dspLock(g_dsp_cs); + namespace sndsys { namespace dsp @@ -55,12 +59,23 @@ static void create_pan_dsp() void init(const DataBlock &blk) { + SNDSYS_IS_MAIN_THREAD; + DSP_BLOCK; const DataBlock &dspBlk = *blk.getBlockByNameEx("dsp"); g_pan_3d_decay = saturate(dspBlk.getReal("pan3DDecay", 0.f)); } -void close() { release_pan_dsp(); } +void close() +{ + SNDSYS_IS_MAIN_THREAD; + DSP_BLOCK; + release_pan_dsp(); +} -void apply() { create_pan_dsp(); } +void apply() +{ + DSP_BLOCK; + create_pan_dsp(); +} } // namespace dsp } // namespace sndsys diff --git a/prog/gameLibs/soundSystem/events.cpp b/prog/gameLibs/soundSystem/events.cpp index 505a517f9..87c42a8f5 100644 --- a/prog/gameLibs/soundSystem/events.cpp +++ b/prog/gameLibs/soundSystem/events.cpp @@ -1265,6 +1265,7 @@ bool unload(const FMODGUID &event_id, bool is_strict) void 
events_init(const DataBlock &blk) { + SNDSYS_IS_MAIN_THREAD; g_max_event_instances = max(0, blk.getInt("maxEventInstances", 0)); g_max_oneshot_event_instances = max(0, blk.getInt("maxOneshotEventInstances", g_max_event_instances)); g_reject_far_oneshots = blk.getBool("rejectFarOneshots", false); @@ -1286,6 +1287,7 @@ static inline auto events_close_impl() void events_close() { + SNDSYS_IS_MAIN_THREAD; auto instances = events_close_impl(); for (FMOD::Studio::EventInstance *instance : instances) release_event_instance(*instance, true); diff --git a/prog/gameLibs/soundSystem/internal/debug.h b/prog/gameLibs/soundSystem/internal/debug.h index b62bd3fc3..2ebb8a6a8 100644 --- a/prog/gameLibs/soundSystem/internal/debug.h +++ b/prog/gameLibs/soundSystem/internal/debug.h @@ -6,7 +6,7 @@ #include #include -#define SNDSYS_MAIN_THREAD G_ASSERT(is_main_thread()) +#define SNDSYS_IS_MAIN_THREAD G_ASSERT(is_main_thread()) #define SNDSYS_IF_NOT_INITED_RETURN_(RET) \ if (!::sndsys::is_inited()) \ diff --git a/prog/gameLibs/soundSystem/internal/streams.h b/prog/gameLibs/soundSystem/internal/streams.h index e3debe9c3..ae9490e15 100644 --- a/prog/gameLibs/soundSystem/internal/streams.h +++ b/prog/gameLibs/soundSystem/internal/streams.h @@ -8,11 +8,20 @@ class DataBlock; class Point3; +namespace FMOD +{ +class System; +namespace Studio +{ +class System; +} +} // namespace FMOD + namespace sndsys { namespace streams { -void init(const DataBlock &blk, float virtual_vol_limit); +void init(const DataBlock &blk, float virtual_vol_limit, FMOD::System *system); void close(); void update(float dt); void debug_get_info(uint32_t &num_handles, uint32_t &to_release_count); diff --git a/prog/gameLibs/soundSystem/quirrel/soundSystemSq.cpp b/prog/gameLibs/soundSystem/quirrel/soundSystemSq.cpp index f4969a5ca..e57e7078f 100644 --- a/prog/gameLibs/soundSystem/quirrel/soundSystemSq.cpp +++ b/prog/gameLibs/soundSystem/quirrel/soundSystemSq.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include 
#include #include @@ -26,9 +27,12 @@ void play_sound(const char *name, const Sqrat::Object ¶ms, float volume, con sndsys::EventHandle eh = pos ? sndsys::init_event(name, *pos) : sndsys::init_event(name); if (!eh) { - if (sndsys::is_inited() && !sndsys::has_event(name)) + if (sndsys::is_inited() && sndsys::banks::is_loaded(sndsys::banks::get_master_preset())) { - logerr("sqapi: there is no sound event '%s'", name); + if (!sndsys::has_event(name)) + { + logerr("sqapi: there is no sound event '%s'", name); + } } return; } diff --git a/prog/gameLibs/soundSystem/soundSystem.cpp b/prog/gameLibs/soundSystem/soundSystem.cpp index ffd5bf976..3624a18fd 100644 --- a/prog/gameLibs/soundSystem/soundSystem.cpp +++ b/prog/gameLibs/soundSystem/soundSystem.cpp @@ -96,8 +96,9 @@ static int g_max_channels = 0; static int g_max_software_channels = 0; static bool g_fixed_mem_pool = true; -static int g_last_records_list_changed_time = 0; -static int g_last_outputs_list_changed_time = 0; +static record_list_changed_cb_t g_record_list_changed_cb = nullptr; +static output_list_changed_cb_t g_output_list_changed_cb = nullptr; +static device_lost_cb_t g_device_lost_cb = nullptr; static bool g_auto_change_device = false; @@ -468,18 +469,16 @@ static FMOD_RESULT F_CALLBACK fmod_debug_cb(FMOD_DEBUG_FLAGS flags, const char * } #endif -void init(const DataBlock &blk) +bool init(const DataBlock &blk) { - SNDSYS_MAIN_THREAD; + SNDSYS_IS_MAIN_THREAD; + G_ASSERT(!is_inited()); if (is_inited()) - return; + return false; ScoopedJavaThreadAttacher scoopedJavaThreadAttacher; - g_last_outputs_list_changed_time = g_last_records_list_changed_time = get_time_msec(); - g_is_inited = true; - debug_init(blk); #if (_TARGET_C1 | _TARGET_C2) && !FMOD_SRC_VERSION @@ -511,11 +510,11 @@ void init(const DataBlock &blk) { debug("[SNDSYS] Not inited."); shutdown(); - return; + return false; } events_init(blk); - streams::init(blk, settings::virtual_vol_limit); + streams::init(blk, settings::virtual_vol_limit, 
g_low_level_system); delayed::init(blk); eastl::basic_string memoryInfo; @@ -530,14 +529,15 @@ void init(const DataBlock &blk) #if _TARGET_IOS registerIOSNotifications(&set_snd_suspend); #endif + + g_is_inited = true; + + return g_is_inited; } void shutdown() { - SNDSYS_MAIN_THREAD; - - if (!is_inited()) - return; + SNDSYS_IS_MAIN_THREAD; ScoopedJavaThreadAttacher scoopedJavaThreadAttacher; @@ -659,20 +659,23 @@ static FMOD_RESULT F_CALLBACK fmod_system_cb(FMOD_SYSTEM * /*system*/, FMOD_SYST if (type == FMOD_SYSTEM_CALLBACK_RECORDLISTCHANGED) { debug("[SNDSYS] Record devices list changed"); - g_last_records_list_changed_time = get_time_msec(); + if (g_record_list_changed_cb != nullptr) + g_record_list_changed_cb(); } else if (type == FMOD_SYSTEM_CALLBACK_DEVICELISTCHANGED) { debug("[SNDSYS] Devices list changed"); validate_output(); - g_last_outputs_list_changed_time = get_time_msec(); + if (g_output_list_changed_cb != nullptr) + g_output_list_changed_cb(); if (g_auto_change_device) select_default_device(); } else if (type == FMOD_SYSTEM_CALLBACK_DEVICELOST) { debug("[SNDSYS] Device lost"); - g_last_outputs_list_changed_time = get_time_msec(); + if (g_device_lost_cb != nullptr) + g_device_lost_cb(); if (g_auto_change_device) select_default_device(); } @@ -753,7 +756,6 @@ static bool is_loopback_record_device(const char *dev_name) eastl::vector get_record_devices() { - eastl::vector devices; SNDSYS_IF_NOT_INITED_RETURN_(devices); ScoopedJavaThreadAttacher scoopedJavaThreadAttacher; @@ -804,13 +806,18 @@ void set_snd_suspend(bool suspend) suspend ? 
fmodapi::get_system()->mixerSuspend() : fmodapi::get_system()->mixerResume(); } -int get_last_records_list_changed_time() { return g_last_records_list_changed_time; } - -int get_last_outputs_list_changed_time() { return g_last_outputs_list_changed_time; } +void set_device_changed_async_callbacks(record_list_changed_cb_t record_list_changed_cb, + output_list_changed_cb_t output_list_changed_cb, device_lost_cb_t device_lost_cb) +{ + SNDSYS_IS_MAIN_THREAD; + g_record_list_changed_cb = record_list_changed_cb; + g_output_list_changed_cb = output_list_changed_cb; + g_device_lost_cb = device_lost_cb; +} void flush_commands() { - SNDSYS_MAIN_THREAD; + SNDSYS_IS_MAIN_THREAD; SNDSYS_IF_NOT_INITED_RETURN; g_system->flushCommands(); } diff --git a/prog/gameLibs/soundSystem/streams.cpp b/prog/gameLibs/soundSystem/streams.cpp index 2dbe3bace..3b3718bbc 100644 --- a/prog/gameLibs/soundSystem/streams.cpp +++ b/prog/gameLibs/soundSystem/streams.cpp @@ -314,17 +314,19 @@ static inline void update_stream(Stream &stream, float dt) set_state(stream, StreamState::PLAYING); } -void init(const DataBlock &blk, float virtual_vol_limit) +void init(const DataBlock &blk, float virtual_vol_limit, FMOD::System *system) { + SNDSYS_IS_MAIN_THREAD; SNDSYS_POOL_BLOCK; const int defBufferSizeKb = 64; const int bufferSizeKb = blk.getInt("streamBufferSizeKb", defBufferSizeKb); - SOUND_VERIFY(get_system()->setStreamBufferSize(bufferSizeKb * 1024, FMOD_TIMEUNIT_RAWBYTES)); + SOUND_VERIFY(system->setStreamBufferSize(bufferSizeKb * 1024, FMOD_TIMEUNIT_RAWBYTES)); g_virtual_vol_limit = virtual_vol_limit; } void close() { + SNDSYS_IS_MAIN_THREAD; SNDSYS_POOL_BLOCK; all_streams.enumerate([&](Stream &stream) { #if DAGOR_DBGLEVEL > 0 diff --git a/prog/gameLibs/soundSystem/stub/debugStub.cpp b/prog/gameLibs/soundSystem/stub/debugStub.cpp index fc1e54527..5320c2510 100644 --- a/prog/gameLibs/soundSystem/stub/debugStub.cpp +++ b/prog/gameLibs/soundSystem/stub/debugStub.cpp @@ -9,8 +9,6 @@ void 
debug_trace_warn(const char *, ...) {} void debug_trace_err(const char *, ...) {} void debug_trace_log(const char *, ...) {} void debug_draw(const TMatrix4 &) {} -void set_enable_debug_draw(bool) {} -bool get_enable_debug_draw() { return false; } void set_draw_audibility(bool) {} void debug_enum_events() {} void debug_init(const DataBlock &) {} diff --git a/prog/gameLibs/soundSystem/stub/soundSystemStub.cpp b/prog/gameLibs/soundSystem/stub/soundSystemStub.cpp index a84334b2a..a175f812f 100644 --- a/prog/gameLibs/soundSystem/stub/soundSystemStub.cpp +++ b/prog/gameLibs/soundSystem/stub/soundSystemStub.cpp @@ -15,7 +15,7 @@ class DataBlock; namespace sndsys { -void init(const DataBlock &) {} +bool init(const DataBlock &) { return false; } void shutdown() {} bool is_inited() { return false; } void get_memory_statistics(unsigned & /*system_allocated*/, unsigned & /*current_allocated*/, unsigned & /*max_allocated*/) {} @@ -23,6 +23,7 @@ void get_memory_statistics(unsigned & /*system_allocated*/, unsigned & /*current void set_output_device(int) {} eastl::vector get_output_devices() { return {}; } eastl::vector get_record_devices() { return {}; } +void set_device_changed_async_callbacks(record_list_changed_cb_t, output_list_changed_cb_t, device_lost_cb_t) {} void flush_commands() {} @@ -41,7 +42,7 @@ void init(const DataBlock &) {} void enable(const char *, bool, const PathTags &) {} void enable_starting_with(const char *, bool, const PathTags &) {} -const char *get_default_preset(const DataBlock &) { return ""; } +const char *get_master_preset() { return ""; } bool is_enabled(const char *preset_name) { return false; } bool is_loaded(const char *preset_name) { return false; } bool is_exist(const char *preset_name) { return false; } @@ -162,7 +163,9 @@ void set_pitch(float) {} // update.cpp void update_listener(float, const TMatrix &) {} void reset_3d_listener() {} -void update(float, float) {} +void set_time_speed(float) {} +void update(float) {} +void lazy_update() {} 
void override_time_speed(float) {} Point3 get_3d_listener_pos() { return {}; } TMatrix get_3d_listener() { return {}; } diff --git a/prog/gameLibs/soundSystem/update.cpp b/prog/gameLibs/soundSystem/update.cpp index f42b7f688..e78c9b4a6 100644 --- a/prog/gameLibs/soundSystem/update.cpp +++ b/prog/gameLibs/soundSystem/update.cpp @@ -13,6 +13,7 @@ #include "internal/events.h" #include "internal/streams.h" #include "internal/debug.h" +#include static WinCritSec g_listener_cs; #define SNDSYS_LISTENER_BLOCK WinAutoLock listenerLock(g_listener_cs); @@ -25,6 +26,8 @@ static bool g_reset_listener_tm = true; static float g_override_time_speed = 0.f; static float g_current_pitch = 1.f; static const Point4 g_speed_to_pitch = Point4(0.f, 4.f, 0.f, 4.f); +static constexpr float g_max_listener_speed = 500; +static constexpr float g_max_listener_speed_sq = g_max_listener_speed * g_max_listener_speed; using namespace fmodapi; @@ -75,16 +78,18 @@ void update_listener(float delta_time, const TMatrix &listener_tm) g_listener_vel = Point3(safediv(g_listener_vel.x, delta_time), safediv(g_listener_vel.y, delta_time), safediv(g_listener_vel.z, delta_time)); g_listener_tm = listener_tm; - float listenerSpeedSq = g_listener_vel.lengthSq(); - constexpr float maxListenerSpeedSq = 250000.f; - if (listenerSpeedSq > maxListenerSpeedSq && listenerSpeedSq > 0.f) - g_listener_vel *= sqrt(maxListenerSpeedSq / listenerSpeedSq); - - Attributes3D listener3dAttributes(listener_tm, g_listener_vel); + if constexpr (g_max_listener_speed_sq != 0.f) + { + G_STATIC_ASSERT(g_max_listener_speed_sq > VERY_SMALL_NUMBER); + const float listenerSpeedSq = g_listener_vel.lengthSq(); + if (listenerSpeedSq > g_max_listener_speed_sq) + g_listener_vel *= sqrt(g_max_listener_speed_sq / listenerSpeedSq); + } + const Attributes3D listener3dAttributes(listener_tm, g_listener_vel); SOUND_VERIFY(get_studio_system()->setListenerAttributes(0, &listener3dAttributes)); } -static inline void apply_time_speed(float time_speed) +void 
set_time_speed(float time_speed) { const float speed = g_override_time_speed > 0.f ? g_override_time_speed : time_speed; const float pitch = cvt(speed, g_speed_to_pitch.x, g_speed_to_pitch.y, g_speed_to_pitch.z, g_speed_to_pitch.w); @@ -95,23 +100,39 @@ static inline void apply_time_speed(float time_speed) } } -void update(float dt, float time_speed /* = 1.f*/) -{ - SNDSYS_IF_NOT_INITED_RETURN; +static WinCritSec g_update_cs; +#define UPDATE_BLOCK WinAutoLock updateLock(g_update_cs); + +static constexpr int g_lazy_step_ms = 500; +static constexpr int g_invalid_lazy_value = -1; +static std::atomic_int g_next_lazy_at = ATOMIC_VAR_INIT(g_invalid_lazy_value); +void update(float dt) +{ TIME_PROFILE(sndsys_update); - apply_time_speed(time_speed); + g_next_lazy_at = get_time_msec() + g_lazy_step_ms; + + UPDATE_BLOCK; + SNDSYS_IF_NOT_INITED_RETURN; events_update(dt); delayed::update(dt); + streams::update(dt); + SOUND_VERIFY(get_studio_system()->update()); banks::update(); +} - streams::update(dt); +void lazy_update() +{ + if (g_next_lazy_at == g_invalid_lazy_value || get_time_msec() >= g_next_lazy_at) + { + update((g_lazy_step_ms + get_time_msec() - g_next_lazy_at) / 1000.f); + } } } // namespace sndsys diff --git a/prog/gameLibs/streamIO/http.cpp b/prog/gameLibs/streamIO/http.cpp index 20f561e49..4d3e11aa0 100644 --- a/prog/gameLibs/streamIO/http.cpp +++ b/prog/gameLibs/streamIO/http.cpp @@ -42,7 +42,7 @@ struct StreamContext httprequests::RequestId getReqId() const { return *reqId; } - void sendRequest(const char *url, completion_cb_t complete_cb, stream_data_cb_t stream_cb, header_cb_t header_cb, + void sendRequest(const char *url, completion_cb_t complete_cb, stream_data_cb_t stream_cb, resp_headers_cb_t resp_headers_cb, progress_cb_t progress_cb, void *cb_arg, int64_t modified_since, CreationParams::Timeouts const &timeouts, HTTPContext *owner); void syncWait() @@ -116,7 +116,7 @@ struct HTTPContext : public Context streams.erase(req_id); } - intptr_t 
createStream(const char *name, completion_cb_t complete_cb, stream_data_cb_t stream_cb, header_cb_t header_cb, + intptr_t createStream(const char *name, completion_cb_t complete_cb, stream_data_cb_t stream_cb, resp_headers_cb_t resp_headers_cb, progress_cb_t progress_cb, void *cb_arg, int64_t modified_since, bool do_sync) override { const char *scheme = strstr(name, "://"); @@ -127,7 +127,7 @@ struct HTTPContext : public Context if (dd_strnicmp(name, supported_schemes[i], scheme - name) == 0) { auto ctx = eastl::make_unique(); - ctx->sendRequest(name, complete_cb, stream_cb, header_cb, progress_cb, cb_arg, modified_since, timeouts, this); + ctx->sendRequest(name, complete_cb, stream_cb, resp_headers_cb, progress_cb, cb_arg, modified_since, timeouts, this); intptr_t id = ctx->getReqId(); if (do_sync) ctx->syncWait(); @@ -152,21 +152,20 @@ struct HTTPContext : public Context Context *create(CreationParams *params) { return new HTTPContext(params); } -void StreamContext::sendRequest(const char *url, completion_cb_t complete_cb, stream_data_cb_t stream_cb, header_cb_t header_cb, - progress_cb_t progress_cb, void *cb_arg, int64_t modified_since, CreationParams::Timeouts const &timeouts, HTTPContext *owner) +void StreamContext::sendRequest(const char *url, completion_cb_t complete_cb, stream_data_cb_t stream_cb, + resp_headers_cb_t resp_headers_cb, progress_cb_t progress_cb, void *cb_arg, int64_t modified_since, + CreationParams::Timeouts const &timeouts, HTTPContext *owner) { httprequests::AsyncRequestParams reqParams; eastl::string urlSaved = url; eastl::string prUrlSaved = url; reqParams.url = url; - reqParams.needHeaders = header_cb != nullptr; + reqParams.needResponseHeaders = modified_since >= 0 || resp_headers_cb != nullptr; reqParams.reqType = httprequests::HTTPReq::GET; char dateBuf[512]; - if (modified_since) - { - reqParams.headers.push_back(eastl::make_pair("If-Modified-Since", format_date(dateBuf, 512, modified_since))); - } + if (modified_since >= 0) + 
reqParams.headers.push_back(eastl::make_pair("If-Modified-Since", format_date(dateBuf, sizeof(dateBuf), modified_since))); reqParams.reqTimeoutMs = timeouts.reqTimeoutSec * 1000; reqParams.connectTimeoutMs = timeouts.connectTimeoutSec * 1000; reqParams.lowSpeedTime = timeouts.lowSpeedTimeSec; @@ -175,7 +174,7 @@ void StreamContext::sendRequest(const char *url, completion_cb_t complete_cb, st reqParams.callback = httprequests::make_http_callback( [complete_cb, cb_arg, urlSaved = eastl::move(urlSaved), reqIdWptr, owner](httprequests::RequestStatus status, int http_code, - dag::ConstSpan response, httprequests::StringMap const &headers) { + dag::ConstSpan response, httprequests::StringMap const &resp_headers) { int result = ERR_UNKNOWN; MemGeneralLoadCB *loadCb = nullptr; int lastModified = 0; @@ -186,14 +185,10 @@ void StreamContext::sendRequest(const char *url, completion_cb_t complete_cb, st result = ERR_NOT_MODIFIED; if (response.size() > 0 && http_code == HTTP_OK) loadCb = new MemGeneralLoadCB(response.data(), (int)response.size()); - - auto lmIt = headers.find("Last-Modified"); - if (lmIt != headers.end()) + if (auto lmIt = resp_headers.find("Last-Modified"); lmIt != resp_headers.end()) { - eastl::string_view const &headerValue = lmIt->second; - - // copy to string because headerValue is not zero-terminated - eastl::string dateStr(headerValue.begin(), headerValue.end()); + // copy to string because it is not zero-terminated + eastl::string dateStr(lmIt->second.begin(), lmIt->second.end()); lastModified = getdate(dateStr.c_str(), nullptr); } } @@ -223,9 +218,9 @@ void StreamContext::sendRequest(const char *url, completion_cb_t complete_cb, st } return false; }, - [header_cb, cb_arg](httprequests::StringMap const &headers) { - if (header_cb) - header_cb(headers, cb_arg); + [resp_headers_cb, cb_arg](httprequests::StringMap const &resp_headers) { + if (resp_headers_cb) + resp_headers_cb(resp_headers, cb_arg); }, [progress_cb, prUrlSaved = 
eastl::move(prUrlSaved)](size_t dltotal, size_t dlnow) { if (progress_cb) diff --git a/prog/gameLibs/vehiclePhys/legacyRenderableRayCar.h b/prog/gameLibs/vehiclePhys/legacyRenderableRayCar.h index 8ba474ad0..e600d2c6f 100644 --- a/prog/gameLibs/vehiclePhys/legacyRenderableRayCar.h +++ b/prog/gameLibs/vehiclePhys/legacyRenderableRayCar.h @@ -3,7 +3,7 @@ #include "rayCar.h" #include #include -#include <3d/dag_render.h> // grs_cur_view +#include #ifndef NO_3D_GFX #include diff --git a/prog/gameLibs/video360/video360.cpp b/prog/gameLibs/video360/video360.cpp index 604567bed..8174f52cc 100644 --- a/prog/gameLibs/video360/video360.cpp +++ b/prog/gameLibs/video360/video360.cpp @@ -1,6 +1,7 @@ #include "ioSys/dag_dataBlock.h" #include <3d/dag_render.h> #include <3d/dag_drv3d.h> +#include #include #include #include "shaders/dag_shaderBlock.h" @@ -128,50 +129,30 @@ static TMatrix build_cubemap_face_view_matrix(int face_index, const TMatrix &sou return cameraMatrix; } -bool Video360::getCamera(DagorCurView &cur_view, Driver3dPerspective &persp) +eastl::optional Video360::getCamera() const { if (!enabled || frameIndex == -1) - return false; - - int current_face_rendering = (frameIndex) % 6; - - TMatrix cameraMatrix = build_cubemap_face_view_matrix(current_face_rendering, TMatrix::IDENT); - cameraMatrix = savedCameraTm * cameraMatrix; - - cur_view.itm = cameraMatrix; - cur_view.tm = orthonormalized_inverse(curViewItm); - cur_view.pos = curViewItm.getcol(3); - - persp.wk = 1.f; - persp.hk = 1.f; - persp.zn = zNear; - persp.zf = zFar; - - return true; -} - -bool Video360::getCamera(CameraSetup &cam, Driver3dPerspective &persp) -{ - if (!enabled || frameIndex == -1) - return false; + return eastl::nullopt; int current_face_rendering = (frameIndex) % 6; TMatrix cameraMatrix = build_cubemap_face_view_matrix(current_face_rendering, TMatrix::IDENT); cameraMatrix = savedCameraTm * cameraMatrix; + CameraSetup cam; cam.transform = cameraMatrix; cam.accuratePos = 
dpoint3(cameraMatrix.getcol(3)); cam.fov = 90; cam.znear = zNear; cam.zfar = zFar; + Driver3dPerspective persp; persp.wk = 1.f; persp.hk = 1.f; persp.zn = zNear; persp.zf = zFar; - return true; + return CameraSetupPerspPair{cam, persp}; } diff --git a/prog/gameLibs/vr/vrGuiSurface.cpp b/prog/gameLibs/vr/vrGuiSurface.cpp index cfdeda519..daa446738 100644 --- a/prog/gameLibs/vr/vrGuiSurface.cpp +++ b/prog/gameLibs/vr/vrGuiSurface.cpp @@ -226,10 +226,10 @@ bool init_surface(int ui_width, int ui_height, SurfaceCurvature curvature) re.startIndex = 0; re.numPrim = VRGUI_GRID_SIZE * VRGUI_GRID_SIZE * 2; - render_params.vb = d3d::create_vb(re.numVert * re.stride, SBCF_MAYBELOST, "vrGuiVb"); + render_params.vb = d3d::create_vb(re.numVert * re.stride, 0, "vrGuiVb"); G_ASSERTF_RETURN(render_params.vb != nullptr, false, "[VRUI] failed to create vertex buffer"); - render_params.ib = d3d::create_ib(re.numPrim * 3 * sizeof(short), SBCF_MAYBELOST, "vrGuiIb"); + render_params.ib = d3d::create_ib(re.numPrim * 3 * sizeof(short), 0, "vrGuiIb"); G_ASSERTF_RETURN(render_params.ib != nullptr, false, "[VRUI] failed to create index buffer"); fill_buffers(); diff --git a/prog/gameLibs/webui/httpserver.cpp b/prog/gameLibs/webui/httpserver.cpp index 36b046621..b169e5c62 100644 --- a/prog/gameLibs/webui/httpserver.cpp +++ b/prog/gameLibs/webui/httpserver.cpp @@ -237,6 +237,7 @@ class HttpServer : public DaThread HttpServer(Config *cfg) : DaThread("HttpServer", 128 << 10), listenSocket(OS_SOCKET_INVALID), listenPort(0), requests(midmem) { + stripStackInMinidump(); if (os_sockets_init() < 0) { debug("sockets init failed"); diff --git a/prog/gameLibs/webui/plugins/grapheditor/graphEditor.html b/prog/gameLibs/webui/plugins/grapheditor/graphEditor.html index a7652979d..e6d257e32 100644 --- a/prog/gameLibs/webui/plugins/grapheditor/graphEditor.html +++ b/prog/gameLibs/webui/plugins/grapheditor/graphEditor.html @@ -309,7 +309,7 @@