diff --git a/build.sh b/build.sh index 95f9080f12..a5d057090b 100755 --- a/build.sh +++ b/build.sh @@ -42,6 +42,17 @@ function gpu_on { cudnn_config=ON } +function test_doc { + mkdir -p $build_dir + cd $build_dir + export runtime_include_dir=$workspace/cinn/runtime/cuda + + prepare_ci + cmake_ + build + make_doc +} + function cudnn_off { cudnn_config=OFF } @@ -94,36 +105,46 @@ function prepare_ci { pip install pre-commit pip install clang-format==9.0 pip install wheel - pip install sphinx==3.3.1 sphinx_gallery==0.8.1 recommonmark==0.6.0 exhale scipy breathe==4.24.0 matplotlib + pip install sphinx==3.3.1 sphinx_gallery==0.8.1 recommonmark==0.6.0 exhale scipy breathe==4.24.0 matplotlib sphinx_rtd_theme pip install paddlepaddle-gpu==2.1.2.post101 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html } -function make_doc { +function prepare_doc_model_file { proxy_off - cd $workspace/tutorials - if [[ -f "ResNet18.tar.gz" ]]; then - echo "model file for tutorials already downloaded." - elif [[ -f "$build_dir/thirds/ResNet18.tar.gz" ]]; then - rm -rf $workspace/tutorials/ResNet18 - ln -s $build_dir/thirds/ResNet18 $workspace/tutorials/ResNet18 + local tar_file=$1 + if [[ -f "$tar_file.tar.gz" ]]; then + echo "model file $tar_file.tar.gz for tutorials already downloaded." + elif [[ -f "$build_dir/thirds/$tar_file.tar.gz" ]]; then + rm -rf $workspace/tutorials/$tar_file + ln -s $build_dir/thirds/$tar_file $workspace/tutorials/$tar_file else - wget http://paddle-inference-dist.bj.bcebos.com/CINN/ResNet18.tar.gz - tar -zxvf ResNet18.tar.gz + wget https://paddle-inference-dist.bj.bcebos.com/CINN/$tar_file.tar.gz + tar -zxvf $tar_file.tar.gz fi +} + +function make_doc { + proxy_off + cd $workspace/tutorials + prepare_doc_model_file ResNet50 + prepare_doc_model_file MobileNetV2 + prepare_doc_model_file EfficientNet + prepare_doc_model_file FaceDet + if [[ $cuda_config == "ON" && ! 
-d "./is_cuda" ]]; then mkdir is_cuda fi - + if [[ $cuda_config == "OFF" && -d "./is_cuda" ]]; then + rm -rf ./is_cuda + fi cd $build_dir rm -f $workspace/python/cinn/core_api.so ln -s $build_dir/cinn/pybind/core_api.so $workspace/python/cinn/ cd $workspace/docs mkdir -p docs/source/cpp - cat $workspace/tutorials/matmul.cc | python${py_version} $workspace/tools/gen_c++_tutorial.py > $workspace/docs/source/matmul.md + cat $workspace/tutorials/matmul.cc | python${py_version} $workspace/tools/gen_c++_tutorial.py > $workspace/docs/source/matmul.md + cat $workspace/tutorials/load_paddle_model.cc | python${py_version} $workspace/tools/gen_c++_tutorial.py > $workspace/docs/source/load_paddle_model.md make html - if [[ $cuda_config == "ON" && -d "./is_cuda" ]]; then - rm -rf $workspace/tutorials/is_cuda - fi } function cmake_ { @@ -308,6 +329,10 @@ function main { run_test shift ;; + test_doc) + test_doc + shift + ;; ci) CI shift @@ -320,10 +345,6 @@ function main { prepare_model shift ;; - make_doc) - make_doc - shift - ;; esac done } diff --git a/cinn/backends/codegen_cuda_dev_test.cc b/cinn/backends/codegen_cuda_dev_test.cc old mode 100644 new mode 100755 index b637d459a8..348075268d --- a/cinn/backends/codegen_cuda_dev_test.cc +++ b/cinn/backends/codegen_cuda_dev_test.cc @@ -89,7 +89,7 @@ TEST(CodeGenCUDA, basic) { CodeGenCUDA_Dev codegen(target); - auto func = Lower("elementwise_add", stages, {A, B, C}); + auto func = Lower("elementwise_mul", stages, {A, B, C}); auto compiled = codegen.Compile(func); @@ -115,7 +115,7 @@ TEST(CodeGenCUDA, Module_output) { CodeGenCUDA_Dev codegen(target); - auto func = Lower("elementwise_add", stages, {A, B, C}); + auto func = Lower("elementwise_mul", stages, {A, B, C}); Module::Builder builder("module", target); builder.AddFunction(func); @@ -149,7 +149,7 @@ TEST(CodeGenCUDA2, test_of_cacheread) { stages[B_cache]->ComputeAt(stages[C], 1); CodeGenCUDA_Dev codegen(target); - auto func = Lower("elementwise_add", stages, {A, B, C}); + auto func = Lower("elementwise_mul", stages, {A, B, C}); Module::Builder builder("module", target); builder.AddFunction(func); @@ -181,7 +181,7 @@ TEST(CodeGenCUDA2, test_of_cacheread) { dim3 grid(10, 1, 1); dim3 block(10, 1, 1); - cuda_module.LaunchKernel(0, "elementwise_add", grid, block, args); + cuda_module.LaunchKernel(0, "elementwise_mul", grid, block, args); CUDA_CALL(cudaMemcpy(host_data3.data(), reinterpret_cast(Cd), @@ -221,7 +221,7 @@ TEST(CodeGenCUDA2, test_of_splitcudakernel) { CodeGenCUDA_Dev codegen(target); - auto func = lang::LowerVec("elementwise_add", stages, {A, B, C, D}, {}, {}, nullptr, target); + auto func = lang::LowerVec("elementwise_mul_and_add", stages, {A, B, C, D}, {}, {}, nullptr, target); Module::Builder builder("module", target); for (auto& i : func) { @@ -251,7 +251,7 @@ typedef char int8_t; __global__ -void __launch_bounds__(200) elementwise_add(const float* __restrict__ X, const float* __restrict__ Y, float* __restrict__ C) +void __launch_bounds__(200) elementwise_mul_and_add(const float* __restrict__ X, const float* __restrict__ Y, float* __restrict__ C) { if (((int)blockIdx.x < 100)) { if (((int)threadIdx.x < 200)) { @@ -259,7 +259,7 @@ void __launch_bounds__(200) elementwise_add(const float* __restrict__ X, const f }; }; }__global__ -void __launch_bounds__(200) elementwise_add_1(const float* __restrict__ X, const float* __restrict__ Y, const float* __restrict__ C, float* __restrict__ D) +void __launch_bounds__(200) elementwise_mul_and_add_1(const float* __restrict__ X, const float* __restrict__ 
Y, const float* __restrict__ C, float* __restrict__ D) { if (((int)blockIdx.x < 100)) { if (((int)threadIdx.x < 200)) { diff --git a/cinn/backends/compiler.cc b/cinn/backends/compiler.cc old mode 100644 new mode 100755 index 2fd2428d36..d8ef99fef4 --- a/cinn/backends/compiler.cc +++ b/cinn/backends/compiler.cc @@ -70,14 +70,14 @@ void Compiler::CompileCudaModule(const Module& module, const std::string& code, auto _host_module_device_module_ = SplitCudaAndHostModule(module); // NOLINT auto& host_module = std::get<0>(_host_module_device_module_); auto& device_module = std::get<1>(_host_module_device_module_); - LOG(INFO) << "[CUDA] host module:\n" << host_module; + VLOG(3) << "[CUDA] host module:\n" << host_module; { // compile cuda device - LOG(INFO) << "[CUDA] device module:\n" << device_module; + VLOG(3) << "[CUDA] device module:\n" << device_module; CodeGenCUDA_Dev codegen(target_); auto source_code = codegen.Compile(device_module); if (!code.empty()) source_code = code; - LOG(INFO) << "[CUDA] source code:\n" << source_code; + VLOG(3) << "[CUDA] source code:\n" << source_code; using runtime::cuda::CUDAModule; backends::NVRTC_Compiler compiler; diff --git a/cinn/backends/llvm/execution_engine.cc b/cinn/backends/llvm/execution_engine.cc old mode 100644 new mode 100755 index 43f8d625b8..4b9f29c048 --- a/cinn/backends/llvm/execution_engine.cc +++ b/cinn/backends/llvm/execution_engine.cc @@ -98,7 +98,7 @@ std::unique_ptr NaiveObjectCache::getObject(const llvm::Modu return nullptr; } - LOG(INFO) << "Object for " << m->getModuleIdentifier() << " loaded from cache."; + VLOG(3) << "Object for " << m->getModuleIdentifier() << " loaded from cache."; return llvm::MemoryBuffer::getMemBuffer(it->second->getMemBufferRef()); } @@ -178,25 +178,25 @@ void ExecutionEngine::Link(const ir::Module &module) { decltype(auto) es = jit_->getExecutionSession(); if (false) { - LOG(INFO) << "======= dump jit execution session ======"; + VLOG(3) << "======= dump jit execution session ======"; std::string buffer; llvm::raw_string_ostream os(buffer); es.dump(os); os.flush(); - LOG(INFO) << buffer; + VLOG(3) << buffer; } } bool ExecutionEngine::AddModule(std::unique_ptr module, std::unique_ptr context) { module->setDataLayout(jit_->getDataLayout()); if (false) { - LOG(INFO) << "======= dump jit lib =========="; + VLOG(3) << "======= dump jit lib =========="; std::string buffer; llvm::raw_string_ostream os(buffer); module->print(os, {}); // main_jd_->dump(os); os.flush(); - LOG(INFO) << buffer; + VLOG(3) << buffer; } llvm::orc::ThreadSafeContext tsc(std::move(context)); llvm::orc::ThreadSafeModule tsm(std::move(module), std::move(tsc)); diff --git a/cinn/backends/llvm/simple_jit.cc b/cinn/backends/llvm/simple_jit.cc old mode 100644 new mode 100755 index ed5e2062e3..8439ec9be5 --- a/cinn/backends/llvm/simple_jit.cc +++ b/cinn/backends/llvm/simple_jit.cc @@ -71,8 +71,8 @@ void SimpleJIT::AddModule(std::unique_ptr module, bool optimize) { module_pass_manager.run(*module, module_analysis_manager); } - LOG(INFO) << "jit target: " << jit_->getDataLayout().getStringRepresentation(); - LOG(INFO) << "module target: " << module->getDataLayout().getStringRepresentation(); + VLOG(3) << "jit target: " << jit_->getDataLayout().getStringRepresentation(); + VLOG(3) << "module target: " << module->getDataLayout().getStringRepresentation(); llvm::orc::ThreadSafeModule tsm(std::move(module), context_); llvm::cantFail(jit_->addIRModule(std::move(tsm))); @@ -82,7 +82,7 @@ void SimpleJIT::AddModule(std::unique_ptr module, bool optimize) { 
llvm::raw_string_ostream os(buffer); jit_->getExecutionSession().dump(os); os.flush(); - LOG(INFO) << "compiled jit:\n" << buffer; + VLOG(3) << "compiled jit:\n" << buffer; } } diff --git a/cinn/backends/llvm/simple_jit.h b/cinn/backends/llvm/simple_jit.h old mode 100644 new mode 100755 diff --git a/cinn/backends/nvrtc_util.cc b/cinn/backends/nvrtc_util.cc old mode 100644 new mode 100755 index 012bc1c3c2..e397cbacd4 --- a/cinn/backends/nvrtc_util.cc +++ b/cinn/backends/nvrtc_util.cc @@ -91,7 +91,7 @@ std::string NVRTC_Compiler::CompilePTX(const std::string& code, bool include_hea for (const auto& option : compile_options) { param_cstrings.push_back(option.c_str()); } - LOG(INFO) << "compile options: " << utils::Join(compile_options, " "); + VLOG(3) << "compile options: " << utils::Join(compile_options, " "); NVRTC_CALL(nvrtcCreateProgram(&prog, code.c_str(), nullptr, 0, nullptr, nullptr)); nvrtcResult compile_res = nvrtcCompileProgram(prog, param_cstrings.size(), param_cstrings.data()); diff --git a/cinn/common/cas.cc b/cinn/common/cas.cc index 57766f2400..927ac37008 100644 --- a/cinn/common/cas.cc +++ b/cinn/common/cas.cc @@ -2005,8 +2005,8 @@ Expr CasSimplifyMutator::FurtherSimplifyFracWithInterval( auto it = var_intervals.find(bv->name); auto ai_abs = std::abs(ai->value); if (it != var_intervals.end()) { - LOG(INFO) << "found " << bv->name << " " << it->second << " " - << " ai " << ai_abs; + VLOG(3) << "found " << bv->name << " " << it->second << " " + << " ai " << ai_abs; } if (it != var_intervals.end() && std::abs(it->second.r) > ai_abs && std::abs(it->second.l) > ai_abs) { return make_const(a.type(), 0); diff --git a/cinn/common/ir_util.cc b/cinn/common/ir_util.cc index 022778db5e..48fd22f4f5 100644 --- a/cinn/common/ir_util.cc +++ b/cinn/common/ir_util.cc @@ -125,7 +125,7 @@ Expr RampRelatedMul(Expr a, Expr b) { CHECK_EQ(a_broadcast->lanes, b_broadcast->lanes); return ir::Broadcast::Make(a_broadcast->value * b_broadcast->value, a_broadcast->lanes); } else { - LOG(INFO) << "a,b: " << a << " " << b; + VLOG(3) << "a,b: " << a << " " << b; CINN_NOT_IMPLEMENTED } } diff --git a/cinn/frontend/computation.cc b/cinn/frontend/computation.cc old mode 100644 new mode 100755 index 99a464e949..c62b04aead --- a/cinn/frontend/computation.cc +++ b/cinn/frontend/computation.cc @@ -127,7 +127,7 @@ std::shared_ptr CinnComputation::CompilePaddleModel( } program->SetInputs({input_vars}); program->Validate(); - LOG(INFO) << "program:\n" << *program; + VLOG(3) << "program:\n" << *program; for (auto &name : fetch_names) { output_vars.push_back(varmap.at(name)); diff --git a/cinn/frontend/interpreter.cc b/cinn/frontend/interpreter.cc old mode 100644 new mode 100755 index 9b5679dbb8..f98f498107 --- a/cinn/frontend/interpreter.cc +++ b/cinn/frontend/interpreter.cc @@ -103,7 +103,7 @@ void Interpreter::Impl::Build(const std::vector& input_names, program_->SetInputs({input_vars}); program_->Validate(); - LOG(INFO) << "Program:\n" << *program_; + VLOG(3) << "Program:\n" << *program_; auto graph = std::make_shared(*program_, target); graph->attrs["model_name"] = std::make_shared(model_name); diff --git a/cinn/frontend/paddle/model_parser.cc b/cinn/frontend/paddle/model_parser.cc old mode 100644 new mode 100755 index 5c0d83bd6a..8ab48da30e --- a/cinn/frontend/paddle/model_parser.cc +++ b/cinn/frontend/paddle/model_parser.cc @@ -222,9 +222,9 @@ void LoadModelPb(const std::string &model_dir, CHECK(cpp_prog); CHECK(scope); cpp_prog->ClearBlocks(); - LOG(INFO) << "model_dir is: " << model_dir; - LOG(INFO) << 
"model_file is: " << model_file; - LOG(INFO) << "param_file is: " << param_file; + VLOG(3) << "model_dir is: " << model_dir; + VLOG(3) << "model_file is: " << model_file; + VLOG(3) << "param_file is: " << param_file; // Load model VLOG(4) << "Start load model program..."; std::string prog_path = model_dir + "/__model__"; diff --git a/cinn/hlir/pe/nn.cc b/cinn/hlir/pe/nn.cc old mode 100644 new mode 100755 index 1a124b2f37..8c93189503 --- a/cinn/hlir/pe/nn.cc +++ b/cinn/hlir/pe/nn.cc @@ -253,11 +253,11 @@ std::vector Conv2d_NCHW(const ir::Tensor &input, std::to_string(output_shape_int[1]) + " " + std::to_string(output_shape_int[2]) + " " + std::to_string(output_shape_int[3]); if (res.count(key) > 0) { - LOG(INFO) << "Find saved winograd_conv2d schedule param! key is: " << key; + VLOG(3) << "Find saved winograd_conv2d schedule param! key is: " << key; return Conv2d_winograd_NCHW( input, weights, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, output_name); } - LOG(INFO) << "Didn't find saved winograd_conv2d schedule param! key is: " << key; + VLOG(3) << "Didn't find saved winograd_conv2d schedule param! key is: " << key; } ir::Tensor input_pad; if (pad_h == 0 && pad_w == 0) { diff --git a/cinn/optim/replace_var_with_expr.cc b/cinn/optim/replace_var_with_expr.cc index 6611a7278e..eeee96f2cd 100755 --- a/cinn/optim/replace_var_with_expr.cc +++ b/cinn/optim/replace_var_with_expr.cc @@ -227,7 +227,7 @@ struct ReplaceVarIndexOfCacheMutator : public ir::IRMutator<> { if (tensor_shape[index].is_constant() && tensor_shape[index].get_constant() <= 0) { tensor_shape[index] = Expr(1); } else if (!tensor_shape[index].is_constant()) { - LOG(INFO) << "Index is not constant: " << tensor_shape[index] << " and it will be replaced to 1"; + VLOG(3) << "Index is not constant: " << tensor_shape[index] << " and it will be replaced to 1"; tensor_shape[index] = Expr(1); } (*global_tensor_map_).at(tensor_name)->shape = tensor_shape; @@ -239,7 +239,7 @@ struct ReplaceVarIndexOfCacheMutator : public ir::IRMutator<> { VLOG(3) << i; } } else { - LOG(INFO) << "extent not defined"; + VLOG(3) << "extent not defined"; } } diff --git a/cinn/poly/compute_at_transform.cc b/cinn/poly/compute_at_transform.cc old mode 100644 new mode 100755 index ece3b89e9d..1f69706a4c --- a/cinn/poly/compute_at_transform.cc +++ b/cinn/poly/compute_at_transform.cc @@ -128,7 +128,7 @@ void ComputeAtTransform::DisplayC(isl_map* pschedule, isl_map* cschedule) { auto* build = isl_ast_build_from_context(context.release()); auto* node = isl_ast_build_node_from_schedule_map(build, intersect_schedule.release()); - LOG(INFO) << "code:\n\n" << isl_ast_node_to_C_str(node); + VLOG(3) << "code:\n\n" << isl_ast_node_to_C_str(node); isl_ast_node_free(node); } diff --git a/cinn/poly/stage.cc b/cinn/poly/stage.cc index c13de5f772..ae17b26951 100755 --- a/cinn/poly/stage.cc +++ b/cinn/poly/stage.cc @@ -970,12 +970,12 @@ void Stage::Vectorize(int level, int factor) { CHECK_LT(level, n_out_dims()); CHECK_GT(factor, 0); if (factor == 1) { - LOG(INFO) << "Vectorize-factor 1 has no sense, skip it"; + VLOG(3) << "Vectorize-factor 1 has no sense, skip it"; return; } auto transformed_domain = this->transformed_domain(); if (isl_is_removed_axis(transformed_domain.get(), level)) { - LOG(INFO) << "Vectorizing for-1 has no sense, skip it"; + VLOG(3) << "Vectorizing for-1 has no sense, skip it"; return; } int removed_axes_counts = isl_get_precending_removed_axes_counts(transformed_domain.get(), level); @@ -1008,7 +1008,7 @@ void Stage::Parallel(int level) { auto 
transformed_domain = this->transformed_domain(); VLOG(3) << "transformed_domain" << transformed_domain; if (isl_is_removed_axis(transformed_domain.get(), level)) { - LOG(INFO) << "Paralleling for-1 has no sense, skip it"; + VLOG(3) << "Paralleling for-1 has no sense, skip it"; return; } int removed_axes_counts = isl_get_precending_removed_axes_counts(transformed_domain.get(), level); diff --git a/cinn/pybind/frontend.cc b/cinn/pybind/frontend.cc old mode 100644 new mode 100755 index 693ec3edb8..6e9c9da1cf --- a/cinn/pybind/frontend.cc +++ b/cinn/pybind/frontend.cc @@ -226,7 +226,7 @@ void BindFrontend(pybind11::module *m) { CINN_NOT_IMPLEMENTED } } - LOG(INFO) << info; + VLOG(3) << info; program->ExecuteTest(repeat_); auto out = scope->GetTensor(tensor_out->id); return out; @@ -268,7 +268,7 @@ void BindFrontend(pybind11::module *m) { CINN_NOT_IMPLEMENTED } } - LOG(INFO) << info; + VLOG(3) << info; program->ExecuteTest(repeat_); auto out = scope->GetTensor(tensor_out->id); return out; diff --git a/cinn/pybind/lang.cc b/cinn/pybind/lang.cc old mode 100644 new mode 100755 index f96332fc03..6804e618eb --- a/cinn/pybind/lang.cc +++ b/cinn/pybind/lang.cc @@ -41,6 +41,7 @@ using utils::StringFormat; namespace { void BindBuffer(py::module *); void BindLower(py::module *); +void BindLowerVec(py::module *); void BindPlaceholder(py::module *); void BindCompute(py::module *); void BindModule(py::module *); @@ -66,6 +67,19 @@ void BindLower(py::module *m) { arg("target") = common::DefaultHostTarget()); } +void BindLowerVec(py::module *m) { + using py::arg; + m->def("lower_vec", + &lang::LowerVec, + arg("name"), + arg("stages"), + arg("tensor_args"), + arg("scalar_args") = std::vector(), + arg("temp_tensors") = std::vector(), + arg("b") = nullptr, + arg("target") = common::DefaultHostTarget()); +} + void BindCompute(py::module *m) { #define MAKE_COMPUTE_FN(__fn) \ py::overload_cast &, __fn, const std::string &, const std::vector &>( \ @@ -218,6 +232,7 @@ void BindBuiltin(py::module *m) { void BindLang(py::module *m) { BindBuffer(m); BindLower(m); + BindLowerVec(m); BindPlaceholder(m); BindCompute(m); BindModule(m); diff --git a/cinn/pybind/poly.cc b/cinn/pybind/poly.cc index e1bc8a52ab..dae4c51700 100644 --- a/cinn/pybind/poly.cc +++ b/cinn/pybind/poly.cc @@ -47,6 +47,18 @@ void BindMap(py::module *m) { condition.def_readwrite("cond", &Condition::cond).def(py::init()).def("__str__", &Condition::__str__); } +void BindStageMap(py::module *m) { + DefineShared(m, "StageMap"); + py::class_> stage_map(*m, "StageMap"); + stage_map // + .def( + "__getitem__", + [](poly::StageMap self, ir::Tensor &t) -> Stage & { return *self[t]; }, + py::return_value_policy::reference); + + m->def("create_stages", &poly::CreateStages, py::arg("tensors")); +} + void BindStage(py::module *m) { py::class_ stage(*m, "Stage"); // enum Stage::ComputeAtKind @@ -73,6 +85,7 @@ void BindStage(py::module *m) { .def("split", py::overload_cast(&Stage::Split), arg("level"), arg("factor")) .def("split", py::overload_cast(&Stage::Split), arg("level"), arg("factor")) .def("fuse", py::overload_cast(&Stage::Fuse), arg("level0"), arg("level1")) + .def("fuse", py::overload_cast &>(&Stage::Fuse)) .def("reorder", py::overload_cast &>(&Stage::Reorder), "Reorder the axis in the computation") @@ -87,23 +100,17 @@ void BindStage(py::module *m) { .def("unroll", py::overload_cast(&Stage::Unroll)) .def("unroll", py::overload_cast(&Stage::Unroll)) .def("unroll", py::overload_cast(&Stage::Unroll)) + .def("parallel", py::overload_cast(&Stage::Parallel)) + 
.def("parallel", py::overload_cast(&Stage::Parallel)) + .def("parallel", py::overload_cast(&Stage::Parallel)) .def("compute_at", &Stage::ComputeAtSchedule, arg("other"), arg("level"), arg("kind") = Stage::kComputeAtAuto) .def("skew", &Stage::Skew) .def("ctrl_depend", &Stage::CtrlDepend) .def("cache_read", &Stage::CacheRead) - .def("cache_write", &Stage::CacheWrite); -} - -void BindStageMap(py::module *m) { - DefineShared(m, "StageMap"); - py::class_> stage_map(*m, "StageMap"); - stage_map // - .def( - "__getitem__", - [](poly::StageMap self, ir::Tensor &t) -> Stage & { return *self[t]; }, - py::return_value_policy::reference); - - m->def("create_stages", &poly::CreateStages, py::arg("tensors")); + .def("cache_write", &Stage::CacheWrite) + .def("sync_threads", py::overload_cast(&Stage::SyncThreads)) + .def("sync_threads", + py::overload_cast &, poly::StageMap>(&Stage::SyncThreads)); } } // namespace diff --git a/docs/guide.md b/docs/guide.md deleted file mode 100644 index 4ea81d4616..0000000000 --- a/docs/guide.md +++ /dev/null @@ -1,41 +0,0 @@ -# CINN INSTAllATION GUIDANCE - -### Step 1. Clone Source Code - -Clone CINN from github. - -`git clone https://github.com/PaddlePaddle/CINN.git` - -### Step 2. Build Docker Image - -Build docker image based on the given dockerfile in ./tools/docker/Dockerfile. - -`cd ./CINN/tools/docker` - -`sudo docker build -t cinn_image:v1 .` - -### Step 3. Start a docker container - -Start a docker container and mount folder ./CINN into it. - -Go back to the path where you clone CINN. - -`sudo nvidia-docker run -it --net=host -v $PWD/CINN:/WorkSpace/CINN --name=your_docker_name cinn_image:v1` - -### Step 4. Prepare dependencies - -After enter the container, run ./CINN/tools/ci_build.sh - -`./CINN/tools/ci_build.sh` - -### Step 5. Build CINN and do ci test - -Build CINN and do ci test to verify correctness. - -`cd CINN` - -There are 3 kinds of ci test: - -1. Test on CPU(X86) backends: `./build.sh ci` -2. Test on NVGPU(cuda) backends with CUDNN library: `./build.sh gpu_on ci` -3. Test on NVGPU(cuda) backends without CUDNN library: `./build.sh gpu_on cudnn_off ci` diff --git a/docs/source/conf.py b/docs/source/conf.py index 20366d5e8c..00ad0e1855 100755 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -34,11 +34,11 @@ # -- Project information ----------------------------------------------------- project = 'cinn' -copyright = '2020, cinn team' +copyright = '2021, cinn team' author = 'cinn Team' # The full version, including alpha/beta/rc tags -release = '0.1-alpha' +release = 'release/v0.1-rc' # -- General configuration --------------------------------------------------- @@ -47,6 +47,7 @@ # ones. extensions = [ 'sphinx.ext.doctest', + 'sphinx_rtd_theme', 'sphinx.ext.autosummary', 'sphinx.ext.mathjax', 'sphinx_gallery.gen_gallery', @@ -76,7 +77,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'alabaster' +html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/docs/source/guide.md b/docs/source/guide.md new file mode 100644 index 0000000000..4e076523e5 --- /dev/null +++ b/docs/source/guide.md @@ -0,0 +1,33 @@ +# Install CINN using docker + +### Step 1. Start a docker container + +Start a docker container based on upstream image. 
+ +`nvidia-docker run --name $CONTAINER_NAME -it --net=host registry.baidubce.com/paddlepaddle/paddle:2.2.0-gpu-cuda11.2-cudnn8 /bin/bash` + +If you are using the latest version of docker, try: + +`docker run --gpus all --name $CONTAINER_NAME -it --net=host registry.baidubce.com/paddlepaddle/paddle:2.2.0-gpu-cuda11.2-cudnn8 /bin/bash` + +And notice that if your cuda version is not 11.2, replace the docker image to the corresponding paddle image with identical cuda version [here](https://registry.hub.docker.com/r/paddlepaddle/paddle). + +### Step 2. Clone Source Code + +After entering the container, clone the source code from github. + +`git clone https://github.com/PaddlePaddle/CINN.git` + +### Step 3. Build CINN and do ci test + +Build CINN and do ci test to verify correctness. + +`cd CINN` + +There are 5 kinds of ci test: + +1. Test on CPU(X86) backends: `bash ./build.sh ci` +2. Test on CPU(X86) backends without mklcblas: `bash ./build.sh mklcblas_off ci` +3. Test on CPU(X86) backends without mkldnn: `bash ./build.sh mkldnn_off ci` +4. Test on NVGPU(cuda) backends with CUDNN library: `bash ./build.sh gpu_on ci` +5. Test on NVGPU(cuda) backends without CUDNN library: `bash ./build.sh gpu_on cudnn_off ci` diff --git a/docs/source/index.rst b/docs/source/index.rst old mode 100644 new mode 100755 index 12cf9db542..7103cc3fcb --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -21,8 +21,9 @@ Install :maxdepth: 1 ./install.md + ./guide.md -cinn +CINN ------ Get Started @@ -34,15 +35,16 @@ Get Started C++ APIs -~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~ .. toctree:: :maxdepth: 1 matmul.md + load_paddle_model.md cpp/library_root.rst -cinnrt +CINNRT ------- TBD diff --git a/python/tests/test_computation.py b/python/tests/test_computation.py old mode 100644 new mode 100755 index aa2499052e..5a257e5ea8 --- a/python/tests/test_computation.py +++ b/python/tests/test_computation.py @@ -26,7 +26,6 @@ from cinn import lang from cinn.common import * import numpy as np -import paddle.fluid as fluid import sys assert len(sys.argv) == 3 diff --git a/tutorials/README.md b/tutorials/README.md new file mode 100644 index 0000000000..54ed716083 --- /dev/null +++ b/tutorials/README.md @@ -0,0 +1,13 @@ +================= +Tutorials +================= +This page contains the tutorials about CINN. + +--------- +Run demo +--------- +Compile ``demo.cc``:: + + cd build/dist + + bash build_demo.sh diff --git a/tutorials/README.txt b/tutorials/README.txt deleted file mode 100644 index 9159112743..0000000000 --- a/tutorials/README.txt +++ /dev/null @@ -1,11 +0,0 @@ -Tutorials -=========== -This page contains the tutorials about CINN. - -#### Run demo -compile demo.cc - -```bash -cd build/dist -bash build_demo.sh -``` diff --git a/tutorials/jit.py b/tutorials/jit.py index e9e866705e..6812ea42c1 100755 --- a/tutorials/jit.py +++ b/tutorials/jit.py @@ -18,11 +18,10 @@ In this tutorial, we will introduce the JIT module that execute the DSL on X86 and NV GPU. """ -# sphinx_gallery_thumbnail_path = '_static/icon.png' - import cinn import numpy as np from cinn import runtime +# sphinx_gallery_thumbnail_path = './paddlepaddle.png' ################################################################## # declare some variables for latter use diff --git a/tutorials/load_paddle_model.cc b/tutorials/load_paddle_model.cc new file mode 100644 index 0000000000..458329c7a9 --- /dev/null +++ b/tutorials/load_paddle_model.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2021 CINN Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! @h1 Load and Execute Paddle Model in C++ +//! In this tutorial, we will show you how to load and execute a paddle model in CINN using C++. +//! We will use model ResNet50 as an example. + +#include + +#include "cinn/cinn.h" + +using namespace cinn; // NOLINT + +//! @IGNORE-NEXT +TEST(LOAD_MODEL, basic) { + //! @h2 Prepare to Load Model + //! Declare the params and prepare to load and execute the paddle model. + //! + `input_name` is the name of input tensor in the model. + //! + `target_name` is the name of output tensor we want. + //! + `x_shape` is the input tensor's shape of the model. + + std::string input_name = "inputs"; + std::string target_name = "save_infer_model/scale_0.tmp_1"; + std::vector x_shape = {1, 3, 224, 224}; + + //! @h2 Set the target backend + //! Now CINN only supports two backends: X86 and CUDA. + //! + To choose X86 backends, use : + //! `auto target = common::DefaultHostTarget();` + //! + To choose CUDA backends, use : + //! `auto target = common::DefaultNVGPUTarget();` + + auto target = common::DefaultHostTarget(); + + //! @h2 Load Model to CINN + //! Load the paddle model and compile it into CINN IR. + //! + `target` is the backend to execute model on. + //! + `model_dir` is the path where the paddle model is stored. + //! + `params_combined` implies whether the params of paddle model is stored in one file. + + std::string model_dir = "./ResNet50"; + bool params_combined = true; + auto computation = + frontend::CinnComputation::CompilePaddleModel(target, model_dir, {input_name}, {x_shape}, params_combined); + + //! @h2 Get input tensor and set input data + //! Here we use all-zero data as input. In practical applications, please replace it with real data according to your + //! needs. + + auto input_tensor = computation.GetTensor(input_name); + + std::vector fake_input(input_tensor->shape().numel(), 0.f); + + auto *input_data = input_tensor->mutable_data(target); + if (target.arch == Target::Arch::X86) { + std::copy(fake_input.begin(), fake_input.end(), input_data); + } else if (target.arch == Target::Arch::NVGPU) { + CUDA_CALL(cudaMemcpy( + input_data, fake_input.data(), input_tensor->shape().numel() * sizeof(float), cudaMemcpyHostToDevice)); + } + + //! @h2 Execute Model + //! Execute the model and get output tensor's data. + + computation.Execute(); + + auto target_tensor = computation.GetTensor(target_name); + std::vector output_data(target_tensor->shape().numel(), 0.f); + if (target.arch == Target::Arch::X86) { + std::copy(target_tensor->data(), + target_tensor->data() + target_tensor->shape().numel(), + output_data.data()); + } else if (target.arch == Target::Arch::NVGPU) { + CUDA_CALL(cudaMemcpy(output_data.data(), + reinterpret_cast(target_tensor->mutable_data(target)), + target_tensor->shape().numel() * sizeof(float), + cudaMemcpyDeviceToHost)); + } + //! 
@IGNORE-NEXT + LOG(INFO) << "Succeed!"; +} diff --git a/tutorials/load_paddle_model.py b/tutorials/load_paddle_model.py index 9d59e49b95..cef609ede3 100755 --- a/tutorials/load_paddle_model.py +++ b/tutorials/load_paddle_model.py @@ -16,10 +16,11 @@ ===================== In this tutorial, we will show you how to load and execute a paddle model in CINN. +We offer you four optional models: ResNet50, MobileNetV2, EfficientNet and FaceDet. """ -# sphinx_gallery_thumbnail_path = '_static/icon.png' - +import paddle +import paddle.fluid as fluid import cinn from cinn import * from cinn.frontend import * @@ -27,71 +28,133 @@ from cinn.common import * import numpy as np import os +import sys +# sphinx_gallery_thumbnail_path = './paddlepaddle.png' ################################################################## -# Prepare to Load Model -# ------------------------- +# **Prepare to Load Model** +# -------------------------- # Declare the params and prepare to load and execute the paddle model. # -# - :code:`enable_gpu` implies whether to run CINN on CUDA backends. -# -# - :code:`mnodel_dir` is the path where the paddle model is stored. +# - :code:`model_dir` is the path where the paddle model is stored. # # - :code:`input_tensor` is the name of input tensor in the model. # # - :code:`target_tensor` is the name of output tensor we want. # # - :code:`x_shape` is the input tensor's shape of the model - -model_dir = "./ResNet18" -input_tensor = 'image' -target_tensor = 'save_infer_model/scale_0' +# +# - When choosing model ResNet50, the params should be :: +# +# model_dir = "./ResNet50" +# +# input_tensor = 'inputs' +# +# target_tensor = 'save_infer_model/scale_0.tmp_1' +# +# x_shape = [1, 3, 224, 224] +# +# - When choosing model MobileNetV2, the params should be :: +# +# model_dir = "./MobileNetV2" +# +# input_tensor = 'image' +# +# target_tensor = 'save_infer_model/scale_0' +# +# x_shape = [1, 3, 224, 224] +# +# - When choosing model EfficientNet, the params should be :: +# +# model_dir = "./EfficientNet" +# +# input_tensor = 'image' +# +# target_tensor = 'save_infer_model/scale_0' +# +# x_shape = [1, 3, 224, 224] +# +# - When choosing model FaceDet, the params should be :: +# +# model_dir = "./FaceDet" +# +# input_tensor = 'image' +# +# target_tensor = 'save_infer_model/scale_0' +# +# x_shape = [1, 3, 240, 320] +# +model_dir = "./ResNet50" +input_tensor = 'inputs' +target_tensor = 'save_infer_model/scale_0.tmp_1' x_shape = [1, 3, 224, 224] ################################################################## -# Set the target backend +# **Set the target backend** +# ------------------------------ +# Now CINN only supports two backends: X86 and CUDA. +# +# - For CUDA backends, set ``target = DefaultNVGPUTarget()`` +# +# - For X86 backends, set ``target = DefaultHostTarget()`` +# if os.path.exists("is_cuda"): target = DefaultNVGPUTarget() else: target = DefaultHostTarget() ################################################################## -# Set the input tensor and init interpreter -executor = Interpreter([input_tensor], [x_shape]) - -################################################################## -# Load Model to CINN +# **Load Model to CINN** # ------------------------- -# Load the paddle model and transform it into CINN IR -# -# * :code:`mnodel_dir` is the path where the paddle model is stored. +# Load the paddle model and transform it into CINN IR. # # * :code:`target` is the backend to execute model on. 
# -# * :code:`params_combined` implies whether the params of paddle -# model is stored in one file. - +# * :code:`model_dir` is the path where the paddle model is stored. +# +# * :code:`params_combined` implies whether the params of paddle model is stored in one file. +# +# +model_name = "resnet50" params_combined = True -executor.load_paddle_model(model_dir, target, params_combined) +computation = Computation.compile_paddle_model( + target, model_dir, [input_tensor], [x_shape], params_combined) ################################################################## -# Get input tensor and set input data -a_t = executor.get_tensor(input_tensor) +# **Get input tensor and set input data** +# ----------------------------------------- +# Here we use random data as input. In practical applications, +# please replace it with real data according to your needs. +# +a_t = computation.get_tensor(input_tensor) x_data = np.random.random(x_shape).astype("float32") a_t.from_numpy(x_data, target) ################################################################## -# Get output tensor and init its data to zero. -out = executor.get_tensor(target_tensor) +# Here we set the output tensor's data to zero before running the model. +out = computation.get_tensor(target_tensor) out.from_numpy(np.zeros(out.shape(), dtype='float32'), target) ################################################################## -# Execute Model +# **Execute Model** # ------------------------- # Execute the model and get output tensor's data. -# * :code:`out` is the data of output tensor we want. +# :code:`out` is the data of output tensor we want. +computation.execute() +res_cinn = out.numpy(target) +print("CINN Execution Done!") -executor.run() -out = out.numpy(target) -print("Execution Done!\nResult shape is:\n", out.shape) -print("Result data is:\n", out) +################################################################## +# **Use Paddle to Verify Correctness** +# ------------------------- +# Now we run the model by paddle and check if the 2 results are identical. +config = fluid.core.AnalysisConfig(model_dir + '/__model__', + model_dir + '/params') +config.disable_gpu() +config.switch_ir_optim(False) +paddle_predictor = fluid.core.create_paddle_predictor(config) +data = fluid.core.PaddleTensor(x_data) +paddle_out = paddle_predictor.run([data]) +res_paddle = paddle_out[0].as_ndarray() +print("Paddle Execution Done!\n =============================") +print("Verification result is: ", np.allclose(res_cinn, res_paddle, atol=1e-3)) diff --git a/tutorials/matmul.py b/tutorials/matmul.py index 8cae9ae1cf..1924b80fbb 100755 --- a/tutorials/matmul.py +++ b/tutorials/matmul.py @@ -18,12 +18,11 @@ In this tutorial, we will introduce several ways to optimize the performance of the matrix multiplication on X86 CPU. 
""" -# sphinx_gallery_thumbnail_path = '_static/icon.png' - import cinn import numpy as np import time from cinn import runtime +# sphinx_gallery_thumbnail_path = './paddlepaddle.png' ################################################################## # Declare the basic computation for a matmul diff --git a/tutorials/paddlepaddle.png b/tutorials/paddlepaddle.png new file mode 100644 index 0000000000..d877d50303 Binary files /dev/null and b/tutorials/paddlepaddle.png differ diff --git a/tutorials/schedule_primitives.py b/tutorials/schedule_primitives.py index 6207c056c1..9adc0e8ed0 100755 --- a/tutorials/schedule_primitives.py +++ b/tutorials/schedule_primitives.py @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. """ +=============================== Schedule Primitives in CINN =============================== In this tutorial, we will guide you through the examples of using schedule primitives. """ -# sphinx_gallery_thumbnail_path = '_static/icon.png' - import cinn import numpy as np +# sphinx_gallery_thumbnail_path = './paddlepaddle.png' ################################################################## # declare some variables for latter use @@ -224,3 +224,56 @@ BCR = stages[B].cache_write("local", stages, B) fn = cinn.lower("fn", stages, [A.to_tensor(), B, BCR]) print(fn) + +################################################################## +# +# Parallel +# ------ +# :code:`parallel` will mark one loop to execute in parallel.(Only used in X86 backends) +A = cinn.Placeholder('float32', 'A', (m, n)) +B = cinn.compute((m, n), lambda v: A(v[0], v[1]) * 2., name='B') + +stages = cinn.create_stages([B]) +stages[B].parallel(0) +fn = cinn.lower("fn", stages, [A.to_tensor(), B]) +print(fn) + +################################################################## +# +# Vectorize +# ------ +# :code:`vectorize` will vectorize one loop in param `level`.(Only used in X86 backends) +A = cinn.Placeholder('float32', 'A', (m, n)) +B = cinn.compute((m, n), lambda v: A(v[0], v[1]) * 2., name='B') + +stages = cinn.create_stages([B]) +stages[B].vectorize(0, 10) +fn = cinn.lower("fn", stages, [A.to_tensor(), B]) +print(fn) + +################################################################## +# -------------------------------------------------------------- +# An example of optimizing performance in cuda backends +# -------------------------------------------------------------- +# +# **In this section, we will show you a practical example about optimizing performance using schedule primitives** +# +# Optimize an elementwise_add kernel using `fuse`, `split` and `bind` +# +A = cinn.Placeholder('float32', 'A', (m, m)) +B = cinn.compute((m, m), lambda v: A([v[0], v[1]]) * 2., name='B') + +stages = cinn.create_stages([B]) +fn0 = cinn.lower("fn", stages, [A.to_tensor(), B]) +print("Original kernel before optimizing:\n", fn0) +stages[B].fuse(0, 1) +stages[B].split(level=0, factor=256) +stages[B].bind(0, "blockIdx.x") +stages[B].bind(1, "threadIdx.x") +fn1 = cinn.lower("fn", stages, [A.to_tensor(), B]) +print("\n======================================\nThe optimized kernel:\n", fn1) + +################################################################## +# +# Thus we get an optimized kernel. +#