Skip to content

Commit

Permalink
[AutoParallel] Polish dist tensor design (#56368)
Browse files Browse the repository at this point in the history
* polish dist tensor design

* adjust constructor

* polish details

* polish details design

* fix compile error

* refactor init tensor impl

* fix reshard test

* polish details

* add unittest for coverage
  • Loading branch information
chenwhql authored Aug 22, 2023
1 parent ffff3da commit 8495377
Show file tree
Hide file tree
Showing 26 changed files with 326 additions and 318 deletions.
14 changes: 7 additions & 7 deletions paddle/fluid/eager/grad_node_info.cc
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ void GradNodeBase::SetGradInMeta(const paddle::Tensor& fwd_out,
return;
}

phi::DenseTensor* dense_tensor = nullptr;
const phi::DenseTensor* dense_tensor = nullptr;
// Record TensorMeta
if (phi::DenseTensor::classof(fwd_out.impl().get())) {
// Only Copy Meta
Expand All @@ -130,8 +130,8 @@ void GradNodeBase::SetGradInMeta(const paddle::Tensor& fwd_out,
// TODO(chenweihang): DistTensor contains global and local meta, here
// only set the local meta now, we should set global meta later
dense_tensor =
static_cast<phi::distributed::DistTensor*>(fwd_out.impl().get())
->mutable_value();
&(static_cast<phi::distributed::DistTensor*>(fwd_out.impl().get())
->value());
#endif
} else {
VLOG(7) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with "
Expand Down Expand Up @@ -270,16 +270,16 @@ void GradNodeBase::SetGradOutMeta(const paddle::Tensor& fwd_in,
meta.SetPlace(fwd_in.place());
#ifdef PADDLE_WITH_DISTRIBUTE
} else if (phi::distributed::DistTensor::classof(fwd_in.impl().get())) {
phi::DenseTensor* dense_tensor =
const phi::DenseTensor& dense_tensor =
static_cast<phi::distributed::DistTensor*>(fwd_in.impl().get())
->mutable_value();
->value();
PADDLE_ENFORCE_NE(
dense_tensor->meta().dtype,
dense_tensor.meta().dtype,
phi::DataType::UNDEFINED,
paddle::platform::errors::Fatal("Attempting to copy DenseTensorMeta "
"with phi::DataType::UNDEFINED,"
"which is illegal."));
meta.SetTensorMeta(dense_tensor->meta());
meta.SetTensorMeta(dense_tensor.meta());
meta.SetPlace(fwd_in.place());
#endif
} else {
Expand Down
7 changes: 2 additions & 5 deletions paddle/fluid/eager/grad_tensor_holder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,9 @@ void GradTensorHolder::CopyValueFromTensor(size_t slot_id,
// TODO(chenweihang): replace by valid dist_attr later
auto temp =
paddle::experimental::full(t.shape(), 1, t.dtype(), t.place());
auto dense_temp =
std::dynamic_pointer_cast<phi::DenseTensor>(temp.impl());
auto dense_temp = static_cast<phi::DenseTensor*>(temp.impl().get());
auto dist_tensor = std::make_shared<phi::distributed::DistTensor>(
dense_temp,
dense_temp->meta(),
std::make_shared<phi::distributed::TensorDistAttr>());
*dense_temp, phi::distributed::TensorDistAttr());
temp.set_impl(dist_tensor);
buffer_[slot_id][rank] = temp;
#endif
Expand Down
9 changes: 3 additions & 6 deletions paddle/fluid/pybind/auto_parallel_py.cc
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,7 @@ void BindAutoParallel(py::module *m) {
"is_suitable",
[](phi::distributed::ReshardFunction &self,
py::handle py_tensor,
const std::shared_ptr<phi::distributed::TensorDistAttr>
&dist_attr) {
const phi::distributed::TensorDistAttr &dist_attr) {
auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
auto p_dist =
std::dynamic_pointer_cast<phi::distributed::DistTensor>(
Expand All @@ -135,8 +134,7 @@ void BindAutoParallel(py::module *m) {
[](phi::distributed::ReshardFunction &self,
phi::DeviceContext *dev_ctx,
py::handle py_tensor,
const std::shared_ptr<phi::distributed::TensorDistAttr>
&dist_attr) {
const phi::distributed::TensorDistAttr &dist_attr) {
auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
auto p_dist =
std::dynamic_pointer_cast<phi::distributed::DistTensor>(
Expand Down Expand Up @@ -281,8 +279,7 @@ void BindAutoParallel(py::module *m) {
py::arg("memo"))
.def("__str__", &DeviceMesh::to_string);

py::class_<TensorDistAttr, std::shared_ptr<TensorDistAttr>> py_dist_attr(
*m, "TensorDistAttr");
py::class_<TensorDistAttr> py_dist_attr(*m, "TensorDistAttr");
g_tensor_dist_attr_pytype =
reinterpret_cast<PyTypeObject *>(py_dist_attr.ptr());
py_dist_attr.def(py::init<>())
Expand Down
159 changes: 71 additions & 88 deletions paddle/fluid/pybind/eager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -68,52 +68,6 @@ PyObject* TensorNew(PyTypeObject* type, PyObject* args, PyObject* kwargs) {
return obj;
}

#ifdef PADDLE_WITH_DISTRIBUTE
void EmptyDistTensorInitializer(
TensorObject* self,
const std::string& name,
const paddle::platform::Place& place,
const std::shared_ptr<TensorDistAttr>& dist_attr,
bool persistable = false,
int stop_gradient = -1,
framework::proto::VarType::Type dtype =
paddle::framework::proto::VarType::FP32,
const std::vector<int>& dims = {0}) {
auto ddims = phi::make_ddim(dims);
self->tensor.set_name(name);
auto autograd_meta = egr::EagerUtils::autograd_meta(&(self->tensor));
autograd_meta->SetPersistable(persistable);
if (stop_gradient != -1) {
autograd_meta->SetStopGradient(static_cast<bool>(stop_gradient));
}

std::shared_ptr<DistTensor> dist_tensor = nullptr;
if (dims.size() == 1 && dims[0] == 0) {
std::shared_ptr<phi::Allocation> allocation_ptr = nullptr;
dist_tensor = std::make_shared<DistTensor>(
allocation_ptr,
phi::DenseTensorMeta(paddle::framework::TransToPhiDataType(dtype),
ddims),
dist_attr);
} else {
dist_tensor = std::make_shared<DistTensor>(
std::make_shared<phi::Allocation>(),
phi::DenseTensorMeta(paddle::framework::TransToPhiDataType(dtype),
ddims),
dist_attr);
}
self->tensor.set_impl(dist_tensor);

if (!autograd_meta->GetMutableGradNode()) {
autograd_meta->SetGradNode(
std::make_shared<egr::GradNodeAccumulation>(autograd_meta));
VLOG(3) << "Tensor(" << name
<< ") have not GradNode, add GradNodeAccumulation"
<< autograd_meta->GradNode() << " for it.";
}
}
#endif

// TODO(jiabin): Overload this once we need more constructor in Python
void EmptyTensorInitializer(TensorObject* self,
const std::string& name,
Expand Down Expand Up @@ -184,44 +138,71 @@ void EmptyStringTensorInitializer(TensorObject* self,
}

#ifdef PADDLE_WITH_DISTRIBUTE
void InitDistTensorWithNumpyValue(TensorObject* self,
const py::object& array,
const paddle::platform::Place& place,
bool zero_copy = false) {
PADDLE_ENFORCE_EQ(
self->tensor.defined(),
true,
paddle::platform::errors::Unavailable(
"Calling InitDistTensorWithNumpyValue of Eager Tensor without "
"EmptyDistTensorInitializer is "
"forbidden. Please check your code and make sure you new a "
"eager tensor before init it with NumPy."));
DistTensor* dist_tensor_ptr =
static_cast<DistTensor*>(self->tensor.impl().get());
phi::DenseTensor* impl_ptr =
static_cast<phi::DenseTensor*>(dist_tensor_ptr->mutable_value());
void CreateDistTensorWithNumpyValue(TensorObject* self,
const std::string& name,
const paddle::platform::Place& place,
const TensorDistAttr& dist_attr,
const py::object& array,
bool persistable = false,
int stop_gradient = -1,
bool zero_copy = false,
framework::proto::VarType::Type dtype =
paddle::framework::proto::VarType::FP32,
const std::vector<int>& dims = {0}) {
auto ddims = phi::make_ddim(dims);
self->tensor.set_name(name);
auto autograd_meta = egr::EagerUtils::autograd_meta(&(self->tensor));
autograd_meta->SetPersistable(persistable);
if (stop_gradient != -1) {
autograd_meta->SetStopGradient(static_cast<bool>(stop_gradient));
}

phi::DenseTensor dense_tensor;
if (dims.size() == 1 && dims[0] == 0) {
std::shared_ptr<phi::Allocation> allocation_ptr = nullptr;
dense_tensor = phi::DenseTensor(
nullptr,
phi::DenseTensorMeta(paddle::framework::TransToPhiDataType(dtype),
ddims));
} else {
dense_tensor = phi::DenseTensor(
std::make_shared<phi::Allocation>(),
phi::DenseTensorMeta(paddle::framework::TransToPhiDataType(dtype),
ddims));
}

if (platform::is_cpu_place(place)) {
SetTensorFromPyArray<platform::CPUPlace>(impl_ptr, array, place, zero_copy);
SetTensorFromPyArray<platform::CPUPlace>(
&dense_tensor, array, place, zero_copy);
} else if (platform::is_xpu_place(place)) {
SetTensorFromPyArray<platform::XPUPlace>(impl_ptr, array, place, zero_copy);
SetTensorFromPyArray<platform::XPUPlace>(
&dense_tensor, array, place, zero_copy);
} else if (platform::is_gpu_place(place)) {
SetTensorFromPyArray<platform::CUDAPlace>(
impl_ptr, array, place, zero_copy);
&dense_tensor, array, place, zero_copy);
} else if (platform::is_cuda_pinned_place(place)) {
SetTensorFromPyArray<platform::CUDAPinnedPlace>(
impl_ptr, array, place, zero_copy);
&dense_tensor, array, place, zero_copy);
} else if (platform::is_custom_place(place)) {
SetTensorFromPyArray<platform::CustomPlace>(
impl_ptr, array, place, zero_copy);
&dense_tensor, array, place, zero_copy);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Place should be one of "
"CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/CustomPlace"));
}

// TODO(dev): dist_tensor meta is not equal to dense tensor meta
dist_tensor_ptr->set_meta(impl_ptr->meta());
auto dist_tensor =
std::make_shared<phi::distributed::DistTensor>(dense_tensor, dist_attr);
self->tensor.set_impl(dist_tensor);

if (!autograd_meta->GetMutableGradNode()) {
autograd_meta->SetGradNode(
std::make_shared<egr::GradNodeAccumulation>(autograd_meta));
VLOG(3) << "Tensor(" << name
<< ") have not GradNode, add GradNodeAccumulation"
<< autograd_meta->GradNode() << " for it.";
}
}
#endif

Expand Down Expand Up @@ -284,28 +265,25 @@ void InitStringTensorWithNumpyValue(TensorObject* self, const py::object& obj) {
}

#ifdef PADDLE_WITH_DISTRIBUTE
void InitDistTensorWithTensor(
TensorObject* self,
const paddle::Tensor& src,
const paddle::platform::Place& place,
const std::string& name,
const std::shared_ptr<TensorDistAttr>& dist_attr) {
void InitDistTensorWithTensor(TensorObject* self,
const paddle::Tensor& src,
const paddle::platform::Place& place,
const std::string& name,
const TensorDistAttr& dist_attr) {
PADDLE_ENFORCE(src.is_dense_tensor(),
paddle::platform::errors::InvalidArgument(
"DistTensor can only initialize by DenseTensor"));
self->tensor.set_name(name);
if (place == src.place()) {
std::shared_ptr<phi::DenseTensor> tensor =
std::static_pointer_cast<phi::DenseTensor>(src.impl());
self->tensor.set_impl(
std::make_shared<DistTensor>(tensor, tensor->meta(), dist_attr));
self->tensor.set_impl(std::make_shared<DistTensor>(*tensor, dist_attr));
VLOG(4) << "Same place, do ShareDataWith for DistTensor.";
} else {
std::shared_ptr<phi::DenseTensor> tensor =
std::static_pointer_cast<phi::DenseTensor>(
src.copy_to(place, true).impl());
self->tensor.set_impl(
std::make_shared<DistTensor>(tensor, tensor->meta(), dist_attr));
self->tensor.set_impl(std::make_shared<DistTensor>(*tensor, dist_attr));
VLOG(4) << "Different place, do TensorCopy for DistTensor.";
}
if (src.get_autograd_meta()) {
Expand Down Expand Up @@ -416,13 +394,13 @@ paddle::platform::Place ParsePlace(
}

#ifdef PADDLE_WITH_DISTRIBUTE
std::shared_ptr<TensorDistAttr> ParseDistAttrArgs(
TensorDistAttr ParseDistAttrArgs(
std::unordered_map<std::string, PyObject*> kws_map,
std::unordered_map<std::string, Py_ssize_t> kw_order_map,
PyObject* args,
bool flag_kwargs,
Py_ssize_t args_num) {
std::shared_ptr<TensorDistAttr> dist_attr = nullptr;
TensorDistAttr dist_attr;
if (kw_order_map["dist_attr"] <= args_num) {
dist_attr = CastPyArg2DistAttr(
PyTuple_GET_ITEM(args, kw_order_map["dist_attr"] - 1),
Expand Down Expand Up @@ -530,13 +508,18 @@ void AutoInitTensorByPyArray(TensorObject* py_tensor_ptr,
"stop_gradient", kws_map, kw_order_map, args, flag_kwargs, args_num);

#ifdef PADDLE_WITH_DISTRIBUTE
std::shared_ptr<TensorDistAttr> dist_attr =
TensorDistAttr dist_attr =
ParseDistAttrArgs(kws_map, kw_order_map, args, flag_kwargs, args_num);

if (dist_attr) {
EmptyDistTensorInitializer(
py_tensor_ptr, act_name, place, dist_attr, persistable, stop_gradient);
InitDistTensorWithNumpyValue(py_tensor_ptr, numpy_value, place, zero_copy);
if (!dist_attr.empty()) {
CreateDistTensorWithNumpyValue(py_tensor_ptr,
act_name,
place,
dist_attr,
numpy_value,
persistable,
stop_gradient,
zero_copy);
return;
}
#endif
Expand Down Expand Up @@ -572,7 +555,7 @@ void AutoInitTensorByTensor(TensorObject* py_tensor_ptr,
act_name = ParseName(kws_map, kw_order_map, args, flag_kwargs, args_num);

#ifdef PADDLE_WITH_DISTRIBUTE
std::shared_ptr<TensorDistAttr> dist_attr =
TensorDistAttr dist_attr =
ParseDistAttrArgs(kws_map, kw_order_map, args, flag_kwargs, args_num);
#endif

Expand All @@ -595,7 +578,7 @@ void AutoInitTensorByTensor(TensorObject* py_tensor_ptr,
}
}
#ifdef PADDLE_WITH_DISTRIBUTE
if (dist_attr) {
if (!dist_attr.empty()) {
InitDistTensorWithTensor(
py_tensor_ptr, src_tensor, place, act_name, dist_attr);
} else {
Expand Down
9 changes: 9 additions & 0 deletions paddle/fluid/pybind/eager_method.cc
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,15 @@ static PyObject* tensor_method_numpy(TensorObject* self,
return array;
}
auto tensor_dims = self->tensor.shape();
#ifdef PADDLE_WITH_DISTRIBUTE
// Now the DistTensor's numpy() return the local tensor value
if (self->tensor.is_dist_tensor()) {
tensor_dims = phi::vectorize(
static_cast<phi::distributed::DistTensor*>(self->tensor.impl().get())
->value()
.dims());
}
#endif
auto numpy_dtype = TensorDtype2NumpyDtype(self->tensor.type());
auto sizeof_dtype = phi::SizeOf(self->tensor.type());
Py_intptr_t py_dims[paddle::framework::DDim::kMaxRank]; // NOLINT
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/pybind/eager_properties.cc
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ PyObject* tensor_properties_get_dist_attr(TensorObject* self, void* closure) {
#ifdef PADDLE_WITH_DISTRIBUTE
phi::distributed::DistTensor* dist_tensor =
static_cast<phi::distributed::DistTensor*>(self->tensor.impl().get());
return ToPyObject(dist_tensor->dist_attr().get());
return ToPyObject(&dist_tensor->dist_attr());
#else
RETURN_PY_NONE
#endif
Expand Down
5 changes: 2 additions & 3 deletions paddle/fluid/pybind/eager_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -547,11 +547,10 @@ platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos) {

#ifdef PADDLE_WITH_DISTRIBUTE
using phi::distributed::TensorDistAttr;
std::shared_ptr<TensorDistAttr> CastPyArg2DistAttr(PyObject* obj,
ssize_t arg_pos) {
TensorDistAttr CastPyArg2DistAttr(PyObject* obj, ssize_t arg_pos) {
if (PyObject_IsInstance(
obj, reinterpret_cast<PyObject*>(g_tensor_dist_attr_pytype))) {
return ::pybind11::handle(obj).cast<std::shared_ptr<TensorDistAttr>>();
return ::pybind11::handle(obj).cast<TensorDistAttr>();
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"argument (position %d) must be "
Expand Down
4 changes: 2 additions & 2 deletions paddle/fluid/pybind/eager_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -313,8 +313,8 @@ paddle::DataType CastPyArg2DataTypeDirectly(PyObject* obj,
ssize_t arg_pos);

#ifdef PADDLE_WITH_DISTRIBUTE
std::shared_ptr<phi::distributed::TensorDistAttr> CastPyArg2DistAttr(
PyObject* obj, ssize_t arg_pos);
phi::distributed::TensorDistAttr CastPyArg2DistAttr(PyObject* obj,
ssize_t arg_pos);
#endif

paddle::optional<paddle::Tensor> GetOptionalTensorFromArgs(
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/pybind/tensor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1029,7 +1029,7 @@ void BindTensor(pybind11::module &m) { // NOLINT
py::class_<DistTensor>(m, "DistTensor")
.def(
"get_tensor",
[](DistTensor &self) { return self.mutable_value(); },
[](DistTensor &self) { return self.value(); },
py::return_value_policy::reference)
.def("numel",
[](DistTensor &self) -> int64_t { return self.value().numel(); });
Expand Down
4 changes: 1 addition & 3 deletions paddle/phi/api/lib/api_gen_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -539,11 +539,9 @@ phi::distributed::DistTensor* SetKernelDistOutput(Tensor* out) {
if (out) {
// TODO(chenweihang): now all dist case are nullptr
if (out->impl() == nullptr) {
auto dense_t = std::make_shared<phi::DenseTensor>();
// TODO(chenweihang): polish code, dist_attr is null now
auto dist_attr = std::make_shared<phi::distributed::TensorDistAttr>();
auto dist_t = std::make_shared<phi::distributed::DistTensor>(
dense_t, phi::DenseTensorMeta(), dist_attr);
phi::DDim(), phi::distributed::TensorDistAttr());
out->set_impl(dist_t);
}
return static_cast<phi::distributed::DistTensor*>(out->impl().get());
Expand Down
Loading

0 comments on commit 8495377

Please sign in to comment.