{BugFix} Add check in ADT downloader which looks for removed datasets
Summary:
In ADT data version 1.2 (to be announced shortly), we removed 5 datasets that were found to have bad IMU data.
This diff adds a check in the downloader to warn users that these datasets are no longer available for download.
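
As an illustration (not part of this commit), here is a minimal sketch of how a caller could use the new is_dataset_corrupt binding to screen sequence names before requesting a download. The import path and the two sequence names are taken from the diff below; the surrounding script is hypothetical and assumes a projectaria_tools build that includes this change.

    # Hypothetical pre-check script; not part of this diff.
    from projectaria_tools.projects.adt import is_dataset_corrupt

    requested_sequences = [
        "Apartment_release_clean_seq139",  # removed in ADT 1.2 (corrupt IMU data)
        "Apartment_release_multiskeleton_party_seq109",  # also removed in ADT 1.2
    ]

    for name in requested_sequences:
        if is_dataset_corrupt(name):
            # The downloader itself now raises ValueError in this case (see the diff below);
            # here we simply report and skip.
            print(f"Skipping {name}: removed from the downloads list due to corrupt data")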

Reviewed By: chpeng-fb

Differential Revision: D52384544

fbshipit-source-id: 762df3abb95038959f8976aea60a99a643808756
nickcharron authored and facebook-github-bot committed Dec 27, 2023
1 parent a314f72 commit 237cdd9
Showing 6 changed files with 20 additions and 9 deletions.

@@ -175,7 +175,7 @@ def main():
5: Total MPS SLAM trajectories: ~15GB
6: Total MPS SLAM semidense points and observations: ~140GB
7: Total MPS SLAM online calibration: ~5GB
-Do you want to download all 222 sequences? [y/N]
+Do you want to download all 217 sequences? [y/N]
"""
).lower()
== "y"

@@ -23,6 +23,8 @@
from zipfile import is_zipfile, ZipFile

import requests
+
+from projectaria_tools.projects.adt import is_dataset_corrupt
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm import tqdm
@@ -230,6 +232,12 @@ def __init__(
        self.sequences = sequences
        if sequences is None:
            self.sequences = self.__get_sequences_of_group(data_group)
+        else:
+            for sequence in sequences:
+                if is_dataset_corrupt(sequence):
+                    raise ValueError(
+                        f"Sequence {sequence} has been removed from downloads list due to corrupt data"
+                    )
        self.overwrite = overwrite

    def download_data(self, output_folder: str):

@@ -73,6 +73,12 @@ inline const std::string kDatasetVersionUnknown = "Unknown";
inline const std::string kDatasetNameKey = "dataset_name";
inline const std::unordered_map<std::string, std::string> kLatestDatasetVersions{
{"ADT_2023", "1.2"}};
inline const std::unordered_map<std::string, std::string> kCorruptDatasets{
{"Apartment_release_multiuser_party_seq145", "IMU data corrupted"},
{"Apartment_release_multiuser_clean_seq115", "IMU data corrupted"},
{"Apartment_release_clean_seq139", "IMU data corrupted"},
{"Apartment_release_multiskeleton_party_seq112", "IMU data corrupted"},
{"Apartment_release_multiskeleton_party_seq109", "IMU data corrupted"}};

// data values
constexpr int64_t kInvalidDeviceTimestampNs = -1;

@@ -50,13 +50,6 @@ namespace projectaria::dataset::adt {
constexpr auto kInstanceFileErrorTemplate =
"invalid instance file. key: '{}' not available in instances json file for instance id {}";

-const std::unordered_map<std::string, std::string> kCorruptDatasets{
-    {"Apartment_release_multiuser_party_seq145", "IMU data corrupted"},
-    {"Apartment_release_multiuser_clean_seq115", "IMU data corrupted"},
-    {"Apartment_release_clean_seq139", "IMU data corrupted"},
-    {"Apartment_release_multiskeleton_party_seq112", "IMU data corrupted"},
-    {"Apartment_release_multiskeleton_party_seq109", "IMU data corrupted"}};
-
namespace {
std::ifstream openFile(const fs::path& filePath, bool skipHeader = true) {
std::ifstream fileStream(filePath);

@@ -14,6 +14,7 @@
* limitations under the License.
*/

#include "AriaDigitalTwinDataFileKeys.h"
#include "AriaDigitalTwinDataPathsProvider.h"
#include "AriaDigitalTwinDataProvider.h"

@@ -693,6 +694,9 @@ void exportAriaDigitalTwin(py::module& m) {
"[b1, b2, b3, b4, b1, t1, t2, t3, t4, t1, t2, b2, b3, t3, t4, b4] where b is for "
"bottom and t is for top",
py::arg("bbox"));
+  m.def("is_dataset_corrupt", [](const std::string& seq) {
+    return kCorruptDatasets.find(seq) != kCorruptDatasets.end();
+  });
}

} // namespace projectaria::dataset::adt

@@ -64,7 +64,7 @@ For more information on the content in the other sequences, see the Data Content
## Download the Aria Digital Twin (ADT) benchmark dataset
### Data size

-Aria Digital Twin dataset consists of 222 sequences in total. The total size of the dataset is about 3.5TB. The dataset is split into 4 data types that can be downloaded individually. The size of each data type is below.
+Aria Digital Twin dataset consists of 217 sequences in total. The total size of the dataset is about 3.5TB. The dataset is split into 4 data types that can be downloaded individually. The size of each data type is below.

| **Data type** | What's included | Per sequence size | Total size for all sequences |
| --- | --- | --- | --- |
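
As a quick cross-check of the sequence counts updated above (222 -> 217), here is a throwaway sketch, not from the repository, that derives the new total from the five entries added to kCorruptDatasets in this commit.

    # The five sequences removed in ADT data version 1.2, per kCorruptDatasets above.
    removed_sequences = [
        "Apartment_release_multiuser_party_seq145",
        "Apartment_release_multiuser_clean_seq115",
        "Apartment_release_clean_seq139",
        "Apartment_release_multiskeleton_party_seq112",
        "Apartment_release_multiskeleton_party_seq109",
    ]

    original_total = 222  # sequence count before this change
    print(original_total - len(removed_sequences))  # 217, matching the updated docs and prompt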
