/* Copyright 2019 The Waymo Open Dataset Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
syntax = "proto2";

package waymo.open_dataset;

import "waymo_open_dataset/label.proto";
message MatrixShape {
  // Dimensions for the Matrix messages defined below. Must not be empty.
  //
  // The order of entries in 'dims' matters, as it indicates the layout of the
  // values in the tensor in-memory representation.
  //
  // The first entry in 'dims' is the outermost dimension used to lay out the
  // values; the last entry is the innermost dimension. This matches the
  // in-memory layout of row-major matrices.
  repeated int32 dims = 1;
}

// Row-major matrix.
// Requires: data.size() = product(shape.dims()).
message MatrixFloat {
  repeated float data = 1 [packed = true];
  optional MatrixShape shape = 2;
}

// Row-major matrix.
// Requires: data.size() = product(shape.dims()).
message MatrixInt32 {
  repeated int32 data = 1 [packed = true];
  optional MatrixShape shape = 2;
}
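
// For example, to view a decoded MatrixFloat as an n-dimensional array, the
// row-major layout described above maps directly onto a reshape. A minimal
// Python sketch (illustrative only; assumes numpy and the generated
// dataset_pb2 bindings):
//   import numpy as np
//   from waymo_open_dataset import dataset_pb2
//
//   def matrix_to_array(m: dataset_pb2.MatrixFloat) -> np.ndarray:
//     # Valid because data.size() = product(shape.dims()), per the contract
//     # above.
//     return np.asarray(m.data, dtype=np.float32).reshape(tuple(m.shape.dims))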

message CameraName {
  enum Name {
    UNKNOWN = 0;
    FRONT = 1;
    FRONT_LEFT = 2;
    FRONT_RIGHT = 3;
    SIDE_LEFT = 4;
    SIDE_RIGHT = 5;
  }
}

// 'Laser' is used interchangeably with 'Lidar' in this file.
message LaserName {
  enum Name {
    UNKNOWN = 0;
    TOP = 1;
    FRONT = 2;
    SIDE_LEFT = 3;
    SIDE_RIGHT = 4;
    REAR = 5;
  }
}

// 4x4 row-major transform matrix that transforms 3d points from one frame to
// another.
message Transform {
  repeated double transform = 1;
}
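
// Since 'transform' holds the 16 entries of a 4x4 row-major matrix, applying
// it to a 3d point is a single product in homogeneous coordinates. An
// illustrative Python sketch (assumes numpy; not part of this schema):
//   import numpy as np
//
//   def apply_transform(t, point_xyz):
//     m = np.asarray(t.transform).reshape(4, 4)
//     x, y, z, _ = m @ np.append(point_xyz, 1.0)
//     return np.array([x, y, z])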

message Velocity {
  // Velocity in m/s.
  optional float v_x = 1;
  optional float v_y = 2;
  optional float v_z = 3;

  // Angular velocity in rad/s.
  optional double w_x = 4;
  optional double w_y = 5;
  optional double w_z = 6;
}

message CameraCalibration {
  optional CameraName.Name name = 1;

  // 1d Array of [f_u, f_v, c_u, c_v, k{1, 2}, p{1, 2}, k{3}].
  // Note that this intrinsic corresponds to the images after scaling.
  // Camera model: pinhole camera.
  // Lens distortion:
  //   Radial distortion coefficients: k1, k2, k3.
  //   Tangential distortion coefficients: p1, p2.
  // k_{1, 2, 3} and p_{1, 2} follow the same definition as OpenCV.
  // https://en.wikipedia.org/wiki/Distortion_(optics)
  // https://docs.opencv.org/2.4/doc/tutorials/calib3d/camera_calibration/camera_calibration.html
  repeated double intrinsic = 2;

  // Camera frame to vehicle frame.
  optional Transform extrinsic = 3;

  // Camera image size.
  optional int32 width = 4;
  optional int32 height = 5;

  enum RollingShutterReadOutDirection {
    UNKNOWN = 0;
    TOP_TO_BOTTOM = 1;
    LEFT_TO_RIGHT = 2;
    BOTTOM_TO_TOP = 3;
    RIGHT_TO_LEFT = 4;
    GLOBAL_SHUTTER = 5;
  }
  optional RollingShutterReadOutDirection rolling_shutter_direction = 6;
}
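
// The intrinsic vector above can be rearranged into the usual OpenCV camera
// matrix and distortion coefficients. An illustrative Python sketch (assumes
// numpy; the [k1, k2, p1, p2, k3] ordering is OpenCV's convention, taken from
// the docs linked above):
//   import numpy as np
//
//   def opencv_params(calibration):
//     f_u, f_v, c_u, c_v, k1, k2, p1, p2, k3 = calibration.intrinsic
//     camera_matrix = np.array([[f_u, 0.0, c_u],
//                               [0.0, f_v, c_v],
//                               [0.0, 0.0, 1.0]])
//     distortion = np.array([k1, k2, p1, p2, k3])
//     return camera_matrix, distortion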

message LaserCalibration {
  optional LaserName.Name name = 1;

  // If non-empty, the beam pitch (in radians) is non-uniform. When
  // constructing a range image, this mapping is used to map from beam pitch
  // to range image row. If this is empty, we assume a uniform distribution.
  repeated double beam_inclinations = 2;

  // beam_inclination_{min,max} (in radians) are used to determine the mapping
  // when 'beam_inclinations' is empty.
  optional double beam_inclination_min = 3;
  optional double beam_inclination_max = 4;

  // Lidar frame to vehicle frame.
  optional Transform extrinsic = 5;
}
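
// When 'beam_inclinations' is empty, a uniform spacing between
// beam_inclination_min and beam_inclination_max can be assumed, as the
// comments above describe. A minimal Python sketch (assumes numpy and a known
// range image height; illustrative only):
//   import numpy as np
//
//   def get_beam_inclinations(calibration, height):
//     if calibration.beam_inclinations:
//       return np.asarray(calibration.beam_inclinations)
//     return np.linspace(calibration.beam_inclination_min,
//                        calibration.beam_inclination_max, height)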

message Context {
  // A unique name that identifies the frame sequence.
  optional string name = 1;

  repeated CameraCalibration camera_calibrations = 2;
  repeated LaserCalibration laser_calibrations = 3;

  // Some stats for the run segment used.
  message Stats {
    message ObjectCount {
      optional Label.Type type = 1;
      // The number of unique objects with the type in the segment.
      optional int32 count = 2;
    }
    repeated ObjectCount laser_object_counts = 1;
    repeated ObjectCount camera_object_counts = 5;

    // Day, Dawn/Dusk, or Night, determined from sun elevation.
    optional string time_of_day = 2;

    // Human readable location (e.g. CHD, SF) of the run segment.
    optional string location = 3;

    // Currently either Sunny or Rain.
    optional string weather = 4;
  }
  optional Stats stats = 4;
}

// Range image is a 2d tensor. The first dim (row) represents pitch. The
// second dim represents yaw.
// There are two types of range images:
// 1. Raw range image: a range image with a non-empty
//    'range_image_pose_compressed', which tells the vehicle pose of each
//    range image cell.
// 2. Virtual range image: a range image with an empty
//    'range_image_pose_compressed'. This range image is constructed by
//    transforming all lidar points into a fixed vehicle frame (usually the
//    vehicle frame of the middle scan).
// NOTE: 'range_image_pose_compressed' is only populated for the first range
// image return. The second return has exactly the same range image pose as
// the first one.
message RangeImage {
  // Zlib compressed [H, W, 4] serialized version of MatrixFloat.
  // To decompress:
  //   string val = ZlibDecompress(range_image_compressed);
  //   MatrixFloat range_image;
  //   range_image.ParseFromString(val);
  // Inner dimensions are:
  //   * channel 0: range
  //   * channel 1: intensity
  //   * channel 2: elongation
  //   * channel 3: is in any no label zone.
  optional bytes range_image_compressed = 2;

  // Lidar point to camera image projections. A point can be projected to
  // multiple camera images. We pick the first two in the following order:
  // [FRONT, FRONT_LEFT, FRONT_RIGHT, SIDE_LEFT, SIDE_RIGHT].
  //
  // Zlib compressed [H, W, 6] serialized version of MatrixInt32.
  // To decompress:
  //   string val = ZlibDecompress(camera_projection_compressed);
  //   MatrixInt32 camera_projection;
  //   camera_projection.ParseFromString(val);
  // Inner dimensions are:
  //   * channel 0: CameraName.Name of 1st projection. Set to UNKNOWN if no
  //     projection.
  //   * channel 1: x (axis along image width)
  //   * channel 2: y (axis along image height)
  //   * channel 3: CameraName.Name of 2nd projection. Set to UNKNOWN if no
  //     projection.
  //   * channel 4: x (axis along image width)
  //   * channel 5: y (axis along image height)
  // Note: pixel 0 corresponds to the left edge of the first pixel in the
  // image.
  optional bytes camera_projection_compressed = 3;

  // Zlib compressed [H, W, 6] serialized version of MatrixFloat.
  // To decompress:
  //   string val = ZlibDecompress(range_image_pose_compressed);
  //   MatrixFloat range_image_pose;
  //   range_image_pose.ParseFromString(val);
  // Inner dimensions are [roll, pitch, yaw, x, y, z], representing a
  // transform from vehicle frame to global frame for every range image pixel.
  // This is ONLY populated for the first return. The second return is assumed
  // to have exactly the same range_image_pose_compressed.
  //
  // The roll, pitch and yaw are specified as 3-2-1 Euler angle rotations,
  // meaning that rotating from the navigation to vehicle frame consists of a
  // yaw, then pitch and finally roll rotation about the z, y and x axes
  // respectively. All rotations use the right hand rule and are positive
  // in the counter clockwise direction.
  optional bytes range_image_pose_compressed = 4;

  // Zlib compressed [H, W, 4] serialized version of MatrixFloat.
  // To decompress:
  //   string val = ZlibDecompress(range_image_flow_compressed);
  //   MatrixFloat range_image_flow;
  //   range_image_flow.ParseFromString(val);
  // Inner dimensions are [vx, vy, vz, pointwise class].
  //
  // If the point is not annotated with scene flow information, class is set
  // to -1. A point is not annotated if it is in a no-label zone or if its
  // label bounding box does not have a corresponding match in the previous
  // frame, making it infeasible to estimate the motion of the point.
  // Otherwise, (vx, vy, vz) is the velocity along the (x, y, z) axes for this
  // point, and class is set to one of the following values:
  //   -1: no-flow-label, the point has no flow information.
  //    0: unlabeled or "background", i.e., the point is not contained in a
  //       bounding box.
  //    1: vehicle, i.e., the point corresponds to a vehicle label box.
  //    2: pedestrian, i.e., the point corresponds to a pedestrian label box.
  //    3: sign, i.e., the point corresponds to a sign label box.
  //    4: cyclist, i.e., the point corresponds to a cyclist label box.
  optional bytes range_image_flow_compressed = 5;

  // Zlib compressed [H, W, 2] serialized version of MatrixInt32.
  // To decompress:
  //   string val = ZlibDecompress(segmentation_label_compressed);
  //   MatrixInt32 segmentation_label;
  //   segmentation_label.ParseFromString(val);
  // Inner dimensions are [instance_id, semantic_class].
  //
  // NOTE:
  // 1. Only the TOP LiDAR has segmentation labels.
  // 2. Not every frame has segmentation labels. This field is not set if a
  //    frame is not labeled.
  // 3. There can be points missing segmentation labels within a labeled
  //    frame. Their labels are set to TYPE_NOT_LABELED when that happens.
  optional bytes segmentation_label_compressed = 6;

  // Deprecated, do not use.
  optional MatrixFloat range_image = 1 [deprecated = true];
}
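
// The 'To decompress' pseudocode above uses a hypothetical ZlibDecompress
// helper. In Python the same decoding can be sketched with the standard zlib
// module and the generated dataset_pb2 bindings (illustrative only):
//   import zlib
//   import numpy as np
//   from waymo_open_dataset import dataset_pb2
//
//   def decode_range_image(ri: dataset_pb2.RangeImage) -> np.ndarray:
//     matrix = dataset_pb2.MatrixFloat()
//     matrix.ParseFromString(zlib.decompress(ri.range_image_compressed))
//     # [H, W, 4]: range, intensity, elongation, no-label-zone flag.
//     return np.asarray(matrix.data, dtype=np.float32).reshape(
//         tuple(matrix.shape.dims))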

// Panoptic (instance + semantic) segmentation labels for a given camera
// image. Associations can also be provided between each instance ID and a
// globally unique ID across all frames.
message CameraSegmentationLabel {
  // The value used to separate instance_ids from different semantic classes.
  // See the panoptic_label field for how this is used. Must be set to be
  // greater than the maximum instance_id.
  optional int32 panoptic_label_divisor = 1;

  // A uint16 png encoded image, with the same resolution as the corresponding
  // camera image. Each pixel contains a panoptic segmentation label, which is
  // computed as:
  //   semantic_class_id * panoptic_label_divisor + instance_id.
  // We set instance_id = 0 for pixels for which there is no instance_id.
  // NOTE: Instance IDs in this label are only consistent within this camera
  // image. Use instance_id_to_global_id_mapping to get cross-camera
  // consistent instance IDs.
  optional bytes panoptic_label = 2;

  // A mapping from each panoptic label with an instance_id to a globally
  // unique ID across all frames within the same sequence. This can be used to
  // match instances across cameras and over time, i.e., instances belonging
  // to the same object will map to the same global ID across all frames in
  // the same sequence.
  // NOTE: These unique IDs are not consistent with other IDs in the dataset,
  // e.g. the bounding box IDs.
  message InstanceIDToGlobalIDMapping {
    optional int32 local_instance_id = 1;
    optional int32 global_instance_id = 2;
    // If false, the corresponding instance will not have consistent global
    // ids between frames.
    optional bool is_tracked = 3;
  }
  repeated InstanceIDToGlobalIDMapping instance_id_to_global_id_mapping = 3;

  // The sequence id for this label. The above instance_id_to_global_id_mapping
  // is only valid together with other labels that share the same sequence id.
  optional string sequence_id = 4;
}
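
// Decoding panoptic_label back into semantic and instance IDs inverts the
// formula above. An illustrative Python sketch (numpy and Pillow for the
// uint16 png are assumptions, not part of this schema):
//   import io
//   import numpy as np
//   from PIL import Image
//
//   def decode_panoptic(label):
//     panoptic = np.asarray(Image.open(io.BytesIO(label.panoptic_label)))
//     semantic_class_id = panoptic // label.panoptic_label_divisor
//     instance_id = panoptic % label.panoptic_label_divisor
//     return semantic_class_id, instance_id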

// All timestamps in this proto are represented as seconds since Unix epoch.
message CameraImage {
  optional CameraName.Name name = 1;

  // JPEG image.
  optional bytes image = 2;

  // SDC pose.
  optional Transform pose = 3;

  // SDC velocity at 'pose_timestamp' below. The velocity value is represented
  // in the *global* frame.
  // With this velocity, the pose can be extrapolated:
  //   r(t+dt) = r(t) + dr/dt * dt, where dr/dt = v_{x,y,z}.
  //   dR(t)/dt = W*R(t), where W = SkewSymmetric(w_{x,y,z}).
  // This differential equation solves to R(t) = exp(Wt)*R(0) if W is
  // constant. When dt is small: R(t+dt) = (I + W*dt)R(t).
  //
  // r(t) = (x(t), y(t), z(t)) is the vehicle location at t in the global
  // frame.
  // R(t) is the rotation matrix (3x3) from the body frame to the global frame
  // at t.
  // SkewSymmetric(x,y,z) is defined as the cross-product matrix here:
  // https://en.wikipedia.org/wiki/Cross_product#Conversion_to_matrix_multiplication
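  //
  // A minimal numpy sketch of the small-dt extrapolation above (illustrative
  // only, not part of this schema):
  //   import numpy as np
  //
  //   def extrapolate_pose(pose, velocity, dt):
  //     m = np.asarray(pose.transform).reshape(4, 4).copy()
  //     w_x, w_y, w_z = velocity.w_x, velocity.w_y, velocity.w_z
  //     # W = SkewSymmetric(w_{x,y,z}), the cross-product matrix.
  //     w = np.array([[0.0, -w_z, w_y],
  //                   [w_z, 0.0, -w_x],
  //                   [-w_y, w_x, 0.0]])
  //     m[:3, :3] = (np.eye(3) + w * dt) @ m[:3, :3]  # R(t+dt)=(I+W*dt)R(t)
  //     m[:3, 3] += np.array([velocity.v_x, velocity.v_y, velocity.v_z]) * dt
  //     return m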
  optional Velocity velocity = 4;

  // Timestamp of the `pose` above.
  optional double pose_timestamp = 5;

  // Rolling shutter params.
  // The following explanation assumes a left->right rolling shutter.
  //
  // Rolling shutter cameras expose and read the image column by column,
  // offset by the read out time for each column. The desired timestamp for
  // each column is the middle of the exposure of that column, as outlined
  // below for an image with 3 columns:
  //   ------time------>
  //   |---- exposure col 1----| read |
  //   -------|---- exposure col 2----| read |
  //   --------------|---- exposure col 3----| read |
  //   ^trigger time                                 ^readout end time
  //               ^time for col 1 (= middle of exposure of col 1)
  //                      ^time image center (= middle of exposure of middle col)
  //
  // Shutter duration in seconds. Exposure time per column.
  optional double shutter = 6;

  // Time when the sensor was triggered and when the last readout finished.
  // The difference between trigger time and readout done time includes
  // the exposure time and the actual sensor readout time.
  optional double camera_trigger_time = 7;
  optional double camera_readout_done_time = 8;
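
  // Given the diagram above, a per-column timestamp can be approximated from
  // these fields. A hedged Python sketch (the evenly-spaced read-out model is
  // an assumption, not something this schema guarantees):
  //   def column_mid_exposure_time(image, col, width):
  //     # Read-out time per column: total duration minus one exposure,
  //     # spread uniformly over all columns.
  //     read_per_col = (image.camera_readout_done_time
  //                     - image.camera_trigger_time - image.shutter) / width
  //     return (image.camera_trigger_time + col * read_per_col
  //             + image.shutter / 2.0)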

  // Panoptic segmentation labels for this camera image.
  // NOTE: Not every image has panoptic segmentation labels.
  optional CameraSegmentationLabel camera_segmentation_label = 10;
}

// The camera labels associated with a given camera image. This message
// indicates the ground truth information for the camera image
// recorded by the given camera. If there are no labeled objects in the image,
// then the labels field is empty.
message CameraLabels {
  optional CameraName.Name name = 1;
  repeated Label labels = 2;
}

message Laser {
  optional LaserName.Name name = 1;
  optional RangeImage ri_return1 = 2;
  optional RangeImage ri_return2 = 3;
}

message Frame {
  // The following field numbers are reserved for third-party extensions.
  // Users may declare new fields in that range in their own .proto files
  // without having to edit the original file.
  extensions 1000 to max;

  // This context is the same for all frames belonging to the same driving run
  // segment. Use context.name to identify frames belonging to the same
  // driving segment. We do not store all frames from one driving segment in
  // one proto to avoid huge protos.
  optional Context context = 1;

  // Frame start time, which is the timestamp of the first top LiDAR scan
  // within this frame. Note that this timestamp does not correspond to the
  // provided vehicle pose (pose).
  optional int64 timestamp_micros = 2;

  // Frame vehicle pose. Note that unlike in CameraImage, the Frame pose does
  // not correspond to the provided timestamp (timestamp_micros). Instead, it
  // roughly (but not exactly) corresponds to the vehicle pose in the middle
  // of the given frame. The frame vehicle pose defines the coordinate system
  // in which the 3D laser labels are defined.
  optional Transform pose = 3;

  // The camera images.
  repeated CameraImage images = 4;

  // The LiDAR sensor data.
  repeated Laser lasers = 5;

  // Native 3D labels that correspond to the LiDAR sensor data. The 3D labels
  // are defined w.r.t. the frame vehicle pose coordinate system (pose).
  repeated Label laser_labels = 6;

  // The native 3D LiDAR labels (laser_labels) projected to camera images. A
  // projected label is the smallest image-axis-aligned rectangle that can
  // cover all projected points from the 3d LiDAR label. The projected label
  // is ignored if the projection is fully outside a camera image. The
  // projected label is clamped to the camera image if it is partially
  // outside.
  repeated CameraLabels projected_lidar_labels = 9;

  // Native 2D camera labels. Note that if a camera identified by
  // CameraLabels.name has an entry in this field, then it has been labeled,
  // even though it is possible that there are no labeled objects in the
  // corresponding image, which is identified by a zero-sized
  // CameraLabels.labels.
  repeated CameraLabels camera_labels = 8;

  // No label zones in the *global* frame.
  repeated Polygon2dProto no_label_zones = 7;
}
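
// Frames are typically stored one-per-record in TFRecord files. A common way
// to iterate them in Python (a sketch assuming TensorFlow and the generated
// dataset_pb2 bindings; 'segment.tfrecord' is a hypothetical filename):
//   import tensorflow as tf
//   from waymo_open_dataset import dataset_pb2
//
//   for record in tf.data.TFRecordDataset('segment.tfrecord'):
//     frame = dataset_pb2.Frame()
//     frame.ParseFromString(bytes(record.numpy()))
//     print(frame.context.name, frame.timestamp_micros)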