Skip to content

Commit

Permalink
Add benchmark data for elastic AllReduce (sql-machine-learning#2306)
Browse files Browse the repository at this point in the history
* Add benchmark data for elastic AllReduce

* Add benchmark data for elastic AllReduce

* Add experiment result using 2 gpus to train Resnet20

* Pre-commit

* Set range of y-axis
  • Loading branch information
workingloong authored Sep 25, 2020
1 parent 2f13115 commit eee2aab
Show file tree
Hide file tree
Showing 6 changed files with 245 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .isort.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[settings]
multi_line_output=3
line_length=79
known_third_party = PIL,deepctr,docker,google,grpc,horovod,jinja2,kubernetes,numpy,odps,pandas,recordio,requests,setuptools,sklearn,tensorflow,yaml
known_third_party = PIL,deepctr,docker,google,grpc,horovod,jinja2,kubernetes,matplotlib,numpy,odps,pandas,recordio,requests,setuptools,sklearn,tensorflow,yaml
include_trailing_comma=True
100 changes: 100 additions & 0 deletions docs/benchmark/allreduce/data/cifar10_resnet20_acc.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
epoch_index,baseline_4workers_0,baseline_4workers_1,baseline_2workers_0,baseline_2workers_1,elastic_2_4workers_0,elastic_2_4workers_1
1,0.471,0.4483,0.5925,0.5335,0.4977,0.2978
2,0.6064,0.5948,0.5806,0.5032,0.6109,0.5771
3,0.6727,0.5889,0.6927,0.7207,0.6388,0.6354
4,0.704,0.6604,0.7312,0.6148,0.6086,0.6521
5,0.656,0.4814,0.7713,0.668,0.6903,0.7379
6,0.7523,0.7257,0.7538,0.708,0.7321,0.7298
7,0.7106,0.6408,0.7106,0.7436,0.7614,0.7124
8,0.7697,0.7177,0.7285,0.7581,0.7613,0.5969
9,0.7459,0.6214,0.7787,0.7616,0.7818,0.7701
10,0.7174,0.7652,0.7795,0.7836,0.7321,0.7367
11,0.7054,0.6265,0.7431,0.7961,0.743,0.7602
12,0.7953,0.7027,0.7797,0.7649,0.7822,0.7681
13,0.7655,0.7013,0.7486,0.745,0.7589,0.7242
14,0.7566,0.7227,0.7503,0.7567,0.7271,0.7133
15,0.7749,0.7205,0.7605,0.8076,0.7021,0.7831
16,0.7889,0.7771,0.7771,0.8203,0.7811,0.7008
17,0.7423,0.7768,0.7535,0.8253,0.7868,0.7702
18,0.7582,0.725,0.8015,0.807,0.7322,0.7451
19,0.7578,0.8168,0.7949,0.7622,0.7886,0.751
20,0.7396,0.7437,0.7714,0.8412,0.7723,0.7957
21,0.7955,0.8197,0.8006,0.7524,0.8028,0.7499
22,0.7955,0.7417,0.773,0.8137,0.6793,0.729
23,0.7376,0.7943,0.8122,0.7876,0.7694,0.7571
24,0.795,0.7528,0.7824,0.7728,0.7936,0.7303
25,0.749,0.8237,0.7737,0.8004,0.6773,0.8187
26,0.8371,0.8221,0.8176,0.8052,0.8457,0.8588
27,0.7351,0.8384,0.8341,0.788,0.8285,0.8485
28,0.7979,0.8134,0.8371,0.775,0.8236,0.8258
29,0.7158,0.8255,0.8034,0.8287,0.8095,0.8331
30,0.701,0.82,0.7333,0.797,0.7948,0.8204
31,0.7896,0.8348,0.8283,0.8009,0.8401,0.852
32,0.816,0.8418,0.7545,0.7684,0.849,0.8387
33,0.7836,0.8447,0.8178,0.8153,0.8254,0.8368
34,0.7838,0.8563,0.7045,0.8009,0.8389,0.8115
35,0.8208,0.8287,0.8646,0.8042,0.8218,0.8243
36,0.8472,0.7953,0.8658,0.8617,0.8287,0.8011
37,0.8495,0.8438,0.8711,0.8402,0.8385,0.8522
38,0.8015,0.8205,0.8631,0.8564,0.8075,0.8399
39,0.8428,0.7906,0.8578,0.8696,0.8409,0.8521
40,0.8595,0.8381,0.8667,0.8659,0.822,0.8415
41,0.8732,0.8724,0.8774,0.8718,0.832,0.8284
42,0.8805,0.86,0.8771,0.877,0.8215,0.8549
43,0.875,0.8709,0.8729,0.8817,0.8459,0.844
44,0.8755,0.8698,0.8797,0.8825,0.8283,0.8518
45,0.8756,0.8635,0.8804,0.8725,0.8343,0.8804
46,0.8607,0.8504,0.8705,0.8731,0.8508,0.8776
47,0.8656,0.86,0.8698,0.8673,0.829,0.8762
48,0.8774,0.8585,0.8691,0.8672,0.8453,0.8796
49,0.8749,0.8494,0.8784,0.8661,0.8357,0.878
50,0.8635,0.8618,0.8764,0.8684,0.8497,0.8838
51,0.8716,0.8524,0.8768,0.8649,0.882,0.8781
52,0.876,0.8551,0.8715,0.8693,0.8815,0.8739
53,0.8735,0.8587,0.8656,0.873,0.8797,0.8713
54,0.8724,0.8582,0.8741,0.8723,0.888,0.8762
55,0.8732,0.8678,0.8669,0.8674,0.8779,0.8779
56,0.8614,0.8577,0.8683,0.8694,0.8766,0.8694
57,0.8681,0.8618,0.8726,0.8695,0.8777,0.8711
58,0.8686,0.8699,0.8731,0.8674,0.8786,0.8822
59,0.8711,0.859,0.87,0.8636,0.8812,0.8796
60,0.8749,0.8751,0.8784,0.8671,0.8733,0.8799
61,0.8849,0.8787,0.8844,0.8624,0.8715,0.8813
62,0.8827,0.8791,0.88,0.8777,0.8705,0.8707
63,0.8766,0.8832,0.8786,0.8804,0.8712,0.8747
64,0.8783,0.8798,0.8796,0.8807,0.8876,0.8769
65,0.8769,0.865,0.8785,0.8788,0.8867,0.875
66,0.8808,0.8717,0.8724,0.8741,0.8838,0.8752
67,0.8814,0.8784,0.8804,0.8781,0.8854,0.8759
68,0.8777,0.8718,0.8788,0.877,0.8822,0.8777
69,0.8799,0.8728,0.8741,0.8747,0.8841,0.8762
70,0.8795,0.8764,0.8806,0.8765,0.8783,0.8696
71,0.8823,0.8742,0.8758,0.8661,0.8739,0.8745
72,0.8777,0.8704,0.8769,0.8715,0.8803,0.8754
73,0.8776,0.8734,0.8782,0.8707,0.8775,0.8769
74,0.8815,0.8703,0.8799,0.8772,0.8768,0.8736
75,0.878,0.8714,0.8756,0.8752,0.8787,0.8751
76,0.8778,0.8737,0.876,0.869,0.8707,0.8718
77,0.8765,0.8706,0.8777,0.8798,0.8815,0.8727
78,0.8778,0.8532,0.8744,0.8777,0.8735,0.8646
79,0.8814,0.8693,0.8721,0.8758,0.8764,0.8722
80,0.8832,0.878,0.8832,0.8742,0.8817,0.8737
81,0.8902,0.8807,0.8853,0.8733,0.8705,0.8699
82,0.8915,0.8816,0.8864,0.8774,0.8809,0.8836
83,0.8903,0.8805,0.8862,0.881,0.8757,0.8857
84,0.8906,0.8814,0.8861,0.8822,0.8727,0.8849
85,0.8908,0.8821,0.8869,0.8828,0.8866,0.8867
86,0.8904,0.8818,0.8867,0.8813,0.8884,0.886
87,0.8924,0.8833,0.8856,0.8822,0.8885,0.887
88,0.8903,0.8839,0.8854,0.8815,0.889,0.8872
89,0.8911,0.8836,0.8855,0.88,0.889,0.8876
90,0.8905,0.8834,0.8858,0.8805,0.8897,0.8866
91,0.8898,0.885,0.8854,0.8809,0.8895,0.8865
92,0.8911,0.8859,0.886,0.88,0.8894,0.8864
93,0.8909,0.8839,0.8858,0.8813,0.8904,0.888
94,0.8903,0.8847,0.8861,0.8829,0.8908,0.8878
95,0.8908,0.883,0.8854,0.8818,0.8898,0.8875
96,0.8906,0.8847,0.8856,0.8835,0.8901,0.8866
97,0.8897,0.8848,0.8867,0.8836,0.8894,0.8877
98,0.8905,0.8827,0.887,0.8825,0.8885,0.887
99,0.891,0.8853,0.8852,0.8844,0.8884,0.8893
30 changes: 30 additions & 0 deletions docs/benchmark/allreduce/data/elastic_allreduce_gpu.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
time,gpu
0,0
1,0
1,2
4,2
4,3
8,3
8,4
143,4
143,5
152,5
152,6
474,6
474,5
484,5
484,4
485,4
485,3
492,3
492,4
496,4
496,3
506,3
506,4
773,4
773,2
778,2
778,1
809,1
809,0
18 changes: 18 additions & 0 deletions docs/benchmark/allreduce/data/gang_allreduce_gpu.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
time,gpu
0,0
1,0
1,1
13,1
13,3
16,3
16,4
465,4
465,0
518,0
518,1
521,1
521,3
522,3
522,4
968,4
968,0
48 changes: 48 additions & 0 deletions docs/benchmark/allreduce/scripts/acc_plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Copyright 2020 The ElasticDL Authors. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pandas as pd
from matplotlib import pyplot as plt

if __name__ == "__main__":
    # Imported locally so this script body stays self-contained.
    from pathlib import Path

    # Locate the CSV relative to this script rather than the current working
    # directory, so the plot can be generated from any directory.
    data_path = (
        Path(__file__).resolve().parent.parent
        / "data"
        / "cifar10_resnet20_acc.csv"
    )
    df = pd.read_csv(data_path)

    plt.figure(figsize=(10, 8))

    # One entry per training configuration: (CSV column prefix, line style).
    # Every configuration has two recorded runs, stored in the columns
    # "<prefix>_0" and "<prefix>_1"; both runs share the same line style.
    series_styles = (
        ("baseline_4workers", "--"),
        ("baseline_2workers", "-."),
        ("elastic_2_4workers", "-"),
    )
    epochs = df["epoch_index"]
    for config, line_style in series_styles:
        for run in (0, 1):
            column = "{}_{}".format(config, run)
            # Label each curve with its full column name so the two runs of
            # a configuration get distinct legend entries.
            plt.plot(epochs, df[column], line_style, label=column)

    plt.title("The Accuracy of Resnet20 on cifar10 test dataset")
    plt.xlabel(
        xlabel="Iteration epochs", fontsize=18, fontfamily="Times New Roman"
    )
    plt.ylabel(ylabel="Accuracy", fontsize=18, fontfamily="Times New Roman")
    plt.legend(loc="upper left", bbox_to_anchor=(0.02, 0.98))
    # Fixed y-range for the accuracy axis.
    plt.ylim((0.3, 1))
    plt.show()
48 changes: 48 additions & 0 deletions docs/benchmark/allreduce/scripts/gpu_uilization_plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Copyright 2020 The ElasticDL Authors. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path

import pandas as pd
from matplotlib import pyplot as plt

# Plot the number of utilized GPUs over time for gang scheduling (top panel)
# and elastic scheduling (bottom panel), using identical axes for both.

# Locate the CSV files relative to this script rather than the current
# working directory, so the plot can be generated from any directory.
data_dir = Path(__file__).resolve().parent.parent / "data"
gang_gpu_df = pd.read_csv(data_dir / "gang_allreduce_gpu.csv")
elastic_gpu_df = pd.read_csv(data_dir / "elastic_allreduce_gpu.csv")

fig = plt.figure(figsize=(13, 6))

# (subplot position, data frame, panel title) for each of the two panels.
panels = (
    (211, gang_gpu_df, "Gang scheduling -- two jobs one after another"),
    (
        212,
        elastic_gpu_df,
        "Elastic scheduling -- two jobs overlap and fully use the cluster",
    ),
)
for position, gpu_df, title in panels:
    ax = fig.add_subplot(position)
    ax.plot(gpu_df["time"], gpu_df["gpu"])
    ax.set_title(title, fontsize=14, fontfamily="Times New Roman")
    ax.set_xlabel("time(sec)", fontsize=14, fontfamily="Times New Roman")
    # Same y-axis label on both panels.
    ax.set_ylabel("Utilized GPUs", fontsize=14, fontfamily="Times New Roman")
    # Same axis limits on both panels.
    ax.set_xlim(0, 1000)
    ax.set_ylim(0, 7)

# Extra vertical spacing between the two stacked panels.
plt.subplots_adjust(hspace=0.5)
plt.show()

0 comments on commit eee2aab

Please sign in to comment.