@inproceedings{FTI,
author = {Bautista-Gomez, Leonardo and others},
title = {{FTI: High Performance Fault Tolerance Interface for Hybrid Systems}},
booktitle = {Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis},
series = {SC '11},
year = {2011},
isbn = {978-1-4503-0771-0},
location = {Seattle, Washington},
pages = {32:1--32:32},
articleno = {32},
numpages = {32},
url = {http://doi.acm.org/10.1145/2063384.2063427},
doi = {10.1145/2063384.2063427},
acmid = {2063427},
publisher = {ACM},
address = {New York, NY, USA},
}
@inproceedings{2009_google_memory,
title = {DRAM Errors in the Wild: A Large-Scale Field Study},
author = {Bianca Schroeder and Eduardo Pinheiro and Wolf-Dietrich Weber},
year = 2009,
booktitle = {Proceedings of the Eleventh International Joint Conference on Measurement and Modeling of Computer Systems (SIGMETRICS '09)}
}
@book{2013_mh,
title={Exploring Memory Hierarchy Design with Emerging Memory Technologies},
author={Sun, G.},
isbn={9783319006819},
series={Lecture Notes in Electrical Engineering},
url={https://books.google.es/books?id=DaHjAAAAQBAJ},
year={2013},
publisher={Springer}
}
@inproceedings{Teranishi:2014,
author = {Teranishi, Keita and Heroux, Michael A.},
title = {{Toward Local Failure Local Recovery Resilience Model Using {MPI-ULFM}}},
booktitle = {Proceedings of the 21st European {MPI} Users' Group Meeting},
series = {EuroMPI/ASIA '14},
year = {2014},
isbn = {978-1-4503-2875-3},
location = {Kyoto, Japan},
pages = {51:51--51:56},
articleno = {51},
numpages = {6},
url = {http://doi.acm.org/10.1145/2642769.2642774},
doi = {10.1145/2642769.2642774},
acmid = {2642774},
publisher = {ACM},
address = {New York, NY, USA},
keywords = {Fault Tolerance, MPI, PDE solvers, Scientific Computing, User Level Fault Mitigation},
}
@inproceedings{Sato:2014,
author = {Sato, Kento and others},
title = {FMI: Fault Tolerant Messaging Interface for Fast and Transparent Recovery},
booktitle = {Proceedings of the 2014 IEEE 28th International Parallel and Distributed Processing Symposium},
series = {IPDPS '14},
year = {2014},
isbn = {978-1-4799-3800-1},
pages = {1225--1234},
numpages = {10},
url = {http://dx.doi.org/10.1109/IPDPS.2014.126},
doi = {10.1109/IPDPS.2014.126},
acmid = {2650537},
publisher = {IEEE Computer Society},
address = {Washington, DC, USA},
keywords = {Fault tolerance, MPI, Checkpoint/Restart},
}
@inproceedings{Gamell:2014,
author = {Gamell, Marc and others},
title = {{Exploring Automatic, Online Failure Recovery for Scientific Applications at Extreme Scales}},
booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
series = {SC '14},
year = {2014},
location = {New Orleans, LA}
}
@inproceedings{Dinan:2011,
author = {Dinan, James and others},
booktitle = {EuroMPI},
crossref = {conf/pvm/2011},
editor = {Cotronis, Yiannis and Danalis, Anthony and Nikolopoulos, Dimitrios S. and Dongarra, Jack},
ee = {http://dx.doi.org/10.1007/978-3-642-24449-0_32},
isbn = {978-3-642-24448-3},
pages = {282-291},
publisher = {Springer},
series = {Lecture Notes in Computer Science},
title = {{Noncollective Communicator Creation in {MPI}}},
volume = 6960,
year = 2011
}
@inproceedings{Jin,
author = {Jin, Tong and Zhang, Fan and Sun, Qian and Bui, Hoang and Parashar, Manish and Yu, Hongfeng and Klasky, Scott and Podhorszki, Norbert and Abbasi, Hasan},
title = {{Using Cross-layer Adaptations for Dynamic Data Management in Large Scale Coupled Scientific Workflows}},
booktitle = {{Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis}},
series = {SC 2013},
year = {2013},
isbn = {978-1-4503-2378-9},
location = {Denver, Colorado},
pages = {74:1--74:12},
articleno = {74},
numpages = {12},
url = {http://doi.acm.org/10.1145/2503210.2503301},
doi = {10.1145/2503210.2503301},
acmid = {2503301},
publisher = {ACM},
address = {New York, NY, USA},
keywords = {coupled simulation workflows, cross-layer adaptation, data management, in-situ/in-transit, staging},
}
@inproceedings{Bennett,
author = {Bennett, Janine C. and Abbasi, Hasan and Bremer, Peer-Timo and Grout, Ray and Gyulassy, Attila and Jin, Tong and Klasky, Scott and Kolla, Hemanth and Parashar, Manish and Pascucci, Valerio and Pebay, Philippe and Thompson, David and Yu, Hongfeng and Zhang, Fan and Chen, Jacqueline},
title = {{Combining In-situ and In-transit Processing to Enable Extreme-scale Scientific Analysis}},
booktitle = {{Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis}},
series = {SC 2012},
year = {2012},
isbn = {978-1-4673-0804-5},
location = {Salt Lake City, Utah},
pages = {49:1--49:9},
articleno = {49},
numpages = {9},
url = {http://dl.acm.org/citation.cfm?id=2388996.2389063},
acmid = {2389063},
publisher = {IEEE Computer Society Press},
address = {Los Alamitos, CA, USA},
}
@inproceedings{pubsub,
title = {A Scalable Messaging System for Accelerating Discovery from Large Scale Scientific Simulations},
booktitle = {Proceedings of the 19th Annual International Conference on High Performance Computing (HiPC 2012)},
year = {2012},
month = {December},
publisher = {IEEE Computer Society Press},
address = {Pune, India},
author = {T. Jin and F. Zhang and M. Parashar and S. Klasky and N. Podhorszki and H. Abbasi}
}
@inproceedings{Zhang,
author = {Zhang, Fan and Lasluisa, Solomon and Jin, Tong and Rodero, Ivan and Bui, Hoang and Parashar, Manish},
title = {{In-situ Feature-Based Objects Tracking for Large-Scale Scientific Simulations}},
booktitle = {{Proceedings of the 2012 SC Companion: High Performance Computing, Networking Storage and Analysis}},
series = {SCC 2012},
year = {2012},
isbn = {978-0-7695-4956-9},
pages = {736--740},
numpages = {5},
url = {http://dx.doi.org/10.1109/SC.Companion.2012.100},
doi = {10.1109/SC.Companion.2012.100},
acmid = {2477107},
publisher = {IEEE Computer Society},
address = {Washington, DC, USA},
keywords = {Scientific data analysis, scalable in-situ data analytics, feature-based object tracking},
}
@inproceedings{2013-33,
Acmid = {2465821},
Address = {New York, NY, USA},
Author = {Lu, Guoming and Zheng, Ziming and Chien, Andrew A.},
Booktitle = {Proceedings of the 3rd Workshop on Fault-tolerance for HPC at Extreme Scale},
Date-Added = {2014-07-23 18:06:20 +0000},
Date-Modified = {2014-07-23 18:08:04 +0000},
Doi = {10.1145/2465813.2465821},
Isbn = {978-1-4503-1983-6},
Keywords = {checkpointing, error recovery, high-performance computing, reliability},
Location = {New York, New York, USA},
Numpages = {8},
Pages = {49--56},
Publisher = {ACM},
Series = {FTXS '13},
Title = {When is Multi-version Checkpointing Needed?},
Url = {http://doi.acm.org/10.1145/2465813.2465821},
Year = {2013},
Bdsk-Url-1 = {http://doi.acm.org/10.1145/2465813.2465821},
Bdsk-Url-2 = {http://dx.doi.org/10.1145/2465813.2465821}}
@inproceedings{2014-5,
Author = {Ziming Zheng and Andrew A. Chien and Keita Teranishi},
Booktitle = {VECPAR 2014},
Date-Added = {2014-07-23 17:54:57 +0000},
Date-Modified = {2014-07-23 18:08:27 +0000},
Month = {June},
Title = {Fault Tolerance in an Inner-Outer Solver: a GVR-enabled Case Study},
Year = {2014}}
@inproceedings{2014-4,
Author = {Kento Sato and A. Moody and K. Mohror and T. Gamblin and B. R. de Supinski and N. Maruyama and S. Matsuoka},
Booktitle = {28th IEEE International Parallel \& Distributed Processing Symposium (IPDPS 2014)},
Date-Added = {2014-07-23 17:54:40 +0000},
Date-Modified = {2014-07-23 18:08:18 +0000},
Month = {May},
Title = {{FMI: Fault Tolerant Messaging Interface for Fast and Transparent Recovery}},
Year = {2014}}
@article{1994-2,
Abstract = {Presents the results of an implementation of several algorithms for checkpointing and restarting parallel programs on shared-memory multiprocessors. The algorithms are compared according to the metrics of overall checkpointing time, overhead imposed by the checkpointer on the target program, and amount of time during which the checkpointer interrupts the target program. The best algorithm measured achieves its efficiency through a variation of copy-on-write, which allows the most time-consuming operations of the checkpoint to be overlapped with the running of the program being checkpointed},
Author = {Li, K. and Naughton, J.F. and Plank, J.S.},
Date-Added = {2014-07-23 17:54:15 +0000},
Date-Modified = {2014-07-23 18:07:52 +0000},
Journal = {IEEE Transactions on Parallel and Distributed Systems},
Keywords = {fault tolerant computing;parallel programming;program diagnostics;software reliability;system recovery;backward error recovery;copy-on-write;efficiency;fault tolerance;interruption time;low latency concurrent checkpointing;metrics;overall checkpointing time;overhead;overlapping operations;parallel programs;program restarting;shared-memory multiprocessors;Benchmark testing;Central Processing Unit;Checkpointing;Computer science;Concurrent computing;Delay;Fault tolerance;Fault tolerant systems;Registers},
Month = {Aug},
Number = {8},
Pages = {874-879},
Title = {Low-latency, concurrent checkpointing for parallel programs},
Volume = {5},
Year = {1994},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/71.298215}}
@article{2007-4,
Author = {G. Bosilca and Z. Chen and J. Dongarra and J. Langou},
Date-Added = {2014-07-23 17:52:33 +0000},
Date-Modified = {2014-07-23 18:07:21 +0000},
Journal = {SIAM Journal on Scientific Computing},
Month = {November},
Number = {1},
Pages = {102-116},
Title = {Recovery patterns for iterative methods in a parallel unstable environment},
Volume = {30},
Year = {2007}}
@inproceedings{2010-10,
Author = {Dinan, J. and Singri, A. and Sadayappan, P. and Krishnamoorthy, S.},
Booktitle = {2010 10th IEEE/ACM International Conference on Cluster, Cloud and Grid Computing (CCGrid)},
Date-Added = {2014-07-23 17:51:31 +0000},
Date-Modified = {2014-07-23 18:07:41 +0000},
Doi = {10.1109/CCGRID.2010.34},
Keywords = {Chemistry;Clouds;Computer science;Electronics packaging;Fault tolerance;Grid computing;Hardware;Kernel;Parallel processing;Parallel programming;Global Arrays;PGAS;Parallel processing;fault tolerance;selective recovery;task parallelism},
Pages = {709-714},
Title = {Selective Recovery from Failures in a Task Parallel Programming Model},
Year = {2010},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/CCGRID.2010.34}}
@inproceedings{2013-34,
Address = {New York, NY, USA},
Author = {Sao, Piyush and Vuduc, Richard},
Booktitle = {Proceedings of the Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems},
Date-Added = {2014-07-23 17:51:09 +0000},
Date-Modified = {2014-07-23 18:08:11 +0000},
Keywords = {fault-tolerance, iterative linear solvers, self-stabilization, transient soft faults},
Location = {Denver, Colorado},
Numpages = {8},
Pages = {4:1--4:8},
Publisher = {ACM},
Series = {ScalA '13},
Title = {Self-stabilizing Iterative Solvers},
Year = {2013},
Bdsk-Url-1 = {http://doi.acm.org/10.1145/2530268.2530272},
Bdsk-Url-2 = {http://dx.doi.org/10.1145/2530268.2530272}}
@inproceedings{2013-22-slides,
Abstract = {Energy consumption and fault tolerance are two interrelated issues
to address for designing future exascale systems. Fault tolerance
protocols used for check pointing have different energy consumption
depending on parameters like application features, number of processes
in the execution and platform characteristics. Currently, the only
way to select a protocol for a given execution is to run the application
and monitor the energy consumption of different fault tolerance protocols.
This is needed for any variation of the execution setting. To avoid
this time and energy consuming process, we propose an energy estimation
framework. It relies on an energy calibration of the considered platform
and a user description of the execution setting. We evaluate the
accuracy of our estimations with real applications running on a real
platform with energy consumption monitoring. Results show that our
estimations are highly accurate and allow selecting the best fault
tolerant protocol without pre-executing the application.},
Author = {Diouri, Mohammed El Mehdi and Gl{\"u}ck, Olivier and Lef{\`e}vre, Laurent and Cappello, Franck},
Booktitle = {{CCGRID}},
Crossref = {conf/ccgrid/2013},
Ee = {http://doi.ieeecomputersociety.org/10.1109/CCGrid.2013.80},
Isbn = {978-1-4673-6465-2},
Pages = {522-529},
Publisher = {IEEE Computer Society},
Series = {CCGRID 2013},
Title = {{ECOFIT: A Framework to Estimate Energy Consumption of Fault Tolerance Protocols for HPC Applications}},
Url = {http://dblp.uni-trier.de/db/conf/ccgrid/ccgrid2013.html#DiouriGLC13},
Year = {2013},
Bdsk-Url-1 = {http://dblp.uni-trier.de/db/conf/ccgrid/ccgrid2013.html#DiouriGLC13}}
@inproceedings{2011-9,
Abstract = {Fault tolerance is becoming a major concern in HPC systems. The two
traditional approaches for message passing applications, coordinated
checkpointing and message logging, have severe scalability issues.
Coordinated checkpointing protocols make all processes roll back
after a failure. Message logging protocols log a huge amount of data
and can induce an overhead on communication performance. Hierarchical
rollback-recovery protocols based on the combination of coordinated
checkpointing and message logging are an alternative. These partial
message logging protocols are based on process clustering: only messages
between clusters are logged to limit the consequence of a failure
to one cluster. These protocols would work efficiently only if one
can find clusters of processes in the applications such that the
ratio of logged messages is very low. We study the communication
patterns of message passing HPC applications to show that partial
message logging is suitable in most cases. We propose a partitioning
algorithm to find suitable clusters of processes given the communication
pattern of an application. Finally, we evaluate the efficiency of
partial message logging using two state of the art protocols on a
set of representative applications.},
Author = {Thomas Ropars and Amina Guermouche and Bora U{\c c}ar and Esteban Meneses and Laxmikant V. Kal{\'e} and Franck Cappello},
Booktitle = {{Euro-Par (1)}},
Crossref = {2011},
Ee = {http://dx.doi.org/10.1007/978-3-642-23400-2_53},
Pages = {567-578},
Series = {Euro-Par 2011},
Title = {{On the Use of Cluster-Based Partial Message Logging to Improve Fault Tolerance for {MPI HPC} Applications}},
Year = {2011}}
@inproceedings{2006-8,
Author = {Subramaniyan, Rajagopal and Aggarwal, Vikas and Jacobs, Adam and George, Alan},
Booktitle = {{International Conference on Embedded Systems and Applications (ESA 2006)}},
Editor = {Arabnia, Hamid R.},
Isbn = {1-60132-017-5},
Pages = {3-9},
Publisher = {CSREA Press},
Series = {ESA 2006},
Title = {{FEMPI: A Lightweight Fault-tolerant MPI for Embedded Cluster Systems}},
Url = {http://dblp.uni-trier.de/db/conf/csreaESA/csreaESA2006.html#SubramaniyanAJG06},
Year = {2006},
Bdsk-Url-1 = {http://dblp.uni-trier.de/db/conf/csreaESA/csreaESA2006.html#SubramaniyanAJG06}}
@inproceedings{2004-5,
Acmid = {1006248},
Address = {New York, NY, USA},
Author = {Agarwal, Saurabh and Garg, Rahul and Gupta, Meeta S. and Moreira, Jose E.},
Booktitle = {{Proceedings of the 18th annual international conference on Supercomputing}},
Doi = {10.1145/1006209.1006248},
Isbn = {1-58113-839-3},
Keywords = {fault-tolerance, incremental checkpoint, large scale systems, probabilistic checkpoint},
Location = {Saint-Malo, France},
Numpages = {10},
Pages = {277--286},
Publisher = {ACM},
Series = {ICS 2004},
Title = {{Adaptive incremental checkpointing for massively parallel systems}},
Url = {http://doi.acm.org/10.1145/1006209.1006248},
Year = {2004},
Bdsk-Url-1 = {http://doi.acm.org/10.1145/1006209.1006248},
Bdsk-Url-2 = {http://dx.doi.org/10.1145/1006209.1006248}}
@inproceedings{2010-9,
Abstract = {The Gemini System Interconnect is a new network for Cray's supercomputer
systems. It provides improved network functionality, latency and
issue rate. Latency is reduced with OS bypass for sends and direct
user completion notification on receives. Atomic memory operations
support the construction of fast synchronization and reduction primitives.},
Author = {Alverson, R. and Roweth, D. and Kaplan, L.},
Booktitle = {{IEEE 18th Annual Symposium on High Performance Interconnects (HOTI)}},
Doi = {10.1109/HOTI.2010.23},
Keywords = {mainframes;multiprocessor interconnection networks;network computers;parallel machines;Crays supercomputer system;Gemini system interconnect;OS bypass;atomic memory operations;direct user completion notification;operating system;Bandwidth;Hardware;Kernel;Payloads;Routing;Synchronization;Tiles},
Month = {Aug},
Pages = {83-87},
Title = {{The Gemini System Interconnect}},
Year = {2010},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/HOTI.2010.23}}
@inproceedings{1999-3,
Abstract = {Communication induced checkpointing (CIC) allows processes in a distributed
computation to take independent checkpoints and to avoid the domino
effect. This paper presents an analysis of CIC protocols based on
a prototype implementation and validated simulations. Our results
indicate that there is sufficient evidence to suspect that much of
the conventional wisdom about these protocols is questionable.},
Author = {Alvisi, L. and Elnozahy, E. and Rao, S. and Husain, S.A. and de Mel, A.},
Booktitle = {{Twenty-Ninth Annual International Symposium on Fault-Tolerant Computing. Digest of Papers}},
Doi = {10.1109/FTCS.1999.781058},
Issn = {0731-3071},
Keywords = {distributed programming, protocols, system recovery, CIC, CIC protocols, distributed computation, independent checkpoints, Analytical models, Checkpointing, Computational modeling, Electrical capacitance tomography, Protocols, Prototypes, Scalability, Virtual prototyping},
Pages = {242-249},
Series = {FTCS 1999 - DSN},
Title = {{An analysis of communication induced checkpointing}},
Year = {1999},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/FTCS.1999.781058}}
@techreport{2009-6,
Author = {Amarasinghe, Saman and Campbell, Dan and Carlson, William and Chien, Andrew and Dally, William and Elnozahy, Elmootazbellah and Hall, Mary and Harrison, Robert and Harrod, William and Hill, Kerry and Hiller, Jon and Karp, Sherman and Koelbel, Charles and Koester, David and Kogge, Peter and Levesque, John and Reed, Daniel and Sarkar, Vivek and Schreiber, Robert and Richards, Mark and Scarpelli, Al and Shalf, John and Snavely, Allan and Sterling, Thomas},
Day = {14},
Institution = {DARPA IPTO, Air Force Research Lab},
Keywords = {exascale, scalability, supercomputer, supercomputing},
Month = sep,
Title = {{ExaScale Software Study: Software Challenges in Extreme Scale Systems}},
Year = {2009}}
@inproceedings{2011-17,
Author = {Saman Amarasinghe and Mary Hall and Richard Lethin and Keshav Pingali and Dan Quinlan and Vivek Sarkar and John Shalf and Robert Lucas and Katherine Yelick and Pavan Balaji and Pedro C. Diniz and Alice Koniges and Marc Snir},
Booktitle = {{Proceedings of the Workshop on Exascale Programming Challenges, Marina del Rey, CA, USA}},
Month = {Jul},
Publisher = {U.S Department of Energy, Office of Science, Office of Advanced Scientific Computing Research (ASCR)},
Title = {{Exascale Programming Challenges}},
Url = {http://science.energy.gov/~/media/ascr/pdf/program-documents/docs/ProgrammingChallengesWorkshopReport.pdf},
Year = {2011},
Bdsk-Url-1 = {http://science.energy.gov/~/media/ascr/pdf/program-documents/docs/ProgrammingChallengesWorkshopReport.pdf}}
@inproceedings{2013-9,
Abstract = {In this paper, we revisit traditional checkpointing and rollback recovery
strategies, with a focus on silent data corruption errors. Contrarily
to fail-stop failures, such latent errors cannot be detected immediately,
and a mechanism to detect them must be provided. We consider two
models: (i) errors are detected after some delays following a probability
distribution (typically, an Exponential distribution); (ii) errors
are detected through some verification mechanism. In both cases,
we compute the optimal period in order to minimize the waste, i.e.,
the fraction of time where nodes do not perform useful computations.
In practice, only a fixed number of checkpoints can be kept in memory,
and the first model may lead to an irrecoverable failure. In this
case, we compute the minimum period required for an acceptable risk.
For the second model, there is no risk of irrecoverable failure,
owing to the verification mechanism, but the corresponding overhead
is included in the waste. Finally, both models are instantiated using
realistic scenarios and application/architecture parameters.},
Address = {Vancouver, Canada},
Affiliation = {Laboratoire de l'Informatique du Parallelisme - LIP , ROMA - ENS Lyon / CNRS / Inria Grenoble Rh{\^o}ne-Alpes , Innovative Computing Laboratory - ICL},
Author = {Aupy, Guillaume and Benoit, Anne and Herault, Thomas and Robert, Yves and Vivien, Frederic and Zaidouni, Dounia},
Booktitle = {{The 19th IEEE Pacific Rim International Symposium on Dependable Computing - 2013}},
Keywords = {High-performance computing, checkpointing, silent data corruption, verification, error recovery},
Month = {Dec},
Pdf = {http://hal.inria.fr/hal-00847620/PDF/resilience2013.pdf},
Publisher = {IEEE},
Series = {PRDC 2013},
Title = {{On the Combination of Silent Error Detection and Checkpointing}},
Url = {http://hal.inria.fr/hal-00847620},
Year = {2013},
Bdsk-Url-1 = {http://hal.inria.fr/hal-00847620}}
@inproceedings{2001-3,
Abstract = {MPI has proven effective for parallel applications in situations with
neither QoS nor fault handling. Emerging environments motivate fault-tolerant
MPI middleware. Environments include space-based, wide-area/web/meta
computing and scalable clusters. MPI/FT, the system described in
the paper, trades off sufficient MPI fault coverage against acceptable
parallel performance, based on mission requirements and constraints.
MPI codes are evolved to use MPI/FT features. Non-portable code for
event handlers and recovery management is isolated. User-coordinated
recovery, checkpointing, transparency and event handling, as well
as evolvability of legacy MPI codes form key design criteria. Parallel
self-checking threads address four levels of MPI implementation robustness,
three of which are portable to any multithreaded MPI. A taxonomy
of application types provides six initial fault-relevant models;
user-transparent parallel nMR computation is thereby considered.
Key concepts from MPI/RT-real-time MPI-are also incorporated into
MPI/FT, with further overt support for MPI/RT and MPI/FT in applications
possible in future},
Author = {Batchu, R. and Neelamegam, J.P. and Zhenqian Cui and Beddhu, M. and Skjellum, A. and Dandass, Y. and Apte, M.},
Booktitle = {{Proceedings of the First IEEE/ACM International Symposium on Cluster Computing and the Grid}},
Doi = {10.1109/CCGRID.2001.923171},
Keywords = {client-server systems;message passing;parallel programming;software architecture;software fault tolerance;system recovery;MPI/FT;checkpointing;event handlers;event handling;fault-tolerant middleware;message passing;meta computing;parallel performance;parallel self-checking threads;performance-portable parallel computing;real-time MPI;recovery management;scalable clusters;wide-area network;Checkpointing;Communication standards;Fault tolerance;Fault tolerant systems;Middleware;Operating systems;Process control;Protocols;Quality of service;Taxonomy},
Pages = {26-33},
Series = {CCGRID 2001},
Title = {{MPI/FT: Architecture and taxonomies for fault-tolerant, message-passing middleware for performance-portable parallel computing}},
Year = {2001},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/CCGRID.2001.923171}}
@inproceedings{2011-1,
Author = {Bautista-Gomez, Leonardo and Tsuboi, Seiji and Komatitsch, Dimitri and Cappello, Franck and Maruyama, Naoya and Matsuoka, Satoshi},
Booktitle = {{Proceedings of International Conference for High Performance Computing, Networking, Storage and Analysis}},
Series = {SC 2011},
Summary = {FTI offers high-frequency (25s/300MB per node on 48 nodes; 6min/400MB per node on 1152 nodes, 8\% overhead) multi-level checkpoint for systems using node-local storage (present in some HPC systems, such as TSUBAME2.0 which has SSDs in every node). \hide{Their contribution is small, because the 3 ideas they implement (FT thread, topology-aware Reed Solomon, multilevel checkpoint), were already presented in the past.} They partition the system into groups of K processes, where each group will implement Reed-Solomon encoding to tolerate M process (not node) failures within a group. Each group can't contain more than one core of the same node, thereby tolerating node failures (which they call topology-aware Reed-Solomon). They set M=K, tolerating half-group failure because they store the encoded checkpoints locally \hide{THEY ACHIEVE THE SAME STORAGE COST OF NEIGHBORS CHECKPOINTING, THE SAME NETWORK COST, BUT WITH MUCH MORE COMPUTATION, BECAUSE THEY HAVE TO CALCULATE ALL THE ENCODED DATA, INSTEAD OF JUST SENDING THE UNMODIFIED CHECKPOINT TO A NEIGHBOR! (they don't mention this). No apparent benefit from Neighbor checkpointing, with added complexity.}. They use one dedicated thread per node to calculate the encoding. They implement a three-level checkpoint scheme (L1 SSD, L2 RS encoding, L3 PFS) as presented in [2010-8]. They include a reliability study of the multilevel approach. Upon failure, they stop the whole job and require the user to relaunch it. They evaluate with a real application, SPECFEM3D, showing 8\% checkpoint overhead on 1,000 GPUs. Note about eval: FTI only shows FLOPS, checkpoint and encoding time... never time to solution. They briefly mention failure correlation causes.},
Title = {{FTI: High Performance Fault Tolerance Interface for Hybrid Systems}},
Year = {2011}}
@techreport{2012-12,
Author = {P. Beckman and R. Brightwell and B. R. de Supinski and M. Gokhale and S. Hofmeyr and S. Krishnamoorthy and M. Lang and B. Maccabe and J. Shalf and M. Snir},
Institution = {US Department of Energy},
Month = {December},
Title = {{Exascale Operating Systems and Runtime Software Report}},
Type = {Technical Report},
Url = {http://science.energy.gov/~/media/ascr/pdf/research/cs/Exascale%20Workshop/ExaOSR-Report-Final.pdf},
Year = {2012},
Bdsk-Url-1 = {http://science.energy.gov/~/media/ascr/pdf/research/cs/Exascale%20Workshop/ExaOSR-Report-Final.pdf}}
@inproceedings{2001-2,
Acmid = {654524},
Address = {London, UK, UK},
Author = {Bhandarkar, Milind A. and Kale, Laxmikant V. and Sturler, Eric de and Hoeflinger, Jay},
Booktitle = {{Proceedings of the International Conference on Computational Science-Part II}},
Isbn = {3-540-42233-1},
Numpages = {10},
Pages = {108--117},
Publisher = {Springer-Verlag},
Series = {ICCS 2001},
Title = {{Adaptive Load Balancing for MPI Programs}},
Url = {http://dl.acm.org/citation.cfm?id=645456.654524},
Year = {2001},
Bdsk-Url-1 = {http://dl.acm.org/citation.cfm?id=645456.654524}}
@article{2009-13,
Author = {Bianchini, Ricardo and Fox, Armando and Godfrey, Forest and Hoisie, Adolfy and McKinley, Kathryn and Plank, James and Ranganathan, Partha and Simons, Josh},
Title = {{System Resilience at Extreme Scale White Paper}},
Year = {2009}}
@inproceedings{2012-5,
Author = {Bland, W.},
Booktitle = {{12th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing}},
Doi = {10.1109/CCGrid.2012.25},
Keywords = {application program interfaces, checkpointing, fault tolerant computing, message passing, MPI standard, Open MPI library, application resilience, concurrent state checkpointing, fail-stop failures, fault tolerance approach, message passing interface, minimalistic fault discovery, proof of concept, runtime process failure, Fault tolerance, Fault tolerant systems, Libraries, Routing, Runtime, Standards, Topology, Distributed Runtime, Fault Tolerance, Message Passing Interface},
Pages = {746-751},
Series = {CCGrid 2012},
Title = {{Enabling Application Resilience with and without the MPI Standard}},
Year = {2012},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/CCGrid.2012.25}}
@techreport{2012-10,
Author = {Bland, Wesley and others},
Type = {University of Tennessee Electrical Engineering and Computer Science Technical Report},
Institution = {Innovative Computing Laboratory, University of Tennessee},
Month = {February},
Title = {{A Proposal for User-Level Failure Mitigation in the MPI-3 Standard}},
Year = {2012}}
@article{bland2013post,
Abstract = {As supercomputers are entering an era of massive parallelism where
the frequency of faults is increasing, the MPI Standard remains distressingly
vague on the consequence of failures on MPI communications. Advanced
fault-tolerance techniques have the potential to prevent full-scale
application restart and therefore lower the cost incurred for each
failure, but they demand from MPI the capability to detect failures
and resume communications afterward. In this paper, we present a
set of extensions to MPI that allow communication capabilities to
be restored, while maintaining the extreme level of performance to
which MPI users have become accustomed. The motivations behind the
design choices are weighed against alternatives, a task that requires
simultaneously considering MPI from the viewpoint of both the user
and the implementor. The usability of the interfaces for expressing
advanced recovery techniques is then discussed, including the difficult
issue of enabling separate software layers to coordinate their recovery.},
Author = {Bland, Wesley and others},
Doi = {10.1177/1094342013488238},
Eprint = {http://hpc.sagepub.com/content/early/2013/06/02/1094342013488238.full.pdf+html},
Journal = {Int. J. High Performance Computing Applications},
Title = {{Post-failure recovery of MPI communication capability: Design and rationale}},
Url = {http://hpc.sagepub.com/content/early/2013/06/02/1094342013488238.abstract},
Year = {2013},
Bdsk-Url-1 = {http://hpc.sagepub.com/content/early/2013/06/02/1094342013488238.abstract},
Bdsk-Url-2 = {http://dx.doi.org/10.1177/1094342013488238}}
@inproceedings{2012-9,
Acmid = {2404064},
Address = {Berlin, Heidelberg},
Author = {Bland, Wesley and others},
Booktitle = {{Proceedings of the 19th European Conference on Recent Advances in the Message Passing Interface}},
Doi = {10.1007/978-3-642-33518-1_24},
Isbn = {978-3-642-33517-4},
Location = {Vienna, Austria},
Numpages = {11},
Pages = {193--203},
Publisher = {Springer-Verlag},
Series = {EuroMPI 2012},
Title = {{An evaluation of user-level failure mitigation support in MPI}},
Url = {http://dx.doi.org/10.1007/978-3-642-33518-1_24},
Year = {2012},
Bdsk-Url-1 = {http://dx.doi.org/10.1007/978-3-642-33518-1_24}}
@incollection{2012-18,
Author = {Bland, Wesley and Du, Peng and Bouteiller, Aurelien and Herault, Thomas and Bosilca, George and Dongarra, Jack},
Booktitle = {{Euro-Par 2012 Parallel Processing}},
Doi = {10.1007/978-3-642-32820-6_48},
Editor = {Kaklamanis, Christos and Papatheodorou, Theodore and Spirakis, PaulG.},
Isbn = {978-3-642-32819-0},
Pages = {477-488},
Publisher = {Springer Berlin Heidelberg},
Series = {Lecture Notes in Computer Science},
Title = {{A Checkpoint-on-Failure Protocol for Algorithm-Based Recovery in Standard MPI}},
Url = {http://dx.doi.org/10.1007/978-3-642-32820-6_48},
Volume = {7484},
Year = {2012},
Bdsk-Url-1 = {http://dx.doi.org/10.1007/978-3-642-32820-6_48}}
@article{2013-7,
Author = {Bland, Wesley and Du, Peng and Bouteiller, Aurelien and Herault, Thomas and Bosilca, George and Dongarra, Jack J.},
Doi = {10.1002/cpe.3100},
Issn = {1532-0634},
Journal = {Concurrency and Computation: Practice and Experience},
Keywords = {fault tolerance, message passing interface, ABFT, Checkpoint-on-Failure},
Title = {{Extending the scope of the Checkpoint-on-Failure protocol for forward recovery in standard MPI}},
Url = {http://dx.doi.org/10.1002/cpe.3100},
Year = {2013},
Bdsk-Url-1 = {http://dx.doi.org/10.1002/cpe.3100}}
@techreport{2012-14,
Abstract = {In this article, we present a unified model for several well-known
checkpoint/restart protocols. The proposed model is generic enough
to encompass both extremes of the checkpoint/restart space, from
coordinated approaches to a variety of uncoordinated checkpoint strategies
(with message logging). We identify a set of crucial parameters,
instantiate them and compare the expected efficiency of the fault
tolerant protocols, for a given application/platform pair. We then
propose a detailed analysis of several scenarios, including some
of the most powerful currently available HPC platforms, as well as
anticipated Exascale designs. The results of this analytical comparison
are corroborated by a comprehensive set of simulations. Altogether,
they outline comparative behaviors of checkpoint strategies at
very large scale, thereby providing insight that is hardly accessible
to direct experimentation.},
Affiliation = {Innovative Computing Laboratory - ICL , Departement Informatique - INF , GRAND-LARGE - INRIA Saclay - Ile de France , Joint Laboratory for Petascale Computing [Illinois] - JLPC , Laboratoire de Recherche en Informatique - LRI , ROMA - ENS Lyon / CNRS / Inria Grenoble Rh{\^o}ne-Alpes , Laboratoire de l'Informatique du Parallelisme - LIP},
Author = {Bosilca, George and Bouteiller, Aurelien and Brunet, Elisabeth and Cappello, Franck and Dongarra, Jack and Guermouche, Amina and Herault, Thomas and Robert, Yves and Vivien, Frederic and Zaidouni, Dounia},
Institution = {INRIA},
Keywords = {Fault-tolerance, checkpointing, coordinated, hierarchical, model, exascale},
Month = {Oct},
Number = {RR-7950},
Pdf = {http://hal.inria.fr/hal-00696154/PDF/RR-7950.pdf},
Title = {{Unified Model for Assessing Checkpointing Protocols at Extreme-Scale}},
Type = {Research Report},
Url = {http://hal.inria.fr/hal-00696154},
Year = {2012},
Bdsk-Url-1 = {http://hal.inria.fr/hal-00696154}}
@inproceedings{2002-5,
Abstract = {Global Computing platforms, large scale clusters and future TeraGRID
systems gather thousands of nodes for computing parallel scientific
applications. At this scale, node failures or disconnections are
frequent events. This Volatility reduces the MTBF of the whole system
in the range of hours or minutes. We present MPICH-V, an automatic
Volatility tolerant MPI environment based on uncoordinated checkpoint/roll-back
and distributed message logging. MPICH-V architecture relies on Channel
Memories, Checkpoint servers and theoretically proven protocols to
execute existing or new, SPMD and Master-Worker MPI applications
on volatile nodes. To evaluate its capabilities, we run MPICH-V within
a framework for which the number of nodes, Channels Memories and
Checkpoint Servers can be completely configured as well as the node
Volatility. We present a detailed performance evaluation of every
component of MPICH-V and its global performance for non-trivial parallel
applications. Experimental results demonstrate good scalability and
high tolerance to node volatility.},
Author = {Bosilca, G. and Bouteiller, A. and Cappello, F. and Djilali, S. and Fedak, G. and Germain, C. and Herault, T. and Lemarinier, P. and Lodygensky, O. and Magniette, F. and Neri, V. and Selikhov, A.},
Booktitle = {{Supercomputing, ACM/IEEE 2002 Conference}},
Doi = {10.1109/SC.2002.10048},
Issn = {1063-9535},
Keywords = {Application software;Computer applications;Computer architecture;Computer industry;Concurrent computing;Distributed computing;Fault tolerance;Large-scale systems;Message passing;Peer to peer computing},
Pages = {29-29},
Title = {{MPICH-V: Toward a Scalable Fault Tolerant MPI for Volatile Nodes}},
Year = {2002},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/SC.2002.10048}}
@article{2008-1,
Acmid = {1514767},
Address = {Orlando, FL, USA},
Author = {Bosilca, George and Delmas, R{\'e}mi and Dongarra, Jack and Langou, Julien},
Doi = {10.1016/j.jpdc.2008.12.002},
Issn = {0743-7315},
Journal = {J. Parallel Distrib. Comput.},
Keywords = {Fault tolerance, High performance computing, Linear algebra},
Month = {apr},
Number = {4},
Numpages = {7},
Pages = {410--416},
Publisher = {Academic Press, Inc.},
Title = {{Algorithm-based fault tolerance applied to high performance computing}},
Url = {http://dx.doi.org/10.1016/j.jpdc.2008.12.002},
Volume = {69},
Year = {2008},
Bdsk-Url-1 = {http://dx.doi.org/10.1016/j.jpdc.2008.12.002}}
@article{2013-6,
Abstract = {High performance computing applications must be resilient to faults.
The traditional fault-tolerance solution is checkpoint-recovery,
by which application state is saved to and recovered from secondary
storage throughout execution. It has been shown that, even when using
an optimal checkpointing strategy, the checkpointing overhead precludes
high parallel efficiency at large scale. Additional fault-tolerance
mechanisms must thus be used. Such a mechanism is replication, that
is, multiple processors performing the same computation so that a
processor failure does not necessarily imply an application failure.
In spite of resource waste, replication can lead to higher parallel
efficiency when compared to using only checkpoint-recovery at large
scale. We propose to execute and checkpoint multiple application instances
concurrently, an approach we term group replication. For exponential
failures we give an upper bound on the expected application execution
time. This bound corresponds to a particular checkpointing period
that we derive. For general failures, we propose a dynamic programming
algorithm to determine non-periodic checkpoint dates as well as an
empirical periodic checkpointing solution whose period is found via
a numerical search. Using simulation we evaluate our proposed approaches,
including comparison to the non-replication case, for both exponential
and Weibull failure distributions. Our broad finding is that group
replication is useful in a range of realistic application and checkpointing
overhead scenarios for future exascale platforms.},
Author = {Bougeret, Marin and Casanova, Henri and Robert, Yves and Vivien, Fr{\'e}d{\'e}ric and Zaidouni, Dounia},
Doi = {10.1177/1094342013505348},
Eprint = {http://hpc.sagepub.com/content/early/2013/09/30/1094342013505348.full.pdf+html},
Journal = {International Journal of High Performance Computing Applications},
Title = {{Using group replication for resilience on exascale systems}},
Url = {http://hpc.sagepub.com/content/early/2013/09/30/1094342013505348.abstract},
Year = {2013},
Bdsk-Url-1 = {http://hpc.sagepub.com/content/early/2013/09/30/1094342013505348.abstract},
Bdsk-Url-2 = {http://dx.doi.org/10.1177/1094342013505348}}
@article{2013-13-slides,
Abstract = {As the failure frequency is increasing with the components count in
modern and future supercomputers, resilience is becoming critical
for extreme scale systems. The association of failure prediction
with proactive checkpointing seeks to reduce the effect of failures
in the execution time of parallel applications. Unfortunately, proactive
checkpointing does not systematically avoid restarting from scratch.
To mitigate this issue, failure prediction and proactive checkpointing
can be coupled with periodic checkpointing. However, blind use of
these techniques does not always improve system efficiency, because
every one of them comes with a mix of overheads and benefits. In order
to study and understand the combination of these techniques and their
improvement in the system's efficiency, we developed: (i) a prototype
combining state-of-the-art failure prediction, fast proactive checkpointing
and preventive checkpointing, (ii) a mathematical model
that reflects the expected computing efficiency of the combination
and computes the optimal checkpointing interval in this context,
(iii) a discrete event simulator to evaluate the computing efficiency
of the combination for system parameters corresponding to the current
and projected large scale HPC systems. We evaluate our proposed technique
on a large supercomputer (i.e. TSUBAME2) with production-level HPC
applications and we show that failure prediction, proactive and preventive
checkpointing can be coupled successfully, imposing only about 2\%
to 6\% of overhead in comparison with preventive checkpointing only.
Moreover, our model-based simulations show that the optimal solution
improves the computing efficiency up to 30\% in comparison with classic
periodic checkpointing. We show that the prediction recall has a
much higher impact on execution efficiency than the prediction precision.
This result suggests that researchers on failure prediction algorithms
should focus on improving the recall. We also show that the combination
of these techniques can significantly improve (by a factor 2, for
a particular configuration) the mean time between failures (MTBF)
perceived by the application.},
Address = {Los Alamitos, CA, USA},
Author = {Mohamed Slim Bouguerra and Ana Gainaru and Leonardo Bautista Gomez and Franck Cappello and Satoshi Matsuoka and Naoya Maruyama},
Doi = {http://doi.ieeecomputersociety.org/10.1109/IPDPS.2013.74},
Issn = {1530-2075},
Journal = {International Parallel and Distributed Processing Symposium},
Keywords = {Checkpointing;Fault tolerance;Fault tolerant systems;Correlation;Mathematical model;Predictive models;Computational modeling;large scale HPC systems;Failure prediction;multilevel checkpointing;resilience},
Pages = {501-512},
Publisher = {IEEE Computer Society},
Series = {IPDPS 2013},
Title = {{Improving the Computing Efficiency of HPC Systems Using a Combination of Proactive and Preventive Checkpointing}},
Volume = {0},
Year = {2013},
Bdsk-Url-1 = {http://doi.ieeecomputersociety.org/10.1109/IPDPS.2013.74}}
@article{2010-6,
Abstract = {Over the past decade the number of processors used in high performance
computing has increased to hundreds of thousands. As a direct consequence,
and while the computational power follows the trend, the mean time
between failures (MTBF) has suffered and is now being counted in
hours. In order to circumvent this limitation, a number of fault-tolerant
algorithms as well as execution environments have been developed
using the message passing paradigm. Among them, message logging has
been proved to achieve a better overall performance when the MTBF
is low, mainly due to a faster failure recovery. However, message
logging suffers from a high overhead when no failure occurs. Therefore,
in this paper we discuss a refinement of the message logging model
intended to improve the failure-free message logging performance.
The proposed approach simultaneously removes useless memory copies
and reduces the number of logged events. We present the implementation
of a pessimistic message logging protocol in Open MPI and compare
it with the previous reference implementation MPICH-V2. The results
outline a several order of magnitude improvement on the performance
and a zero overhead for most messages. Published in 2010 by John
Wiley & Sons, Ltd.},
Author = {Bouteiller, Aurelien and Bosilca, George and Dongarra, Jack},
Doi = {10.1002/cpe.1589},
Issn = {1532-0634},
Journal = {Concurrency and Computation: Practice and Experience},
Keywords = {high performance computing, fault tolerance, message logging, uncoordinated checkpoint},
Number = {16},
Pages = {2196--2211},
Publisher = {John Wiley \& Sons, Ltd.},
Title = {{Redesigning the message logging model for high performance}},
Url = {http://dx.doi.org/10.1002/cpe.1589},
Volume = {22},
Year = {2010},
Bdsk-Url-1 = {http://dx.doi.org/10.1002/cpe.1589}}
@inproceedings{2013-11,
Author = {Aurelien Bouteiller and Franck Cappello and Jack Dongarra and Amina Guermouche and Thomas Herault and Yves Robert},
Booktitle = {{Euro-Par}},
Ee = {http://dx.doi.org/10.1007/978-3-642-40047-6_43},
Pages = {420-431},
Series = {Euro-Par 2013},
Title = {{Multi-criteria Checkpointing Strategies: Response-Time versus Resource Utilization}},
Year = {2013}}
@inproceedings{2011-13,
Acmid = {2033415},
Address = {Berlin, Heidelberg},
Author = {Bouteiller, Aurelien and Herault, Thomas and Bosilca, George and Dongarra, Jack J.},
Booktitle = {{Proceedings of the 17th international conference on Parallel processing}},
Isbn = {978-3-642-23396-8},
Location = {Bordeaux, France},
Numpages = {14},
Pages = {51--64},
Publisher = {Springer-Verlag},
Series = {Euro-Par 2011},
Title = {{Correlated set coordination in fault tolerant message logging protocols}},
Url = {http://dl.acm.org/citation.cfm?id=2033408.2033415},
Volume = {Part II},
Year = {2011},
Bdsk-Url-1 = {http://dl.acm.org/citation.cfm?id=2033408.2033415}}
@article{2006-9,
Abstract = {High performance computing platforms like Clusters, Grid and Desktop
Grids are becoming larger and subject to more frequent failures.
MPI is one of the most used message passing library in HPC applications.
These two trends raise the need for fault tolerant MPI. The MPICH-V
project focuses on designing, implementing and comparing several
automatic fault tolerance protocols for MPI applications. We present
an extensive related work section highlighting the originality of
our approach and the proposed protocols. We present then four fault
tolerant protocols implemented in a new generic framework for fault
tolerant protocol comparison, covering a large spectrum of known
approaches from coordinated checkpoint, to uncoordinated checkpoint
associated with causal message logging. We measure the performance
of these protocols on a microbenchmark and compare them for the NAS
benchmark, using an original fault tolerance test. Finally, we outline
the lessons learned from this in depth fault tolerant protocol comparison
for MPI applications.},
Author = {Bouteiller, A. and Herault, T. and Krawezik, G. and Lemarinier, P. and Cappello, F.},
Doi = {10.1177/1094342006067469},
Eprint = {http://hpc.sagepub.com/content/20/3/319.full.pdf+html},
Journal = {International Journal of High Performance Computing Applications},
Number = {3},
Pages = {319-333},
Title = {{MPICH-V Project: A Multiprotocol Automatic Fault-Tolerant MPI}},
Url = {http://hpc.sagepub.com/content/20/3/319.abstract},
Volume = {20},
Year = {2006},
Bdsk-Url-1 = {http://hpc.sagepub.com/content/20/3/319.abstract},
Bdsk-Url-2 = {http://dx.doi.org/10.1177/1094342006067469}}
@inproceedings{2009-10,
Abstract = {With the growing scale of high performance computing platforms, fault
tolerance has become a major issue. Among the various approaches
for providing fault tolerance to MPI applications, message logging
has been proved to tolerate higher failure rate. However, this advantage
comes at the expense of a higher overhead on communications, due
to latency intrusive logging of events to a stable storage. Previous
work proposed and evaluated several protocols relaxing the synchronicity
of event logging to moderate this overhead. Recently, the model of
message logging has been refined to better match the reality of high
performance network cards, where message receptions are decomposed
in multiple interdependent events. According to this new model, deterministic
and non-deterministic events are clearly discriminated, reducing
the overhead induced by message logging. In this paper we compare,
experimentally, a pessimistic and an optimistic message logging protocol,
using this new model and implemented in the Open MPI library. Although
pessimistic and optimistic message logging are, respectively, the
most and least synchronous message logging paradigms, experiments
show that most of the time their performance is comparable.},
Author = {Bouteiller, A. and Ropars, T. and Bosilca, G. and Morin, C. and Dongarra, J.},
Booktitle = {{IEEE International Conference on Cluster Computing and Workshops, CLUSTER 2009}},
Doi = {10.1109/CLUSTR.2009.5289157},
Issn = {1552-5244},
Keywords = {fault tolerant computing;libraries;message passing;parallel machines;protocols;MPI failure recovery;Open MPI library;fault tolerance;high performance computing;high performance network cards;message logging protocol;message passing interface;Delay;Fault tolerance;High performance computing;Laboratories;Libraries;Lifting equipment;Message passing;Network interfaces;Protocols;Usability},
Pages = {1-9},
Title = {{Reasons for a pessimistic or optimistic message logging protocol in MPI uncoordinated failure, recovery}},
Year = {2009},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/CLUSTR.2009.5289157}}
@inproceedings{2003-5,
Abstract = {Execution of MPI applications on clusters and Grid deployments suffering
from node and network failures motivates the use of fault tolerant
MPI implementations. We present MPICH-V2 (the second protocol of
MPICH-V project), an automatic fault tolerant MPI implementation
using an innovative protocol that removes the most limiting factor
of the pessimistic message logging approach: reliable logging of
in transit messages. MPICH-V2 relies on uncoordinated checkpointing,
sender based message logging and remote reliable logging of message
logical clocks. This paper presents the architecture of MPICH-V2,
its theoretical foundation and the performance of the implementation.
We compare MPICH-V2 to MPICH-V1 and MPICH-P4 evaluating a) its point-to-point
performance, b) the performance for the NAS benchmarks, c) the application
performance when many faults occur during the execution. Experimental
results demonstrate that MPICH-V2 provides performance close to MPICH-P4
for applications using large messages while reducing dramatically
the number of reliable nodes compared to MPICH-V1.},
Author = {Bouteiller, A. and Cappello, F. and Herault, T. and Krawezik, G. and Lemarinier, P. and Magniette, F.},
Booktitle = {{Supercomputing, 2003 ACM/IEEE Conference}},
Doi = {10.1109/SC.2003.10027},
Keywords = {Checkpointing;Clocks;Costs;Fault tolerance;High performance computing;Message passing;Permission;Programming profession;Protocols;Uniform resource locators},
Pages = {25-25},
Title = {{MPICH-V2: a Fault Tolerant MPI for Volatile Nodes based on Pessimistic Sender Based Message Logging}},
Year = {2003},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/SC.2003.10027}}
@inproceedings{2003-6,
Abstract = {MPI is one of the most adopted programming models for large clusters
and grid deployments. However, these systems often suffer from network
or node failures. This raises the issue of selecting a fault tolerance
approach for MPI. Automatic and transparent ones are based on either
coordinated checkpointing or message logging associated with uncoordinated
checkpoint. There are many protocols, implementations and optimizations
for these approaches but few results about their comparison. Coordinated
checkpoint has the advantage of a very low overhead on fault free
executions. In contrary a message logging protocol systematically
adds a significant message transfer penalty. The drawbacks of coordinated
checkpoint come from its synchronization cost at checkpoint and restart
times. In this paper we implement, evaluate and compare the two kinds
of protocols with a special emphasis on their respective performance
according to fault frequency. The main conclusion (under our experimental
conditions) is that message logging becomes relevant for a large
scale cluster from one fault every hour for applications with large
dataset.},
Author = {Bouteiller, A. and Lemarinier, P. and Krawezik, G. and Cappello, F.},
Booktitle = {{Proceedings of the IEEE International Conference on Cluster Computing}},
Doi = {10.1109/CLUSTR.2003.1253321},
Keywords = {distributed programming;fault tolerant computing;grid computing;message passing;performance evaluation;system recovery;workstation clusters;PC clusters;coordinated checkpoint;coordinated checkpointing;fault free executions;fault frequency;fault tolerant MPI;grid computing;message log;message logging protocol;message transfer penalty;network failures;node failures;performance evaluation;programming models;restart times;synchronization cost;Checkpointing;Clouds;Computer fault tolerance;Costs;Electronic mail;Fault tolerance;Frequency synchronization;High performance computing;Large-scale systems;Message passing;Protocols;System recovery},
Pages = {242-250},
Title = {{Coordinated checkpoint versus message log for fault tolerant MPI}},
Year = {2003},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/CLUSTR.2003.1253321}}
@misc{2004-7,
Author = {P. J. Braam},
Howpublished = {\url{http://www.lustre.org/docs.html}},
Note = {Accessed: 2014-03-15},
Publisher = {Cluster File Systems, Inc.},
Title = {{Lustre: A Scalable, High Performance File System}},
Year = {2004}}
@article{2003-2,
Acmid = {781513},
Address = {New York, NY, USA},
Author = {Bronevetsky, Greg and Marques, Daniel and Pingali, Keshav and Stodghill, Paul},
Doi = {10.1145/966049.781513},
Issn = {0362-1340},
Issue_Date = {October 2003},
Journal = {SIGPLAN Not.},
Keywords = {MPI, application-level checkpointing, fault-tolerance, non-FIFO communication, scientific computing},
Month = {Jun},
Number = {10},
Numpages = {11},
Pages = {84--94},
Publisher = {ACM},
Title = {{Automated application-level checkpointing of MPI programs}},
Url = {http://doi.acm.org/10.1145/966049.781513},
Volume = {38},
Year = {2003},
Bdsk-Url-1 = {http://doi.acm.org/10.1145/966049.781513},
Bdsk-Url-2 = {http://dx.doi.org/10.1145/966049.781513}}
@techreport{2009-7,
Author = {Bronevetsky, Greg and Moody, Adam},
Institution = {Lawrence Livermore National Laboratory, Livermore, CA, USA},
Number = {TR-JLPC-09-01},
Title = {{Scalable I/O systems via node-local storage: Approaching 1 TB/sec file I/O}},
Year = {2009}}
@inproceedings{2004-3,
Author = {David Callahan and Bradford L. Chamberlain and Hans P. Zima},
Booktitle = {{Ninth International Workshop on High-Level Parallel Programming Models and Supportive Environments}},
Pages = {52--60},
Series = {HIPS'04},
Title = {{The Cascade High Productivity Language}},
Year = {2004}}
@article{2014-1,
Abstract = {InfiniBand is widely used for low-latency, high-throughput cluster
computing. Saving the state of the InfiniBand network as part of
distributed checkpointing has been a long-standing challenge for
researchers. Because of a lack of a solution, typical MPI implementations
have included custom checkpoint-restart services that "tear down"
the network, checkpoint each node as if the node were a standalone
computer, and then re-connect the network again. We present the first
example of transparent, system-initiated checkpoint-restart that
directly supports InfiniBand. The new approach is independent of
any particular Linux kernel, thus simplifying the current practice
of using a kernel-based module, such as BLCR. This direct approach
results in checkpoints that are found to be faster than with the
use of a checkpoint-restart service. The generality of this approach
is shown not only by checkpointing an MPI computation, but also a
native UPC computation (Berkeley Unified Parallel C), which does
not use MPI. Scalability is shown by checkpointing 2,048 MPI processes
across 128 nodes (with 16 cores per node). In addition, a cost-effective
debugging approach is also enabled, in which a checkpoint image from
an InfiniBand-based production cluster is copied to a local Ethernet-based
cluster, where it can be restarted and an interactive debugger can