@inproceedings{FTI,
author = {Bautista-Gomez, Leonardo and others},
title = {{FTI: High Performance Fault Tolerance Interface for Hybrid Systems}},
booktitle = {Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis},
series = {SC '11},
year = {2011},
isbn = {978-1-4503-0771-0},
location = {Seattle, Washington},
pages = {32:1--32:32},
articleno = {32},
numpages = {32},
url = {http://doi.acm.org/10.1145/2063384.2063427},
doi = {10.1145/2063384.2063427},
acmid = {2063427},
publisher = {ACM},
address = {New York, NY, USA},
}
@inproceedings{2009_google_memory,
title = {DRAM Errors in the Wild: A Large-Scale Field Study},
author = {Bianca Schroeder and Eduardo Pinheiro and Wolf-Dietrich Weber},
year = 2009,
booktitle = {Proceedings of the Eleventh International Joint Conference on Measurement and Modeling of Computer Systems (SIGMETRICS '09)}
}
@book{2013_mh,
title={Exploring Memory Hierarchy Design with Emerging Memory Technologies},
author={Sun, G.},
isbn={9783319006819},
series={Lecture Notes in Electrical Engineering},
url={https://books.google.es/books?id=DaHjAAAAQBAJ},
year={2013},
publisher={Springer}
}
@inproceedings{Teranishi:2014,
author = {Teranishi, Keita and Heroux, Michael A.},
title = {{Toward Local Failure Local Recovery Resilience Model Using {MPI-ULFM}}},
booktitle = {Proceedings of the 21st European {MPI} Users' Group Meeting},
series = {EuroMPI/ASIA '14},
year = {2014},
isbn = {978-1-4503-2875-3},
location = {Kyoto, Japan},
pages = {51:51--51:56},
articleno = {51},
numpages = {6},
url = {http://doi.acm.org/10.1145/2642769.2642774},
doi = {10.1145/2642769.2642774},
acmid = {2642774},
publisher = {ACM},
address = {New York, NY, USA},
keywords = {Fault Tolerance, MPI, PDE solvers, Scientific Computing, User Level Fault Mitigation},
}
@inproceedings{Sato:2014,
author = {Sato, Kento and others},
title = {FMI: Fault Tolerant Messaging Interface for Fast and Transparent Recovery},
booktitle = {Proceedings of the 2014 IEEE 28th International Parallel and Distributed Processing Symposium},
series = {IPDPS '14},
year = {2014},
isbn = {978-1-4799-3800-1},
pages = {1225--1234},
numpages = {10},
url = {http://dx.doi.org/10.1109/IPDPS.2014.126},
doi = {10.1109/IPDPS.2014.126},
acmid = {2650537},
publisher = {IEEE Computer Society},
address = {Washington, DC, USA},
keywords = {Fault tolerance, MPI, Checkpoint/Restart},
}
@inproceedings{Gamell:2014,
author = {Gamell, Marc and others},
title = {{Exploring Automatic, Online Failure Recovery for Scientific Applications at Extreme Scales}},
booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
series = {SC '14},
year = {2014},
location = {New Orleans, LA}
}
@inproceedings{Dinan:2011,
author = {Dinan, James and others},
booktitle = {EuroMPI},
crossref = {conf/pvm/2011},
editor = {Cotronis, Yiannis and Danalis, Anthony and Nikolopoulos, Dimitrios S. and Dongarra, Jack},
ee = {http://dx.doi.org/10.1007/978-3-642-24449-0_32},
isbn = {978-3-642-24448-3},
pages = {282-291},
publisher = {Springer},
series = {Lecture Notes in Computer Science},
title = {{Noncollective Communicator Creation in {MPI}}},
volume = 6960,
year = 2011
}
@inproceedings{Jin,
author = {Jin, Tong and Zhang, Fan and Sun, Qian and Bui, Hoang and Parashar, Manish and Yu, Hongfeng and Klasky, Scott and Podhorszki, Norbert and Abbasi, Hasan},
title = {{Using Cross-layer Adaptations for Dynamic Data Management in Large Scale Coupled Scientific Workflows}},
booktitle = {{Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis}},
series = {SC 2013},
year = {2013},
isbn = {978-1-4503-2378-9},
location = {Denver, Colorado},
pages = {74:1--74:12},
articleno = {74},
numpages = {12},
url = {http://doi.acm.org/10.1145/2503210.2503301},
doi = {10.1145/2503210.2503301},
acmid = {2503301},
publisher = {ACM},
address = {New York, NY, USA},
keywords = {coupled simulation workflows, cross-layer adaptation, data management, in-situ/in-transit, staging},
}
@inproceedings{Bennett,
author = {Bennett, Janine C. and Abbasi, Hasan and Bremer, Peer-Timo and Grout, Ray and Gyulassy, Attila and Jin, Tong and Klasky, Scott and Kolla, Hemanth and Parashar, Manish and Pascucci, Valerio and Pebay, Philippe and Thompson, David and Yu, Hongfeng and Zhang, Fan and Chen, Jacqueline},
title = {{Combining In-situ and In-transit Processing to Enable Extreme-scale Scientific Analysis}},
booktitle = {{Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis}},
series = {SC 2012},
year = {2012},
isbn = {978-1-4673-0804-5},
location = {Salt Lake City, Utah},
pages = {49:1--49:9},
articleno = {49},
numpages = {9},
url = {http://dl.acm.org/citation.cfm?id=2388996.2389063},
acmid = {2389063},
publisher = {IEEE Computer Society Press},
address = {Los Alamitos, CA, USA},
}
@inproceedings{pubsub,
title = {A Scalable Messaging System for Accelerating Discovery from Large Scale Scientific Simulations},
booktitle = {Proceedings of the 19th Annual International Conference on High Performance Computing (HiPC 2012)},
year = {2012},
month = {December},
publisher = {IEEE Computer Society Press},
address = {Pune, India},
author = {T. Jin and F. Zhang and M. Parashar and S. Klasky and N. Podhorszki and H. Abbasi}
}
@inproceedings{Zhang,
author = {Zhang, Fan and Lasluisa, Solomon and Jin, Tong and Rodero, Ivan and Bui, Hoang and Parashar, Manish},
title = {{In-situ Feature-Based Objects Tracking for Large-Scale Scientific Simulations}},
booktitle = {{Proceedings of the 2012 SC Companion: High Performance Computing, Networking Storage and Analysis}},
series = {SCC 2012},
year = {2012},
isbn = {978-0-7695-4956-9},
pages = {736--740},
numpages = {5},
url = {http://dx.doi.org/10.1109/SC.Companion.2012.100},
doi = {10.1109/SC.Companion.2012.100},
acmid = {2477107},
publisher = {IEEE Computer Society},
address = {Washington, DC, USA},
keywords = {Scientific data analysis, scalable in-situ data analytics, feature-based object tracking},
}
@inproceedings{2013-33,
Acmid = {2465821},
Address = {New York, NY, USA},
Author = {Lu, Guoming and Zheng, Ziming and Chien, Andrew A.},
Booktitle = {Proceedings of the 3rd Workshop on Fault-tolerance for HPC at Extreme Scale},
Date-Added = {2014-07-23 18:06:20 +0000},
Date-Modified = {2014-07-23 18:08:04 +0000},
Doi = {10.1145/2465813.2465821},
Isbn = {978-1-4503-1983-6},
Keywords = {checkpointing, error recovery, high-performance computing, reliability},
Location = {New York, New York, USA},
Numpages = {8},
Pages = {49--56},
Publisher = {ACM},
Series = {FTXS '13},
Title = {When is Multi-version Checkpointing Needed?},
Url = {http://doi.acm.org/10.1145/2465813.2465821},
Year = {2013},
Bdsk-Url-1 = {http://doi.acm.org/10.1145/2465813.2465821},
Bdsk-Url-2 = {http://dx.doi.org/10.1145/2465813.2465821}}
@inproceedings{2014-5,
Author = {Ziming Zheng and Andrew A. Chien and Keita Teranishi},
Booktitle = {VECPAR 2014},
Date-Added = {2014-07-23 17:54:57 +0000},
Date-Modified = {2014-07-23 18:08:27 +0000},
Month = {June},
Title = {Fault Tolerance in an Inner-Outer Solver: a GVR-enabled Case Study},
Year = {2014}}
@inproceedings{2014-4,
Author = {Kento Sato and A. Moody and K. Mohror and T. Gamblin and B. R. de Supinski and N. Maruyama and S. Matsuoka},
Booktitle = {28th IEEE International Parallel \& Distributed Processing Symposium (IPDPS 2014)},
Date-Added = {2014-07-23 17:54:40 +0000},
Date-Modified = {2014-07-23 18:08:18 +0000},
Month = {May},
Title = {{FMI: Fault Tolerant Messaging Interface for Fast and Transparent Recovery}},
Year = {2014}}
@article{1994-2,
Abstract = {Presents the results of an implementation of several algorithms for checkpointing and restarting parallel programs on shared-memory multiprocessors. The algorithms are compared according to the metrics of overall checkpointing time, overhead imposed by the checkpointer on the target program, and amount of time during which the checkpointer interrupts the target program. The best algorithm measured achieves its efficiency through a variation of copy-on-write, which allows the most time-consuming operations of the checkpoint to be overlapped with the running of the program being checkpointed},
Author = {Li, K. and Naughton, J.F. and Plank, J.S.},
Date-Added = {2014-07-23 17:54:15 +0000},
Date-Modified = {2014-07-23 18:07:52 +0000},
Journal = {IEEE Transactions on Parallel and Distributed Systems},
Keywords = {fault tolerant computing;parallel programming;program diagnostics;software reliability;system recovery;backward error recovery;copy-on-write;efficiency;fault tolerance;interruption time;low latency concurrent checkpointing;metrics;overall checkpointing time;overhead;overlapping operations;parallel programs;program restarting;shared-memory multiprocessors;Benchmark testing;Central Processing Unit;Checkpointing;Computer science;Concurrent computing;Delay;Fault tolerance;Fault tolerant systems;Registers},
Month = {Aug},
Number = {8},
Pages = {874-879},
Title = {Low-latency, concurrent checkpointing for parallel programs},
Volume = {5},
Year = {1994},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/71.298215}}
@article{2007-4,
Author = {G. Bosilca and Z. Chen and J. Dongarra and J. Langou},
Date-Added = {2014-07-23 17:52:33 +0000},
Date-Modified = {2014-07-23 18:07:21 +0000},
Journal = {SIAM Journal on Scientific Computing},
Month = {November},
Number = {1},
Pages = {102-116},
Title = {Recovery patterns for iterative methods in a parallel unstable environment},
Volume = {30},
Year = {2007}}
@inproceedings{2010-10,
Author = {Dinan, J. and Singri, A. and Sadayappan, P. and Krishnamoorthy, S.},
Booktitle = {2010 10th IEEE/ACM International Conference on Cluster, Cloud and Grid Computing (CCGrid)},
Date-Added = {2014-07-23 17:51:31 +0000},
Date-Modified = {2014-07-23 18:07:41 +0000},
Doi = {10.1109/CCGRID.2010.34},
Keywords = {Chemistry;Clouds;Computer science;Electronics packaging;Fault tolerance;Grid computing;Hardware;Kernel;Parallel processing;Parallel programming;Global Arrays;PGAS;Parallel processing;fault tolerance;selective recovery;task parallelism},
Pages = {709-714},
Title = {Selective Recovery from Failures in a Task Parallel Programming Model},
Year = {2010},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/CCGRID.2010.34}}
@inproceedings{2013-34,
Address = {New York, NY, USA},
Author = {Sao, Piyush and Vuduc, Richard},
Booktitle = {Proceedings of the Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems},
Date-Added = {2014-07-23 17:51:09 +0000},
Date-Modified = {2014-07-23 18:08:11 +0000},
Keywords = {fault-tolerance, iterative linear solvers, self-stabilization, transient soft faults},
Location = {Denver, Colorado},
Numpages = {8},
Pages = {4:1--4:8},
Publisher = {ACM},
Series = {ScalA '13},
Title = {Self-stabilizing Iterative Solvers},
Year = {2013},
Bdsk-Url-1 = {http://doi.acm.org/10.1145/2530268.2530272},
Bdsk-Url-2 = {http://dx.doi.org/10.1145/2530268.2530272}}
@inproceedings{2013-22-slides,
Abstract = {Energy consumption and fault tolerance are two interrelated issues
to address for designing future exascale systems. Fault tolerance
protocols used for check pointing have different energy consumption
depending on parameters like application features, number of processes
in the execution and platform characteristics. Currently, the only
way to select a protocol for a given execution is to run the application
and monitor the energy consumption of different fault tolerance protocols.
This is needed for any variation of the execution setting. To avoid
this time and energy consuming process, we propose an energy estimation
framework. It relies on an energy calibration of the considered platform
and a user description of the execution setting. We evaluate the
accuracy of our estimations with real applications running on a real
platform with energy consumption monitoring. Results show that our
estimations are highly accurate and allow selecting the best fault
tolerant protocol without pre-executing the application.},
Author = {Diouri, Mohammed El Mehdi and Gl{\"u}ck, Olivier and Lef{\`e}vre, Laurent and Cappello, Franck},
Booktitle = {{CCGRID}},
Crossref = {conf/ccgrid/2013},
Ee = {http://doi.ieeecomputersociety.org/10.1109/CCGrid.2013.80},
Isbn = {978-1-4673-6465-2},
Pages = {522-529},
Publisher = {IEEE Computer Society},
Series = {CCGRID 2013},
Title = {{ECOFIT: A Framework to Estimate Energy Consumption of Fault Tolerance Protocols for HPC Applications}},
Url = {http://dblp.uni-trier.de/db/conf/ccgrid/ccgrid2013.html#DiouriGLC13},
Year = {2013},
Bdsk-Url-1 = {http://dblp.uni-trier.de/db/conf/ccgrid/ccgrid2013.html#DiouriGLC13}}
@inproceedings{2011-9,
Abstract = {Fault tolerance is becoming a major concern in HPC systems. The two
traditional approaches for message passing applications, coordinated
checkpointing and message logging, have severe scalability issues.
Coordinated checkpointing protocols make all processes roll back
after a failure. Message logging protocols log a huge amount of data
and can induce an overhead on communication performance. Hierarchical
rollback-recovery protocols based on the combination of coordinated
checkpointing and message logging are an alternative. These partial
message logging protocols are based on process clustering: only messages
between clusters are logged to limit the consequence of a failure
to one cluster. These protocols would work efficiently only if one
can find clusters of processes in the applications such that the
ratio of logged messages is very low. We study the communication
patterns of message passing HPC applications to show that partial
message logging is suitable in most cases. We propose a partitioning
algorithm to find suitable clusters of processes given the communication
pattern of an application. Finally, we evaluate the efficiency of
partial message logging using two state of the art protocols on a
set of representative applications.},
Author = {Thomas Ropars and Amina Guermouche and Bora U{\c c}ar and Esteban Meneses and Laxmikant V. Kal{\'e} and Franck Cappello},
Booktitle = {{Euro-Par (1)}},
Crossref = {2011},
Ee = {http://dx.doi.org/10.1007/978-3-642-23400-2_53},
Pages = {567-578},
Series = {Euro-Par 2011},
Title = {{On the Use of Cluster-Based Partial Message Logging to Improve Fault Tolerance for {MPI HPC} Applications}},
Year = {2011}}
@inproceedings{2006-8,
Author = {Subramaniyan, Rajagopal and Aggarwal, Vikas and Jacobs, Adam and George, Alan},
Booktitle = {{International Conference on Embedded Systems and Applications (ESA 2006)}},
Editor = {Arabnia, Hamid R.},
Isbn = {1-60132-017-5},
Pages = {3-9},
Publisher = {CSREA Press},
Series = {ESA 2006},
Title = {{FEMPI: A Lightweight Fault-tolerant MPI for Embedded Cluster Systems}},
Url = {http://dblp.uni-trier.de/db/conf/csreaESA/csreaESA2006.html#SubramaniyanAJG06},
Year = {2006},
Bdsk-Url-1 = {http://dblp.uni-trier.de/db/conf/csreaESA/csreaESA2006.html#SubramaniyanAJG06}}
@inproceedings{2004-5,
Acmid = {1006248},
Address = {New York, NY, USA},
Author = {Agarwal, Saurabh and Garg, Rahul and Gupta, Meeta S. and Moreira, Jose E.},
Booktitle = {{Proceedings of the 18th annual international conference on Supercomputing}},
Doi = {10.1145/1006209.1006248},
Isbn = {1-58113-839-3},
Keywords = {fault-tolerance, incremental checkpoint, large scale systems, probabilistic checkpoint},
Location = {Saint-Malo, France},
Numpages = {10},
Pages = {277--286},
Publisher = {ACM},
Series = {ICS 2004},
Title = {{Adaptive incremental checkpointing for massively parallel systems}},
Url = {http://doi.acm.org/10.1145/1006209.1006248},
Year = {2004},
Bdsk-Url-1 = {http://doi.acm.org/10.1145/1006209.1006248},
Bdsk-Url-2 = {http://dx.doi.org/10.1145/1006209.1006248}}
@inproceedings{2010-9,
Abstract = {The Gemini System Interconnect is a new network for Cray's supercomputer
systems. It provides improved network functionality, latency and
issue rate. Latency is reduced with OS bypass for sends and direct
user completion notification on receives. Atomic memory operations
support the construction of fast synchronization and reduction primitives.},
Author = {Alverson, R. and Roweth, D. and Kaplan, L.},
Booktitle = {{IEEE 18th Annual Symposium on High Performance Interconnects (HOTI)}},
Doi = {10.1109/HOTI.2010.23},
Keywords = {mainframes;multiprocessor interconnection networks;network computers;parallel machines;Crays supercomputer system;Gemini system interconnect;OS bypass;atomic memory operations;direct user completion notification;operating system;Bandwidth;Hardware;Kernel;Payloads;Routing;Synchronization;Tiles},
Month = {Aug},
Pages = {83-87},
Title = {{The Gemini System Interconnect}},
Year = {2010},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/HOTI.2010.23}}
@inproceedings{1999-3,
Abstract = {Communication induced checkpointing (CIC) allows processes in a distributed
computation to take independent checkpoints and to avoid the domino
effect. This paper presents an analysis of CIC protocols based on
a prototype implementation and validated simulations. Our results
indicate that there is sufficient evidence to suspect that much of
the conventional wisdom about these protocols is questionable.},
Author = {Alvisi, L. and Elnozahy, E. and Rao, S. and Husain, S.A. and de Mel, A.},
Booktitle = {{Twenty-Ninth Annual International Symposium on Fault-Tolerant Computing. Digest of Papers}},
Doi = {10.1109/FTCS.1999.781058},
Issn = {0731-3071},
Keywords = {distributed programming, protocols, system recovery, CIC, CIC protocols, distributed computation, independent checkpoints, Analytical models, Checkpointing, Computational modeling, Electrical capacitance tomography, Protocols, Prototypes, Scalability, Virtual prototyping},
Pages = {242-249},
Series = {FTCS 1999 - DSN},
Title = {{An analysis of communication induced checkpointing}},
Year = {1999},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/FTCS.1999.781058}}
@techreport{2009-6,
Author = {Amarasinghe, Saman and Campbell, Dan and Carlson, William and Chien, Andrew and Dally, William and Elnozahy, Elmootazbellah and Hall, Mary and Harrison, Robert and Harrod, William and Hill, Kerry and Hiller, Jon and Karp, Sherman and Koelbel, Charles and Koester, David and Kogge, Peter and Levesque, John and Reed, Daniel and Sarkar, Vivek and Schreiber, Robert and Richards, Mark and Scarpelli, Al and Shalf, John and Snavely, Allan and Sterling, Thomas},
Day = {14},
Institution = {DARPA IPTO, Air Force Research Lab},
Keywords = {exascale, scalability, supercomputer, supercomputing},
Month = sep,
Title = {{ExaScale Software Study: Software Challenges in Extreme Scale Systems}},
Year = {2009}}
@inproceedings{2011-17,
Author = {Saman Amarasinghe and Mary Hall and Richard Lethin and Keshav Pingali and Dan Quinlan and Vivek Sarkar and John Shalf and Robert Lucas and Katherine Yelick and Pavan Balaji and Pedro C. Diniz and Alice Koniges and Marc Snir},
Booktitle = {{Proceedings of the Workshop on Exascale Programming Challenges, Marina del Rey, CA, USA}},
Month = {Jul},
Publisher = {U.S Department of Energy, Office of Science, Office of Advanced Scientific Computing Research (ASCR)},
Title = {{Exascale Programming Challenges}},
Url = {http://science.energy.gov/~/media/ascr/pdf/program-documents/docs/ProgrammingChallengesWorkshopReport.pdf},
Year = {2011},
Bdsk-Url-1 = {http://science.energy.gov/~/media/ascr/pdf/program-documents/docs/ProgrammingChallengesWorkshopReport.pdf}}
@inproceedings{2013-9,
Abstract = {In this paper, we revisit traditional checkpointing and rollback recovery
strategies, with a focus on silent data corruption errors. Contrarily
to fail-stop failures, such latent errors cannot be detected immediately,
and a mechanism to detect them must be provided. We consider two
models: (i) errors are detected after some delays following a probability
distribution (typically, an Exponential distribution); (ii) errors
are detected through some verification mechanism. In both cases,
we compute the optimal period in order to minimize the waste, i.e.,
the fraction of time where nodes do not perform useful computations.
In practice, only a fixed number of checkpoints can be kept in memory,
and the first model may lead to an irrecoverable failure. In this
case, we compute the minimum period required for an acceptable risk.
For the second model, there is no risk of irrecoverable failure,
owing to the verification mechanism, but the corresponding overhead
is included in the waste. Finally, both models are instantiated using
realistic scenarios and application/architecture parameters.},
Address = {Vancouver, Canada},
Affiliation = {Laboratoire de l'Informatique du Parallelisme - LIP , ROMA - ENS Lyon / CNRS / Inria Grenoble Rh{\^o}ne-Alpes , Innovative Computing Laboratory - ICL},
Author = {Aupy, Guillaume and Benoit, Anne and Herault, Thomas and Robert, Yves and Vivien, Frederic and Zaidouni, Dounia},
Booktitle = {{The 19th IEEE Pacific Rim International Symposium on Dependable Computing - 2013}},
Keywords = {High-performance computing, checkpointing, silent data corruption, verification, error recovery},
Month = {Dec},
Pdf = {http://hal.inria.fr/hal-00847620/PDF/resilience2013.pdf},
Publisher = {IEEE},
Series = {PRDC 2013},
Title = {{On the Combination of Silent Error Detection and Checkpointing}},
Url = {http://hal.inria.fr/hal-00847620},
Year = {2013},
Bdsk-Url-1 = {http://hal.inria.fr/hal-00847620}}
@inproceedings{2001-3,
Abstract = {MPI has proven effective for parallel applications in situations with
neither QoS nor fault handling. Emerging environments motivate fault-tolerant
MPI middleware. Environments include space-based, wide-area/web/meta
computing and scalable clusters. MPI/FT, the system described in
the paper, trades off sufficient MPI fault coverage against acceptable
parallel performance, based on mission requirements and constraints.
MPI codes are evolved to use MPI/FT features. Non-portable code for
event handlers and recovery management is isolated. User-coordinated
recovery, checkpointing, transparency and event handling, as well
as evolvability of legacy MPI codes form key design criteria. Parallel
self-checking threads address four levels of MPI implementation robustness,
three of which are portable to any multithreaded MPI. A taxonomy
of application types provides six initial fault-relevant models;
user-transparent parallel nMR computation is thereby considered.
Key concepts from MPI/RT-real-time MPI-are also incorporated into
MPI/FT, with further overt support for MPI/RT and MPI/FT in applications
possible in future},
Author = {Batchu, R. and Neelamegam, J.P. and Zhenqian Cui and Beddhu, M. and Skjellum, A. and Dandass, Y. and Apte, M.},
Booktitle = {{Proceedings of the First IEEE/ACM International Symposium on Cluster Computing and the Grid}},
Doi = {10.1109/CCGRID.2001.923171},
Keywords = {client-server systems;message passing;parallel programming;software architecture;software fault tolerance;system recovery;MPI/FT;checkpointing;event handlers;event handling;fault-tolerant middleware;message passing;meta computing;parallel performance;parallel self-checking threads;performance-portable parallel computing;real-time MPI;recovery management;scalable clusters;wide-area network;Checkpointing;Communication standards;Fault tolerance;Fault tolerant systems;Middleware;Operating systems;Process control;Protocols;Quality of service;Taxonomy},
Pages = {26-33},
Series = {CCGRID 2001},
Title = {{MPI/FT: Architecture and taxonomies for fault-tolerant, message-passing middleware for performance-portable parallel computing}},
Year = {2001},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/CCGRID.2001.923171}}
@inproceedings{2011-1,
Author = {Bautista-Gomez, Leonardo and Tsuboi, Seiji and Komatitsch, Dimitri and Cappello, Franck and Maruyama, Naoya and Matsuoka, Satoshi},
Booktitle = {{Proceedings of International Conference for High Performance Computing, Networking, Storage and Analysis}},
Series = {SC 2011},
Summary = {FTI offers high-frequency (25s/300MB per node on 48 nodes; 6min/400MB per node on 1152 nodes, 8\% overhead) multi-level checkpoint for systems using node-local storage (present in some HPC systems, such as TSUBAME2.0 which has SSDs in every node). \hide{Their contribution is small, because the 3 ideas they implement (FT thread, topology-aware Reed Solomon, multilevel checkpoint), were already presented in the past.} They partition the system into groups of K processes, where each group will implement Reed-Solomon encoding to tolerate M process (not node) failures within a group. Each group can't contain more than one core of the same node, thereby tolerating node failures (which they call topology-aware Reed-Solomon). They set M=K, tolerating half-group failure because they store the encoded checkpoints locally \hide{THEY ACHIEVE THE SAME STORAGE COST OF NEIGHBORS CHECKPOINTING, THE SAME NETWORK COST, BUT WITH MUCH MORE COMPUTATION, BECAUSE THEY HAVE TO CALCULATE ALL THE ENCODED DATA, INSTEAD OF JUST SENDING THE UNMODIFIED CHECKPOINT TO A NEIGHBOR! (they don't mention this). No apparent benefit from Neighbor checkpointing, with added complexity.}. They use one dedicated thread per node to calculate the encoding. They implement a three-level checkpoint scheme (L1 SSD, L2 RS encoding, L3 PFS) as presented in [2010-8]. They include a reliability study of the multilevel approach. Upon failure, they stop the whole job and require the user to relaunch it. They evaluate with a real application, SPECFEM3D, showing 8\% checkpoint overhead on 1,000 GPUs. Note about eval: FTI only shows FLOPS, checkpoint and encoding time... never time to solution. They briefly mention failure correlation causes.},
Title = {{FTI: High Performance Fault Tolerance Interface for Hybrid Systems}},
Year = {2011}}
@techreport{2012-12,
Author = {P. Beckman and R. Brightwell and B. R. de Supinski and M. Gokhale and S. Hofmeyr and S. Krishnamoorthy and M. Lang and B. Maccabe and J. Shalf and M. Snir},
Institution = {US Department of Energy},
Month = {December},
Title = {{Exascale Operating Systems and Runtime Software Report}},
Type = {Technical Report},
Url = {http://science.energy.gov/~/media/ascr/pdf/research/cs/Exascale%20Workshop/ExaOSR-Report-Final.pdf},
Year = {2012},
Bdsk-Url-1 = {http://science.energy.gov/~/media/ascr/pdf/research/cs/Exascale%20Workshop/ExaOSR-Report-Final.pdf}}
@inproceedings{2001-2,
Acmid = {654524},
Address = {London, UK, UK},
Author = {Bhandarkar, Milind A. and Kale, Laxmikant V. and Sturler, Eric de and Hoeflinger, Jay},
Booktitle = {{Proceedings of the International Conference on Computational Science-Part II}},
Isbn = {3-540-42233-1},
Numpages = {10},
Pages = {108--117},
Publisher = {Springer-Verlag},
Series = {ICCS 2001},
Title = {{Adaptive Load Balancing for MPI Programs}},
Url = {http://dl.acm.org/citation.cfm?id=645456.654524},
Year = {2001},
Bdsk-Url-1 = {http://dl.acm.org/citation.cfm?id=645456.654524}}
@article{2009-13,
Author = {Bianchini, Ricardo and Fox, Armando and Godfrey, Forest and Hoisie, Adolfy and McKinley, Kathryn and Plank, James and Ranganathan, Partha and Simons, Josh},
Title = {{System Resilience at Extreme Scale White Paper}},
Year = {2009}}
@inproceedings{2012-5,
Author = {Bland, W.},
Booktitle = {{12th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing}},
Doi = {10.1109/CCGrid.2012.25},
Keywords = {application program interfaces, checkpointing, fault tolerant computing, message passing, MPI standard, Open MPI library, application resilience, concurrent state checkpointing, fail-stop failures, fault tolerance approach, message passing interface, minimalistic fault discovery, proof of concept, runtime process failure, Fault tolerance, Fault tolerant systems, Libraries, Routing, Runtime, Standards, Topology, Distributed Runtime, Fault Tolerance, Message Passing Interface},
Pages = {746-751},
Series = {CCGrid 2012},
Title = {{Enabling Application Resilience with and without the MPI Standard}},
Year = {2012},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/CCGrid.2012.25}}
@techreport{2012-10,
Author = {Bland, Wesley and others},
Type = {University of Tennessee Electrical Engineering and Computer Science Technical Report},
Institution = {Innovative Computing Laboratory, University of Tennessee},
Month = {February},
Title = {{A Proposal for User-Level Failure Mitigation in the MPI-3 Standard}},
Year = {2012}}
@article{bland2013post,
Abstract = {As supercomputers are entering an era of massive parallelism where
the frequency of faults is increasing, the MPI Standard remains distressingly
vague on the consequence of failures on MPI communications. Advanced
fault-tolerance techniques have the potential to prevent full-scale
application restart and therefore lower the cost incurred for each
failure, but they demand from MPI the capability to detect failures
and resume communications afterward. In this paper, we present a
set of extensions to MPI that allow communication capabilities to
be restored, while maintaining the extreme level of performance to
which MPI users have become accustomed. The motivations behind the
design choices are weighed against alternatives, a task that requires
simultaneously considering MPI from the viewpoint of both the user
and the implementor. The usability of the interfaces for expressing
advanced recovery techniques is then discussed, including the difficult
issue of enabling separate software layers to coordinate their recovery.},
Author = {Bland, Wesley and others},
Doi = {10.1177/1094342013488238},
Eprint = {http://hpc.sagepub.com/content/early/2013/06/02/1094342013488238.full.pdf+html},
Journal = {Int. J. High Performance Computing Applications},
Title = {{Post-failure recovery of MPI communication capability: Design and rationale}},
Url = {http://hpc.sagepub.com/content/early/2013/06/02/1094342013488238.abstract},
Year = {2013},
Bdsk-Url-1 = {http://hpc.sagepub.com/content/early/2013/06/02/1094342013488238.abstract},
Bdsk-Url-2 = {http://dx.doi.org/10.1177/1094342013488238}}
@inproceedings{2012-9,
Acmid = {2404064},
Address = {Berlin, Heidelberg},
Author = {Bland, Wesley and others},
Booktitle = {{Proceedings of the 19th European Conference on Recent Advances in the Message Passing Interface}},
Doi = {10.1007/978-3-642-33518-1_24},
Isbn = {978-3-642-33517-4},
Location = {Vienna, Austria},
Numpages = {11},
Pages = {193--203},
Publisher = {Springer-Verlag},
Series = {EuroMPI 2012},
Title = {{An evaluation of user-level failure mitigation support in MPI}},
Url = {http://dx.doi.org/10.1007/978-3-642-33518-1_24},
Year = {2012},
Bdsk-Url-1 = {http://dx.doi.org/10.1007/978-3-642-33518-1_24}}
@incollection{2012-18,
Author = {Bland, Wesley and Du, Peng and Bouteiller, Aurelien and Herault, Thomas and Bosilca, George and Dongarra, Jack},
Booktitle = {{Euro-Par 2012 Parallel Processing}},
Doi = {10.1007/978-3-642-32820-6_48},
Editor = {Kaklamanis, Christos and Papatheodorou, Theodore and Spirakis, PaulG.},
Isbn = {978-3-642-32819-0},
Pages = {477-488},
Publisher = {Springer Berlin Heidelberg},
Series = {Lecture Notes in Computer Science},
Title = {{A Checkpoint-on-Failure Protocol for Algorithm-Based Recovery in Standard MPI}},
Url = {http://dx.doi.org/10.1007/978-3-642-32820-6_48},
Volume = {7484},
Year = {2012},
Bdsk-Url-1 = {http://dx.doi.org/10.1007/978-3-642-32820-6_48}}
@article{2013-7,
Author = {Bland, Wesley and Du, Peng and Bouteiller, Aurelien and Herault, Thomas and Bosilca, George and Dongarra, Jack J.},
Doi = {10.1002/cpe.3100},
Issn = {1532-0634},
Journal = {Concurrency and Computation: Practice and Experience},
Keywords = {fault tolerance, message passing interface, ABFT, Checkpoint-on-Failure},
Title = {{Extending the scope of the Checkpoint-on-Failure protocol for forward recovery in standard MPI}},
Url = {http://dx.doi.org/10.1002/cpe.3100},
Year = {2013},
Bdsk-Url-1 = {http://dx.doi.org/10.1002/cpe.3100}}
@techreport{2012-14,
Abstract = {In this article, we present a unified model for several well-known
checkpoint/restart protocols. The proposed model is generic enough
to encompass both extremes of the checkpoint/restart space, from
coordinated approaches to a variety of uncoordinated checkpoint strategies
(with message logging). We identify a set of crucial parameters,
instantiate them and compare the expected efficiency of the fault
tolerant protocols, for a given application/platform pair. We then
propose a detailed analysis of several scenarios, including some
of the most powerful currently available HPC platforms, as well as
anticipated Exascale designs. The results of this analytical comparison
are corroborated by a comprehensive set of simulations. Altogether,
they outline comparative behaviors of checkpoint strategies at
very large scale, thereby providing insight that is hardly accessible
to direct experimentation.},
Affiliation = {Innovative Computing Laboratory - ICL , Departement Informatique - INF , GRAND-LARGE - INRIA Saclay - Ile de France , Joint Laboratory for Petascale Computing [Illinois] - JLPC , Laboratoire de Recherche en Informatique - LRI , ROMA - ENS Lyon / CNRS / Inria Grenoble Rh{\^o}ne-Alpes , Laboratoire de l'Informatique du Parallelisme - LIP},
Author = {Bosilca, George and Bouteiller, Aurelien and Brunet, Elisabeth and Cappello, Franck and Dongarra, Jack and Guermouche, Amina and Herault, Thomas and Robert, Yves and Vivien, Frederic and Zaidouni, Dounia},
Institution = {INRIA},
Keywords = {Fault-tolerance, checkpointing, coordinated, hierarchical, model, exascale},
Month = {Oct},
Number = {RR-7950},
Pdf = {http://hal.inria.fr/hal-00696154/PDF/RR-7950.pdf},
Title = {{Unified Model for Assessing Checkpointing Protocols at Extreme-Scale}},
Type = {Research Report},
Url = {http://hal.inria.fr/hal-00696154},
Year = {2012},
Bdsk-Url-1 = {http://hal.inria.fr/hal-00696154}}
@inproceedings{2002-5,
Abstract = {Global Computing platforms, large scale clusters and future TeraGRID
systems gather thousands of nodes for computing parallel scientific
applications. At this scale, node failures or disconnections are
frequent events. This Volatility reduces the MTBF of the whole system
in the range of hours or minutes. We present MPICH-V, an automatic
Volatility tolerant MPI environment based on uncoordinated checkpoint/roll-back
and distributed message logging. MPICH-V architecture relies on Channel
Memories, Checkpoint servers and theoretically proven protocols to
execute existing or new, SPMD and Master-Worker MPI applications
on volatile nodes. To evaluate its capabilities, we run MPICH-V within
a framework for which the number of nodes, Channels Memories and
Checkpoint Servers can be completely configured as well as the node
Volatility. We present a detailed performance evaluation of every
component of MPICH-V and its global performance for non-trivial parallel
applications. Experimental results demonstrate good scalability and
high tolerance to node volatility.},
Author = {Bosilca, G. and Bouteiller, A. and Cappello, F. and Djilali, S. and Fedak, G. and Germain, C. and Herault, T. and Lemarinier, P. and Lodygensky, O. and Magniette, F. and Neri, V. and Selikhov, A.},
Booktitle = {{Supercomputing, ACM/IEEE 2002 Conference}},
Doi = {10.1109/SC.2002.10048},
Issn = {1063-9535},
Keywords = {Application software;Computer applications;Computer architecture;Computer industry;Concurrent computing;Distributed computing;Fault tolerance;Large-scale systems;Message passing;Peer to peer computing},
Pages = {29-29},
Title = {{MPICH-V: Toward a Scalable Fault Tolerant MPI for Volatile Nodes}},
Year = {2002},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/SC.2002.10048}}
@article{2008-1,
Acmid = {1514767},
Address = {Orlando, FL, USA},
Author = {Bosilca, George and Delmas, R{\'e}mi and Dongarra, Jack and Langou, Julien},
Doi = {10.1016/j.jpdc.2008.12.002},
Issn = {0743-7315},
Journal = {J. Parallel Distrib. Comput.},
Keywords = {Fault tolerance, High performance computing, Linear algebra},
Month = {apr},
Number = {4},
Numpages = {7},
Pages = {410--416},
Publisher = {Academic Press, Inc.},
Title = {{Algorithm-based fault tolerance applied to high performance computing}},
Url = {http://dx.doi.org/10.1016/j.jpdc.2008.12.002},
Volume = {69},
Year = {2008},
Bdsk-Url-1 = {http://dx.doi.org/10.1016/j.jpdc.2008.12.002}}
@article{2013-6,
Abstract = {High performance computing applications must be resilient to faults.
The traditional fault-tolerance solution is checkpoint-recovery,
by which application state is saved to and recovered from secondary
storage throughout execution. It has been shown that, even when using
an optimal checkpointing strategy, the checkpointing overhead precludes
high parallel efficiency at large scale. Additional fault-tolerance
mechanisms must thus be used. Such a mechanism is replication, that
is, multiple processors performing the same computation so that a
processor failure does not necessarily imply an application failure.
In spite of resource waste, replication can lead to higher parallel
efficiency when compared to using only checkpoint-recovery at large
scale. We propose to execute and checkpoint multiple application instances
concurrently, an approach we term group replication. For exponential
failures we give an upper bound on the expected application execution
time. This bound corresponds to a particular checkpointing period
that we derive. For general failures, we propose a dynamic programming
algorithm to determine non-periodic checkpoint dates as well as an
empirical periodic checkpointing solution whose period is found via
a numerical search. Using simulation we evaluate our proposed approaches,
including comparison to the non-replication case, for both exponential
and Weibull failure distributions. Our broad finding is that group
replication is useful in a range of realistic application and checkpointing
overhead scenarios for future exascale platforms.},
Author = {Bougeret, Marin and Casanova, Henri and Robert, Yves and Vivien, Fr{\'e}d{\'e}ric and Zaidouni, Dounia},
Doi = {10.1177/1094342013505348},
Eprint = {http://hpc.sagepub.com/content/early/2013/09/30/1094342013505348.full.pdf+html},
Journal = {International Journal of High Performance Computing Applications},
Title = {{Using group replication for resilience on exascale systems}},
Url = {http://hpc.sagepub.com/content/early/2013/09/30/1094342013505348.abstract},
Year = {2013},
Bdsk-Url-1 = {http://hpc.sagepub.com/content/early/2013/09/30/1094342013505348.abstract},
Bdsk-Url-2 = {http://dx.doi.org/10.1177/1094342013505348}}
@article{2013-13-slides,
Abstract = {As the failure frequency is increasing with the components count in
modern and future supercomputers, resilience is becoming critical
for extreme scale systems. The association of failure prediction
with proactive checkpointing seeks to reduce the effect of failures
in the execution time of parallel applications. Unfortunately, proactive
checkpointing does not systematically avoid restarting from scratch.
To mitigate this issue, failure prediction and proactive checkpointing
can be coupled with periodic checkpointing. However, blind use of
these techniques does not always improve system efficiency, because
every one of them comes with a mix of overheads and benefits. In order
to study and understand the combination of these techniques and their
improvement in the system's efficiency, we developed: (i) a prototype
combining state-of-the-art failure prediction, fast proactive checkpointing
and preventive checkpointing, (ii) a mathematical model
that reflects the expected computing efficiency of the combination
and computes the optimal checkpointing interval in this context,
(iii) a discrete event simulator to evaluate the computing efficiency
of the combination for system parameters corresponding to the current
and projected large scale HPC systems. We evaluate our proposed technique
on a large supercomputer (i.e. TSUBAME2) with production-level HPC
applications and we show that failure prediction, proactive and preventive
checkpointing can be coupled successfully, imposing only about 2\%
to 6\% of overhead in comparison with preventive checkpointing only.
Moreover, our model-based simulations show that the optimal solution
improves the computing efficiency up to 30\% in comparison with classic
periodic checkpointing. We show that the prediction recall has a
much higher impact on execution efficiency than the prediction precision.
This result suggests that researchers on failure prediction algorithms
should focus on improving the recall. We also show that the combination
of these techniques can significantly improve (by a factor 2, for
a particular configuration) the mean time between failures (MTBF)
perceived by the application.},
Address = {Los Alamitos, CA, USA},
Author = {Mohamed Slim Bouguerra and Ana Gainaru and Leonardo Bautista Gomez and Franck Cappello and Satoshi Matsuoka and Naoya Maruyama},
Doi = {http://doi.ieeecomputersociety.org/10.1109/IPDPS.2013.74},
Issn = {1530-2075},
Journal = {International Parallel and Distributed Processing Symposium},
Keywords = {Checkpointing;Fault tolerance;Fault tolerant systems;Correlation;Mathematical model;Predictive models;Computational modeling;large scale HPC systems;Failure prediction;multilevel checkpointing;resilience},
Pages = {501-512},
Publisher = {IEEE Computer Society},
Series = {IPDPS 2013},
Title = {{Improving the Computing Efficiency of HPC Systems Using a Combination of Proactive and Preventive Checkpointing}},
Volume = {0},
Year = {2013},
Bdsk-Url-1 = {http://doi.ieeecomputersociety.org/10.1109/IPDPS.2013.74}}
@article{2010-6,
Abstract = {Over the past decade the number of processors used in high performance
computing has increased to hundreds of thousands. As a direct consequence,
and while the computational power follows the trend, the mean time
between failures (MTBF) has suffered and is now being counted in
hours. In order to circumvent this limitation, a number of fault-tolerant
algorithms as well as execution environments have been developed
using the message passing paradigm. Among them, message logging has
been proved to achieve a better overall performance when the MTBF
is low, mainly due to a faster failure recovery. However, message
logging suffers from a high overhead when no failure occurs. Therefore,
in this paper we discuss a refinement of the message logging model
intended to improve the failure-free message logging performance.
The proposed approach simultaneously removes useless memory copies
and reduces the number of logged events. We present the implementation
of a pessimistic message logging protocol in Open MPI and compare
it with the previous reference implementation MPICH-V2. The results
outline a several order of magnitude improvement on the performance
and a zero overhead for most messages. Published in 2010 by John
Wiley & Sons, Ltd.},
Author = {Bouteiller, Aurelien and Bosilca, George and Dongarra, Jack},
Doi = {10.1002/cpe.1589},
Issn = {1532-0634},
Journal = {Concurrency and Computation: Practice and Experience},
Keywords = {high performance computing, fault tolerance, message logging, uncoordinated checkpoint},
Number = {16},
Pages = {2196--2211},
Publisher = {John Wiley \& Sons, Ltd.},
Title = {{Redesigning the message logging model for high performance}},
Url = {http://dx.doi.org/10.1002/cpe.1589},
Volume = {22},
Year = {2010},
Bdsk-Url-1 = {http://dx.doi.org/10.1002/cpe.1589}}
@inproceedings{2013-11,
Author = {Aurelien Bouteiller and Franck Cappello and Jack Dongarra and Amina Guermouche and Thomas Herault and Yves Robert},
Booktitle = {{Euro-Par}},
Ee = {http://dx.doi.org/10.1007/978-3-642-40047-6_43},
Pages = {420-431},
Series = {Euro-Par 2013},
Title = {{Multi-criteria Checkpointing Strategies: Response-Time versus Resource Utilization}},
Year = {2013}}
@inproceedings{2011-13,
Acmid = {2033415},
Address = {Berlin, Heidelberg},
Author = {Bouteiller, Aurelien and Herault, Thomas and Bosilca, George and Dongarra, Jack J.},
Booktitle = {{Proceedings of the 17th international conference on Parallel processing}},
Isbn = {978-3-642-23396-8},
Location = {Bordeaux, France},
Numpages = {14},
Pages = {51--64},
Publisher = {Springer-Verlag},
Series = {Euro-Par 2011},
Title = {{Correlated set coordination in fault tolerant message logging protocols}},
Url = {http://dl.acm.org/citation.cfm?id=2033408.2033415},
Volume = {Part II},
Year = {2011},
Bdsk-Url-1 = {http://dl.acm.org/citation.cfm?id=2033408.2033415}}
@article{2006-9,
Abstract = {High performance computing platforms like Clusters, Grid and Desktop
Grids are becoming larger and subject to more frequent failures.
MPI is one of the most used message passing library in HPC applications.
These two trends raise the need for fault tolerant MPI. The MPICH-V
project focuses on designing, implementing and comparing several
automatic fault tolerance protocols for MPI applications. We present
an extensive related work section highlighting the originality of
our approach and the proposed protocols. We present then four fault
tolerant protocols implemented in a new generic framework for fault
tolerant protocol comparison, covering a large spectrum of known
approaches from coordinated checkpoint, to uncoordinated checkpoint
associated with causal message logging. We measure the performance
of these protocols on a microbenchmark and compare them for the NAS
benchmark, using an original fault tolerance test. Finally, we outline
the lessons learned from this in depth fault tolerant protocol comparison
for MPI applications.},
Author = {Bouteiller, A. and Herault, T. and Krawezik, G. and Lemarinier, P. and Cappello, F.},
Doi = {10.1177/1094342006067469},
Eprint = {http://hpc.sagepub.com/content/20/3/319.full.pdf+html},
Journal = {International Journal of High Performance Computing Applications},
Number = {3},
Pages = {319-333},
Title = {{MPICH-V Project: A Multiprotocol Automatic Fault-Tolerant MPI}},
Url = {http://hpc.sagepub.com/content/20/3/319.abstract},
Volume = {20},
Year = {2006},
Bdsk-Url-1 = {http://hpc.sagepub.com/content/20/3/319.abstract},
Bdsk-Url-2 = {http://dx.doi.org/10.1177/1094342006067469}}
@inproceedings{2009-10,
Abstract = {With the growing scale of high performance computing platforms, fault
tolerance has become a major issue. Among the various approaches
for providing fault tolerance to MPI applications, message logging
has been proved to tolerate higher failure rate. However, this advantage
comes at the expense of a higher overhead on communications, due
to latency intrusive logging of events to a stable storage. Previous
work proposed and evaluated several protocols relaxing the synchronicity
of event logging to moderate this overhead. Recently, the model of
message logging has been refined to better match the reality of high
performance network cards, where message receptions are decomposed
in multiple interdependent events. According to this new model, deterministic
and non-deterministic events are clearly discriminated, reducing
the overhead induced by message logging. In this paper we compare,
experimentally, a pessimistic and an optimistic message logging protocol,
using this new model and implemented in the Open MPI library. Although
pessimistic and optimistic message logging are, respectively, the
most and least synchronous message logging paradigms, experiments
show that most of the time their performance is comparable.},
Author = {Bouteiller, A. and Ropars, T. and Bosilca, G. and Morin, C. and Dongarra, J.},
Booktitle = {{IEEE International Conference on Cluster Computing and Workshops, CLUSTER 2009}},
Doi = {10.1109/CLUSTR.2009.5289157},
Issn = {1552-5244},
Keywords = {fault tolerant computing;libraries;message passing;parallel machines;protocols;MPI failure recovery;Open MPI library;fault tolerance;high performance computing;high performance network cards;message logging protocol;message passing interface;Delay;Fault tolerance;High performance computing;Laboratories;Libraries;Lifting equipment;Message passing;Network interfaces;Protocols;Usability},
Pages = {1-9},
Title = {{Reasons for a pessimistic or optimistic message logging protocol in MPI uncoordinated failure, recovery}},
Year = {2009},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/CLUSTR.2009.5289157}}
@inproceedings{2003-5,
Abstract = {Execution of MPI applications on clusters and Grid deployments suffering
from node and network failures motivates the use of fault tolerant
MPI implementations. We present MPICH-V2 (the second protocol of
MPICH-V project), an automatic fault tolerant MPI implementation
using an innovative protocol that removes the most limiting factor
of the pessimistic message logging approach: reliable logging of
in transit messages. MPICH-V2 relies on uncoordinated checkpointing,
sender based message logging and remote reliable logging of message
logical clocks. This paper presents the architecture of MPICH-V2,
its theoretical foundation and the performance of the implementation.
We compare MPICH-V2 to MPICH-V1 and MPICH-P4 evaluating a) its point-to-point
performance, b) the performance for the NAS benchmarks, c) the application
performance when many faults occur during the execution. Experimental
results demonstrate that MPICH-V2 provides performance close to MPICH-P4
for applications using large messages while reducing dramatically
the number of reliable nodes compared to MPICH-V1.},
Author = {Bouteiller, A. and Cappello, F. and Herault, T. and Krawezik, G. and Lemarinier, P. and Magniette, F.},
Booktitle = {{Supercomputing, 2003 ACM/IEEE Conference}},
Doi = {10.1109/SC.2003.10027},
Keywords = {Checkpointing;Clocks;Costs;Fault tolerance;High performance computing;Message passing;Permission;Programming profession;Protocols;Uniform resource locators},
Pages = {25-25},
Title = {{MPICH-V2: a Fault Tolerant MPI for Volatile Nodes based on Pessimistic Sender Based Message Logging}},
Year = {2003},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/SC.2003.10027}}
@inproceedings{2003-6,
Abstract = {MPI is one of the most adopted programming models for large clusters
and grid deployments. However, these systems often suffer from network
or node failures. This raises the issue of selecting a fault tolerance
approach for MPI. Automatic and transparent ones are based on either
coordinated checkpointing or message logging associated with uncoordinated
checkpoint. There are many protocols, implementations and optimizations
for these approaches but few results about their comparison. Coordinated
checkpoint has the advantage of a very low overhead on fault free
executions. In contrary a message logging protocol systematically
adds a significant message transfer penalty. The drawbacks of coordinated
checkpoint come from its synchronization cost at checkpoint and restart
times. In this paper we implement, evaluate and compare the two kinds
of protocols with a special emphasis on their respective performance
according to fault frequency. The main conclusion (under our experimental
conditions) is that message logging becomes relevant for a large
scale cluster from one fault every hour for applications with large
dataset.},
Author = {Bouteiller, A. and Lemarinier, P. and Krawezik, G. and Cappello, F.},
Booktitle = {{Proceedings of the IEEE International Conference on Cluster Computing}},
Doi = {10.1109/CLUSTR.2003.1253321},
Keywords = {distributed programming;fault tolerant computing;grid computing;message passing;performance evaluation;system recovery;workstation clusters;PC clusters;coordinated checkpoint;coordinated checkpointing;fault free executions;fault frequency;fault tolerant MPI;grid computing;message log;message logging protocol;message transfer penalty;network failures;node failures;performance evaluation;programming models;restart times;synchronization cost;Checkpointing;Clouds;Computer fault tolerance;Costs;Electronic mail;Fault tolerance;Frequency synchronization;High performance computing;Large-scale systems;Message passing;Protocols;System recovery},
Pages = {242-250},
Title = {{Coordinated checkpoint versus message log for fault tolerant MPI}},
Year = {2003},
Bdsk-Url-1 = {http://dx.doi.org/10.1109/CLUSTR.2003.1253321}}
@misc{2004-7,
Author = {P. J. Braam},
Howpublished = {\url{http://www.lustre.org/docs.html}},
Note = {Accessed: 2014-03-15},
Publisher = {Cluster File Systems, Inc.},
Title = {{Lustre: A Scalable, High Performance File System}},
Year = {2004}}
@article{2003-2,
Acmid = {781513},
Address = {New York, NY, USA},
Author = {Bronevetsky, Greg and Marques, Daniel and Pingali, Keshav and Stodghill, Paul},
Doi = {10.1145/966049.781513},
Issn = {0362-1340},
Issue_Date = {October 2003},
Journal = {SIGPLAN Not.},
Keywords = {MPI, application-level checkpointing, fault-tolerance, non-FIFO communication, scientific computing},
Month = {Jun},
Number = {10},
Numpages = {11},
Pages = {84--94},
Publisher = {ACM},
Title = {{Automated application-level checkpointing of MPI programs}},
Url = {http://doi.acm.org/10.1145/966049.781513},
Volume = {38},
Year = {2003},
Bdsk-Url-1 = {http://doi.acm.org/10.1145/966049.781513},
Bdsk-Url-2 = {http://dx.doi.org/10.1145/966049.781513}}
@techreport{2009-7,
Author = {Bronevetsky, Greg and Moody, Adam},
Institution = {Lawrence Livermore National Laboratory, Livermore, CA, USA},
Number = {TR-JLPC-09-01},
Title = {{Scalable I/O systems via node-local storage: Approaching 1 TB/sec file I/O}},
Year = {2009}}
@inproceedings{2004-3,
Author = {David Callahan and Bradford L. Chamberlain and Hans P. Zima},
Booktitle = {{Ninth International Workshop on High-Level Parallel Programming Models and Supportive Environments}},
Pages = {52--60},
Series = {HIPS'04},
Title = {{The Cascade High Productivity Language}},
Year = {2004}}
@article{2014-1,
Abstract = {InfiniBand is widely used for low-latency, high-throughput cluster
computing. Saving the state of the InfiniBand network as part of
distributed checkpointing has been a long-standing challenge for
researchers. Because of a lack of a solution, typical MPI implementations
have included custom checkpoint-restart services that "tear down"
the network, checkpoint each node as if the node were a standalone
computer, and then re-connect the network again. We present the first
example of transparent, system-initiated checkpoint-restart that
directly supports InfiniBand. The new approach is independent of
any particular Linux kernel, thus simplifying the current practice
of using a kernel-based module, such as BLCR. This direct approach
results in checkpoints that are found to be faster than with the
use of a checkpoint-restart service. The generality of this approach
is shown not only by checkpointing an MPI computation, but also a
native UPC computation (Berkeley Unified Parallel C), which does
not use MPI. Scalability is shown by checkpointing 2,048 MPI processes
across 128 nodes (with 16 cores per node). In addition, a cost-effective
debugging approach is also enabled, in which a checkpoint image from
an InfiniBand-based production cluster is copied to a local Ethernet-based
cluster, where it can be restarted and an interactive debugger can