Add Martin paper

lichess-org · Feb 10, 2025 · 79d74ff · 79d74ff
1 parent 84f2cc7
commit 79d74ff
Showing 1 changed file with 13 additions and 0 deletions.
diff --git a/lichess.bib b/lichess.bib
@@ -445,6 +445,19 @@ @article{maharaj:2022:gambits-theory-evidence
   abstract      = {Abstract Gambits are central to human decision-making. Our goal is to provide a theory of Gambits. A Gambit is a combination of psychological and technical factors designed to disrupt predictable play. Chess provides an environment to study gambits and behavioral game theory. Our theory is based on the Bellman optimality path for sequential decision-making. This allows us to calculate the Q\$\$ Q \$\$-values of a Gambit where material (usually a pawn) is sacrificed for dynamic play. On the empirical side, we study the effectiveness of a number of popular chess Gambits. This is a natural setting as chess Gambits require a sequential assessment of a set of moves (a.k.a. policy) after the Gambit has been accepted. Our analysis uses Stockfish 14.1 to calculate the optimal Bellman Q\$\$ Q \$\$-values, which fundamentally measures if a position is winning or losing. To test whether Bellman's equation holds in play, we estimate the transition probabilities to the next board state via a database of expert human play. This then allows us to test whether the Gambiteer is following the optimal path in his decision-making. Our methodology is applied to the popular Stafford and reverse Stafford (a.k.a. Boden–Kieretsky–Morphy) Gambit and other common ones including the Smith–Morra, Goring, Danish and Halloween Gambits. We build on research in human decision-making by proving an irrational skewness preference within agents in chess. We conclude with directions for future research.},
 }
 
+@article{martin:2025:re-evaluating-metamorphic-testing-chess-engines-replication-study,
+  title         = {Re-evaluating metamorphic testing of chess engines: A replication study},
+  author        = {Axel Martin and Djamel Eddine Khelladi and Th{\'e}o Matricon and Mathieu Acher},
+  year          = {2025},
+  journal       = {Information and Software Technology},
+  pages         = {107679},
+  doi           = {10.1016/j.infsof.2025.107679},
+  issn          = {0950-5849},
+  url           = {https://www.sciencedirect.com/science/article/pii/S0950584925000187},
+  keywords      = {Reproducibility, Replicability, Metamorphic testing, Chess engines},
+  abstract      = {Context: This study aims to confirm, replicate and extend the findings of a previous article entitled ``Metamorphic Testing of Chess Engines'' that reported inconsistencies in the analyses provided by Stockfish, the most widely used chess engine, for transformed chess positions that are fundamentally identical. Initial findings, under conditions strictly identical to those of the original study, corroborate the reported inconsistencies. Objective: However, the original article considers a specific dataset (including randomly generated chess positions, end-games, or checkmate problems) and very low analysis depth (10 plies, corresponding to 5 moves; a ply refers to a single turn taken by one player in a game, and two plies, one from each player, together constitute a complete move). These decisions pose threats that limit generalizability of the results, but also their practical usefulness both for chess players and maintainers of Stockfish. Thus, we replicate the original study. Methods: We consider this time (1) positions derived from actual chess games, (2) analyses at appropriate and larger depths, and (3) different versions of Stockfish. We conduct novel experiments on thousands of positions, employing significantly deeper searches. Results: The replication results show that the Stockfish chess engines demonstrate significantly greater consistency in its evaluations. The metamorphic relations are not as effective as in the original article, especially on realistic chess positions. We also demonstrate that, for any given position, there exists a depth threshold beyond which further increases in depth do not result in any evaluation differences for the studied metamorphic relations. We perform an in-depth analysis to identify and clarify the implementation reasons behind Stockfish's inconsistencies when dealing with transformed positions. Conclusion: A first concrete result is thus that metamorphic testing of chess engines is not yet an effective technique for finding faults of Stockfish. Another result is the lessons learned through this replication effort: metamorphic relations must be verified in the context of the domain's specificities; without such contextual validation, they may lead to misleading or irrelevant conclusions; changes in parameters and input dataset can drastically alter the effectiveness of a testing method.},
+}
+
 @inproceedings{mcilroy-young:2020:aligning-superhuman-ai-human-behavior,
   title         = {Aligning Superhuman {AI} with Human Behavior: Chess as a Model System},
   author        = {Reid McIlroy{-}Young and Siddhartha Sen and Jon M. Kleinberg and Ashton Anderson},