From d2718ddc4a923a63405e0caaf181e9d3952185df Mon Sep 17 00:00:00 2001
From: Michael Adler <therisen06@gmail.com>
Date: Mon, 4 Dec 2023 22:36:26 +0100
Subject: [PATCH] Initial commit

---
 .clang-format             |   13 +
 .github/dependabot.yml    |    6 +
 .github/workflows/ci.yml  |   12 +
 .gitignore                |    2 +
 LICENSE                   |  201 +++
 README.md                 |   57 +
 flake.lock                |   78 +
 flake.nix                 |   22 +
 include/aoc/all.h         |   15 +
 include/aoc/io.h          |   22 +
 include/aoc/log.h         |   58 +
 include/aoc/macros.h      |   34 +
 include/aoc/math.h        |   59 +
 include/aoc/md5.h         |   21 +
 include/aoc/parser.h      |   55 +
 include/aoc/point.h       |   21 +
 include/aoc/stb_sprintf.h | 1906 +++++++++++++++++++++++
 include/aoc/string.h      |    3 +
 include/solve.h           |   11 +
 input/README.md           |    6 +
 justfile                  |   32 +
 lib/aoc/io.c              |   36 +
 lib/aoc/log.c             |  154 ++
 lib/aoc/math.c            |   69 +
 lib/aoc/md5.c             |  227 +++
 lib/aoc/parser.c          |   37 +
 lib/aoc/point.c           |    4 +
 lib/aoc/stb_sprintf.c     |    2 +
 lib/aoc/string.c          |    6 +
 meson.build               |   48 +
 meson.options             |    1 +
 meson_options.txt         |    1 +
 puzzle/day01.md           |   57 +
 puzzle/day02.md           |   67 +
 puzzle/day03.md           |   76 +
 puzzle/day04.md           |   87 ++
 src/day01/solve.c         |   86 +
 src/day01/solve_test.c    |   45 +
 src/day02/solve.c         |   69 +
 src/day02/solve_test.c    |   33 +
 src/day03/solve.c         |  107 ++
 src/day03/solve_test.c    |   38 +
 src/day04/solve.c         |   74 +
 src/day04/solve_test.c    |   34 +
 src/day04/sort.c          |   10 +
 src/main.c                |   23 +
 src/template/solve.c      |   30 +
 src/template/solve_test.c |   30 +
 vendor/ctest/ctest.h      |  610 ++++++++
 vendor/ctl/ctl.h          |   19 +
 vendor/ctl/deq.h          |  470 ++++++
 vendor/ctl/lst.h          |  417 +++++
 vendor/ctl/pqu.h          |  139 ++
 vendor/ctl/que.h          |   49 +
 vendor/ctl/set.h          |  761 +++++++++
 vendor/ctl/stk.h          |   53 +
 vendor/ctl/str.h          |  195 +++
 vendor/ctl/ust.h          |  455 ++++++
 vendor/ctl/vec.h          |  407 +++++
 vendor/sort/sort.h        | 3097 +++++++++++++++++++++++++++++++++++++
 60 files changed, 10757 insertions(+)
 create mode 100644 .clang-format
 create mode 100644 .github/dependabot.yml
 create mode 100644 .github/workflows/ci.yml
 create mode 100644 .gitignore
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 flake.lock
 create mode 100644 flake.nix
 create mode 100644 include/aoc/all.h
 create mode 100644 include/aoc/io.h
 create mode 100644 include/aoc/log.h
 create mode 100644 include/aoc/macros.h
 create mode 100644 include/aoc/math.h
 create mode 100644 include/aoc/md5.h
 create mode 100644 include/aoc/parser.h
 create mode 100644 include/aoc/point.h
 create mode 100644 include/aoc/stb_sprintf.h
 create mode 100644 include/aoc/string.h
 create mode 100644 include/solve.h
 create mode 100644 input/README.md
 create mode 100644 justfile
 create mode 100644 lib/aoc/io.c
 create mode 100644 lib/aoc/log.c
 create mode 100644 lib/aoc/math.c
 create mode 100644 lib/aoc/md5.c
 create mode 100644 lib/aoc/parser.c
 create mode 100644 lib/aoc/point.c
 create mode 100644 lib/aoc/stb_sprintf.c
 create mode 100644 lib/aoc/string.c
 create mode 100644 meson.build
 create mode 100644 meson.options
 create mode 120000 meson_options.txt
 create mode 100644 puzzle/day01.md
 create mode 100644 puzzle/day02.md
 create mode 100644 puzzle/day03.md
 create mode 100644 puzzle/day04.md
 create mode 100644 src/day01/solve.c
 create mode 100644 src/day01/solve_test.c
 create mode 100644 src/day02/solve.c
 create mode 100644 src/day02/solve_test.c
 create mode 100644 src/day03/solve.c
 create mode 100644 src/day03/solve_test.c
 create mode 100644 src/day04/solve.c
 create mode 100644 src/day04/solve_test.c
 create mode 100644 src/day04/sort.c
 create mode 100644 src/main.c
 create mode 100644 src/template/solve.c
 create mode 100644 src/template/solve_test.c
 create mode 100644 vendor/ctest/ctest.h
 create mode 100644 vendor/ctl/ctl.h
 create mode 100644 vendor/ctl/deq.h
 create mode 100644 vendor/ctl/lst.h
 create mode 100644 vendor/ctl/pqu.h
 create mode 100644 vendor/ctl/que.h
 create mode 100644 vendor/ctl/set.h
 create mode 100644 vendor/ctl/stk.h
 create mode 100644 vendor/ctl/str.h
 create mode 100644 vendor/ctl/ust.h
 create mode 100644 vendor/ctl/vec.h
 create mode 100644 vendor/sort/sort.h

diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..805cdf8
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,13 @@
+---
+# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
+BasedOnStyle: LLVM
+IndentWidth: 4
+UseTab: Never
+BreakBeforeBraces: Attach
+IndentCaseLabels: false
+AllowShortIfStatementsOnASingleLine: AllIfsAndElse
+AllowShortBlocksOnASingleLine: Always
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortEnumsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: All
+AllowShortLoopsOnASingleLine: true
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000..8ac6b8c
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,6 @@
+version: 2
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "monthly"
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..39c1905
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,12 @@
+name: ci
+on: [push, pull_request, workflow_dispatch]
+jobs:
+
+  test:
+    runs-on: ubuntu-latest
+    name: Test
+    steps:
+      - uses: actions/checkout@v4
+      - run: sudo apt-get update && sudo apt-get install -y meson ninja-build
+      - run: meson setup build --buildtype debugoptimized -Db_sanitize=address -Db_lundef=false
+      - run: ninja -C build test
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..03538aa
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+compile_commands.json
+input/*.txt
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..261eeb9
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..55ab4f9
--- /dev/null
+++ b/README.md
@@ -0,0 +1,57 @@
+[![ci](https://github.com/michaeladler/aoc-2023/actions/workflows/ci.yml/badge.svg)](https://github.com/michaeladler/aoc-2023/actions/workflows/ci.yml)
+
+# Advent of Code 2023 Solutions in C
+
+Welcome to my repository where I share my journey through the [Advent of Code 2023](https://adventofcode.com/2023) challenges.
+This year, I've chosen to tackle the puzzles using the good old C programming language.
+
+## Usage
+
+For the initial day's puzzle, ensure that the input file is titled `day01.txt` and is located within the [input](input) directory, specifically at `input/day01.txt`.
+Follow the same pattern for subsequent days.
+
+## Building and Running Tests
+
+Each puzzle in this repository is accompanied by tests, which are built from both the examples given in the puzzle text and the actual puzzle input (`input/dayNN.txt`).
+Since **sharing the puzzle input data is discouraged by AoC guidelines**, the puzzle input data is not part of this repository.
+
+Tests that use the actual puzzle input have to be enabled when configuring the build with the `-Dhave-inputs=true` option.
+**Note**: This is only useful with my own puzzle inputs; your inputs will differ from mine, so those tests will fail for you.
+
+### Release Build
+
+```bash
+meson setup build --buildtype release -Dhave-inputs=true
+```
+
+### Debug Build
+
+For a debug-optimized build with address sanitization:
+
+```bash
+meson setup build_debug --buildtype debugoptimized -Db_sanitize=address -Db_lundef=false -Dhave-inputs=true
+```
+
+## Performance Benchmarks
+
+I've benchmarked the solutions on different processors to give you an idea of their performance:
+
+| Day | Intel i7-11850H | AMD Ryzen 5 PRO 4650U |
+| --- | --------------- | --------------------- |
+| 1   | 351 µs          | 520 µs                |
+| 2   | 282 µs          | 209 µs                |
+| 3   | 348 µs          | 678 µs                |
+| 4   | 447 µs          | 1 ms                  |
+
+## Acknowledgments and Resources
+
+I've utilized several libraries and drawn inspiration from various sources in the C programming community.
+A huge thanks to the authors of these resources:
+
+- [Cauldron](https://github.com/camel-cdr/cauldron) - A collection of useful C code snippets and utilities.
+- [C Algorithms](https://github.com/fragglet/c-algorithms) - A library of common data structures and algorithms written in C.
+- [CTL](https://github.com/glouw/ctl/) - A library for container types for C.
+- [CTest](https://github.com/bvdberg/ctest) - A unit testing framework for C.
+- [STB](https://github.com/nothings/stb) - Single-file public domain libraries for C/C++.
+- [Log.c](https://github.com/rxi/log.c) - A simple logging library implemented in C.
+- [Sort](https://github.com/swenson/sort/) - A collection of sorting algorithms implemented in C.
diff --git a/flake.lock b/flake.lock
new file mode 100644
index 0000000..995a9de
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,78 @@
+{
+  "nodes": {
+    "flake-utils": {
+      "inputs": {
+        "systems": "systems"
+      },
+      "locked": {
+        "lastModified": 1694529238,
+        "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "mini-compile-commands": {
+      "flake": false,
+      "locked": {
+        "lastModified": 1689962671,
+        "narHash": "sha256-qPyV3iU8rXhIlHEELAtu2UwXtdZgsWMK9dHySlVa8Jo=",
+        "owner": "danielbarter",
+        "repo": "mini_compile_commands",
+        "rev": "073ad72d27287f3e8073c3ef0c9069223fb2048c",
+        "type": "github"
+      },
+      "original": {
+        "owner": "danielbarter",
+        "repo": "mini_compile_commands",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1701068326,
+        "narHash": "sha256-vmMceA+q6hG1yrjb+MP8T0YFDQIrW3bl45e7z24IEts=",
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "8cfef6986adfb599ba379ae53c9f5631ecd2fd9c",
+        "type": "github"
+      },
+      "original": {
+        "owner": "NixOS",
+        "ref": "nixos-unstable",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "flake-utils": "flake-utils",
+        "mini-compile-commands": "mini-compile-commands",
+        "nixpkgs": "nixpkgs"
+      }
+    },
+    "systems": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644
index 0000000..e5747dd
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,22 @@
+{
+  inputs.nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
+  inputs.flake-utils.url = "github:numtide/flake-utils";
+  inputs.mini-compile-commands = { url = "github:danielbarter/mini_compile_commands"; flake = false; }; # plain source tree (not a flake); loaded via callPackage below
+
+  outputs = { self, nixpkgs, flake-utils, mini-compile-commands }:
+    flake-utils.lib.eachDefaultSystem (system:
+      let pkgs = nixpkgs.legacyPackages.${system};
+
+      in
+      rec {
+
+        devShell = with pkgs;
+          let
+            mcc-env = (callPackage mini-compile-commands { }).wrap clangStdenv;
+          in
+          (mkShell.override { stdenv = mcc-env; }) {
+            buildInputs = [ meson ninja ];
+          };
+
+      });
+}
diff --git a/include/aoc/all.h b/include/aoc/all.h
new file mode 100644
index 0000000..4ce6b2a
--- /dev/null
+++ b/include/aoc/all.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <assert.h>
+#include <limits.h>
+#include <stdbool.h>
+
+#include "aoc/io.h"
+#include "aoc/log.h"
+#include "aoc/macros.h"
+#include "aoc/math.h"
+#include "aoc/md5.h"
+#include "aoc/parser.h"
+#include "aoc/point.h"
+#include "aoc/stb_sprintf.h"
+#include "aoc/string.h"
diff --git a/include/aoc/io.h b/include/aoc/io.h
new file mode 100644
index 0000000..c6f63b3
--- /dev/null
+++ b/include/aoc/io.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <stddef.h>
+
+/**
+ * Reads data from a file into a buffer.
+ *
+ * This function opens the file specified by `fname` and reads its contents
+ * into a buffer `buf` provided by the caller. The amount of data read is
+ * limited by `buf_size`.
+ *
+ * @param fname Pointer to a null-terminated string that specifies the name of
+ * the file to be read.
+ * @param buf Pointer to a buffer where the read data should be stored.
+ * @param buf_size The size of the buffer `buf`, representing the maximum number
+ * of bytes to be read.
+ *
+ * @return The function returns the number of bytes successfully read into the
+ * buffer. If an error occurs, a negative value is returned to indicate the
+ * error.
+ */
+int read_input(const char *fname, char *buf, size_t buf_size);
diff --git a/include/aoc/log.h b/include/aoc/log.h
new file mode 100644
index 0000000..ce99393
--- /dev/null
+++ b/include/aoc/log.h
@@ -0,0 +1,58 @@
+/**
+ * Copyright (c) 2017 rxi
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the MIT license. See `log.c` for details.
+ */
+
+#ifndef LOG_H
+#define LOG_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdarg.h>
+
+#define LOG_VERSION "0.2.0"
+
+typedef void (*log_LockFn)(void *udata, int lock);
+
+enum { LOG_TRACE, LOG_DEBUG, LOG_INFO, LOG_WARN, LOG_ERROR, LOG_FATAL };
+
+#if defined(NDEBUG) || defined(RELEASE)
+#define log_trace(...) do { } while(0)
+#define log_debug(...) do { } while(0)
+#else
+#define log_trace(...) log_log(LOG_TRACE, 0, __FILE__, __LINE__, __VA_ARGS__)
+#define log_debug(...) log_log(LOG_DEBUG, 0, __FILE__, __LINE__, __VA_ARGS__)
+#endif
+#define log_info(...)  log_log(LOG_INFO,  0, __FILE__, __LINE__, __VA_ARGS__)
+#define log_warn(...)  log_log(LOG_WARN,  0, __FILE__, __LINE__, __VA_ARGS__)
+#define log_error(...) log_log(LOG_ERROR, 0, __FILE__, __LINE__, __VA_ARGS__)
+#define log_perror(...) log_log(LOG_ERROR, 1, __FILE__, __LINE__, __VA_ARGS__)
+#define log_fatal(...) log_log(LOG_FATAL, 0, __FILE__, __LINE__, __VA_ARGS__)
+
+void log_set_udata(void *udata);
+void log_set_lock(log_LockFn fn);
+void log_set_fp(FILE *fp);
+void log_set_level(int level);
+void log_set_file_level(int level);
+void log_set_quiet(int enable);
+
+void log_log(int level, const int do_perror, const char *file, int line, const char *fmt, ...);
+
+#define LOGT log_trace
+#define LOGD log_debug
+#define LOGI log_info
+#define LOGW log_warn
+#define LOGE log_error
+#define LOGP log_perror
+#define LOGF log_fatal
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/aoc/macros.h b/include/aoc/macros.h
new file mode 100644
index 0000000..b31c940
--- /dev/null
+++ b/include/aoc/macros.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <stddef.h>
+
+#define DIE(...)                                                               \
+    do {                                                                       \
+        fprintf(stderr, __VA_ARGS__);                                          \
+        return -1;                                                             \
+    } while (0)
+
+#define _unused_ __attribute__((__unused__))
+#define _cleanup_(x) __attribute__((__cleanup__(x)))
+#define _likely_(x) (__builtin_expect(!!(x), 1))
+#define _unlikely_(x) (__builtin_expect(!!(x), 0))
+#define MAX(a, b)                                                              \
+    ({                                                                         \
+        typeof(a) _a = (a);                                                    \
+        typeof(b) _b = (b);                                                    \
+        _a > _b ? _a : _b;                                                     \
+    })
+#define MIN(a, b)                                                              \
+    ({                                                                         \
+        typeof(a) _a = (a);                                                    \
+        typeof(b) _b = (b);                                                    \
+        _a < _b ? _a : _b;                                                     \
+    })
+
+#define SWAP_VARS(x, y)                                                        \
+    do {                                                                       \
+        typeof(x) temp = (x);                                                  \
+        (x) = (y);                                                             \
+        (y) = temp;                                                            \
+    } while (0)
+#define ARRAY_LENGTH(x) (sizeof(x) / sizeof((x)[0]))
diff --git a/include/aoc/math.h b/include/aoc/math.h
new file mode 100644
index 0000000..96771e9
--- /dev/null
+++ b/include/aoc/math.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include <stddef.h>
+
+/**
+ *  "itoa" based on Kernighan & Ritchie's "Ansi C".
+ *  Converts value to a string and stores the result in str.
+ *  Returns the length of str.
+ */
+size_t itoa(int value, char *str, int base);
+
+/**
+ * Calculate the greatest common divisor (GCD) of two integers 'a' and 'b'
+ * using the extended Euclidean algorithm. Additionally, compute the Bézout
+ * coefficients 'x' and 'y' such that:
+ *
+ *   gcd(a, b) = a*x + b*y
+ *
+ * The function updates the values pointed to by 'x' and 'y' to store the
+ * Bézout coefficients.
+ *
+ * @param a The first integer
+ * @param b The second integer
+ * @param x Pointer to an integer to store the Bézout coefficient 'x'
+ * @param y Pointer to an integer to store the Bézout coefficient 'y'
+ * @return The greatest common divisor (GCD) of 'a' and 'b'
+ */
+int gcdx(int a, int b, int *x, int *y);
+
+/**
+ * Calculate the modular residue of an integer 'n' modulo 'mod' using
+ * a faster computation technique suitable for positive integers.
+ *
+ * @param n The integer to compute the modular residue for
+ * @param mod The modulus value
+ * @return The modular residue of 'n' modulo 'mod'
+ */
+int fastmod(int n, int mod);
+
+/**
+ * Calculate the modular multiplicative inverse of an integer 'b' modulo 'mod'.
+ * The function finds an integer 'x' such that (b * x) % mod == 1.
+ *
+ * @param b The integer for which to find the modular inverse
+ * @param mod The modulus value
+ * @return The modular multiplicative inverse of 'b' modulo 'mod', or -1 if no
+ *         inverse exists (when 'b' and 'mod' are not coprime).
+ */
+int modinv(int b, int mod);
+
+/**
+ * Determine the sign of an integer 'val'.
+ *
+ * @param val The integer to determine the sign of
+ * @return  1 if 'val' is positive,
+ *         -1 if 'val' is negative,
+ *          0 if 'val' is zero.
+ */
+int sgn(int val);
diff --git a/include/aoc/md5.h b/include/aoc/md5.h
new file mode 100644
index 0000000..75d3ffd
--- /dev/null
+++ b/include/aoc/md5.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+typedef struct {
+    uint64_t size;      // Size of input in bytes
+    uint32_t buffer[4]; // Current accumulation of hash
+    uint8_t input[64];  // Input to be used in the next step
+    uint8_t digest[16]; // Result of algorithm
+} MD5Context;
+
+void md5Init(MD5Context *ctx);
+void md5Update(MD5Context *ctx, uint8_t *input, size_t input_len);
+void md5Finalize(MD5Context *ctx);
+void md5Step(uint32_t *buffer, uint32_t *input);
+
+void md5String(char *input, uint8_t *result);
+void md5File(FILE *file, uint8_t *result);
diff --git a/include/aoc/parser.h b/include/aoc/parser.h
new file mode 100644
index 0000000..ba8859b
--- /dev/null
+++ b/include/aoc/parser.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#include <stddef.h>
+
+/**
+ * Parses a non-negative integer from a given string.
+ *
+ * This function attempts to parse and return a non-negative integer from the
+ * provided character array. It begins parsing from the position pointed to by
+ * `pos` and updates this value to the position immediately after the last
+ * parsed character. This function skips any leading whitespace.
+ *
+ * If the parsing is successful, the function returns the parsed non-negative
+ * integer. In case of an error, the function returns a negative number
+ * (NOTE(review): where `*pos` is left after an error is not specified here).
+ *
+ * @param buf A pointer to the null-terminated character array from which the
+ * integer is parsed.
+ * @param pos A pointer to a `size_t` variable. On function entry, it should
+ * point to the index in `buf` from where the parsing should start. On function
+ * exit, it is updated to the index immediately after the last parsed character.
+ *
+ * @return The parsed non-negative integer on success, or a negative number on
+ * error.
+ */
+int aoc_parse_nonnegative(const char *buf, size_t *pos);
+
+/**
+ * Skips over any whitespace characters in the buffer, starting from the
+ * position pointed to by pos.
+ *
+ * @param buf The buffer to parse for whitespace characters.
+ * @param pos Pointer to the size_t variable that holds the starting position
+ * for skipping whitespace and is updated to the position after the skipped
+ * whitespace.
+ *
+ * @note The function updates *pos to the position immediately following the
+ * last whitespace character encountered. If there are no whitespace characters,
+ * *pos remains unchanged.
+ */
+void aoc_parse_skip_ws(const char *buf, size_t *pos);
+
+/**
+ * Searches for the first occurrence of the specified character (needle) in the
+ * buffer, starting from the position pointed to by pos.
+ *
+ * @param buf The buffer in which to search for the character.
+ * @param pos Pointer to the size_t variable that holds the starting position
+ * for the search and is updated to the position of the found character.
+ * @param needle The character to search for in the buffer.
+ *
+ * @note The function updates *pos to the position of the first occurrence of
+ * needle. If the needle is not found, *pos is set to the length of the buffer.
+ */
+void aoc_parse_seek(const char *buf, size_t *pos, char needle);
diff --git a/include/aoc/point.h b/include/aoc/point.h
new file mode 100644
index 0000000..8881865
--- /dev/null
+++ b/include/aoc/point.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <stddef.h>
+
+typedef struct { // A 2D point with integer coordinates
+    int x;
+    int y;
+} Point2D;
+
+inline int Point2D_equal(Point2D *lhs, Point2D *rhs) { // 1 when both coordinates match, else 0
+    return !(lhs->x != rhs->x || lhs->y != rhs->y);
+}
+
+inline size_t Point2D_hash(Point2D *p) {
+    // Start from 17 and fold x, then y, into the value using multiplier 31.
+    const size_t mult = 31;
+    const size_t seed = 17;
+    const size_t ux = (unsigned int)p->x; // reinterpret as unsigned before widening
+    const size_t uy = (unsigned int)p->y;
+    return (seed * mult + ux) * mult + uy;
+}
diff --git a/include/aoc/stb_sprintf.h b/include/aoc/stb_sprintf.h
new file mode 100644
index 0000000..ca432a6
--- /dev/null
+++ b/include/aoc/stb_sprintf.h
@@ -0,0 +1,1906 @@
+// stb_sprintf - v1.10 - public domain snprintf() implementation
+// originally by Jeff Roberts / RAD Game Tools, 2015/10/20
+// http://github.com/nothings/stb
+//
+// allowed types:  sc uidBboXx p AaGgEef n
+// lengths      :  hh h ll j z t I64 I32 I
+//
+// Contributors:
+//    Fabian "ryg" Giesen (reformatting)
+//    github:aganm (attribute format)
+//
+// Contributors (bugfixes):
+//    github:d26435
+//    github:trex78
+//    github:account-login
+//    Jari Komppa (SI suffixes)
+//    Rohit Nirmal
+//    Marcin Wojdyr
+//    Leonard Ritter
+//    Stefano Zanotti
+//    Adam Allison
+//    Arvid Gerstmann
+//    Markus Kolb
+//
+// LICENSE:
+//
+//   See end of file for license information.
+
+#ifndef STB_SPRINTF_H_INCLUDE
+#define STB_SPRINTF_H_INCLUDE
+
+/*
+Single file sprintf replacement.
+
+Originally written by Jeff Roberts at RAD Game Tools - 2015/10/20.
+Hereby placed in public domain.
+
+This is a full sprintf replacement that supports everything that
+the C runtime sprintfs support, including float/double, 64-bit integers,
+hex floats, field parameters (%*.*d stuff), length read-backs, etc.
+
+Why would you need this if sprintf already exists?  Well, first off,
+it's *much* faster (see below). It's also much smaller than the CRT
+versions code-space-wise. We've also added some simple improvements
+that are super handy (commas in thousands, callbacks at buffer full,
+for example). Finally, the format strings for MSVC and GCC differ
+for 64-bit integers (among other small things), so this lets you use
+the same format strings in cross platform code.
+
+It uses the standard single file trick of being both the header file
+and the source itself. If you just include it normally, you just get
+the header file function definitions. To get the code, you include
+it from a C or C++ file and define STB_SPRINTF_IMPLEMENTATION first.
+
+It only uses va_args macros from the C runtime to do its work. It
+does cast doubles to S64s and shifts and divides U64s, which does
+drag in CRT code on most platforms.
+
+It compiles to roughly 8K with float support, and 4K without.
+As a comparison, when using MSVC static libs, calling sprintf drags
+in 16K.
+
+API:
+====
+int stbsp_sprintf( char * buf, char const * fmt, ... )
+int stbsp_snprintf( char * buf, int count, char const * fmt, ... )
+  Convert an arg list into a buffer.  stbsp_snprintf always returns
+  a zero-terminated string (unlike regular snprintf).
+
+int stbsp_vsprintf( char * buf, char const * fmt, va_list va )
+int stbsp_vsnprintf( char * buf, int count, char const * fmt, va_list va )
+  Convert a va_list arg list into a buffer.  stbsp_vsnprintf always returns
+  a zero-terminated string (unlike regular snprintf).
+
+int stbsp_vsprintfcb( STBSP_SPRINTFCB * callback, void * user, char * buf, char const * fmt, va_list va )
+    typedef char * STBSP_SPRINTFCB( char const * buf, void * user, int len );
+  Convert into a buffer, calling back every STB_SPRINTF_MIN chars.
+  Your callback can then copy the chars out, print them or whatever.
+  This function is actually the workhorse for everything else.
+  The buffer you pass in must hold at least STB_SPRINTF_MIN characters.
+    // you return the next buffer to use or 0 to stop converting
+
+void stbsp_set_separators( char comma, char period )
+  Set the comma and period characters to use.
+
+FLOATS/DOUBLES:
+===============
+This code uses an internal float->ascii conversion method that uses
+doubles with error correction (double-doubles, for ~105 bits of
+precision).  This conversion is round-trip perfect - that is, an atof
+of the values output here will give you the bit-exact double back.
+
+One difference is that our insignificant digits will be different than
+with MSVC or GCC (but they don't match each other either).  We also
+don't attempt to find the minimum length matching float (pre-MSVC15
+doesn't either).
+
+If you don't need float or doubles at all, define STB_SPRINTF_NOFLOAT
+and you'll save 4K of code space.
+
+64-BIT INTS:
+============
+This library also supports 64-bit integers and you can use MSVC style or
+GCC style indicators (%I64d or %lld).  It supports the C99 specifiers
+for intmax_t, size_t and ptrdiff_t (%jd %zd %td) as well.
+
+EXTRAS:
+=======
+Like some GCCs, for integers and floats, you can use a ' (single quote)
+specifier and commas will be inserted on the thousands: "%'d" on 12345
+would print 12,345.
+
+For integers and floats, you can use a "$" specifier and the number
+will be converted to float and then divided to get kilo, mega, giga or
+tera and then printed, so "%$d" 1000 is "1.0 k", "%$.2d" 2536000 is
+"2.53 M", etc. For byte values, use two $:s, like "%$$d" to turn
+2536000 to "2.42 Mi". If you prefer JEDEC suffixes to SI ones, use three
+$:s: "%$$$d" -> "2.42 M". To remove the space between the number and the
+suffix, add "_" specifier: "%_$d" -> "2.53M".
+
+In addition to octal and hexadecimal conversions, you can print
+integers in binary: "%b" for 4 would print 100.
+
+PERFORMANCE vs MSVC 2008 32-/64-bit (GCC is even slower than MSVC):
+===================================================================
+"%d" across all 32-bit ints (4.8x/4.0x faster than 32-/64-bit MSVC)
+"%24d" across all 32-bit ints (4.5x/4.2x faster)
+"%x" across all 32-bit ints (4.5x/3.8x faster)
+"%08x" across all 32-bit ints (4.3x/3.8x faster)
+"%f" across e-10 to e+10 floats (7.3x/6.0x faster)
+"%e" across e-10 to e+10 floats (8.1x/6.0x faster)
+"%g" across e-10 to e+10 floats (10.0x/7.1x faster)
+"%f" for values near e-300 (7.9x/6.5x faster)
+"%f" for values near e+300 (10.0x/9.1x faster)
+"%e" for values near e-300 (10.1x/7.0x faster)
+"%e" for values near e+300 (9.2x/6.0x faster)
+"%.320f" for values near e-300 (12.6x/11.2x faster)
+"%a" for random values (8.6x/4.3x faster)
+"%I64d" for 64-bits with 32-bit values (4.8x/3.4x faster)
+"%I64d" for 64-bits > 32-bit values (4.9x/5.5x faster)
+"%s%s%s" for 64 char strings (7.1x/7.3x faster)
+"...512 char string..." ( 35.0x/32.5x faster!)
+*/
+
+#if defined(__clang__) // pick an attribute that exempts a function from AddressSanitizer
+ #if defined(__has_feature) && defined(__has_attribute)
+  #if __has_feature(address_sanitizer) // only relevant when ASan is enabled for this build
+   #if __has_attribute(__no_sanitize__)
+    #define STBSP__ASAN __attribute__((__no_sanitize__("address")))
+   #elif __has_attribute(__no_sanitize_address__)
+    #define STBSP__ASAN __attribute__((__no_sanitize_address__))
+   #elif __has_attribute(__no_address_safety_analysis__)
+    #define STBSP__ASAN __attribute__((__no_address_safety_analysis__))
+   #endif
+  #endif
+ #endif
+#elif defined(__GNUC__) && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
+ #if defined(__SANITIZE_ADDRESS__) && __SANITIZE_ADDRESS__
+  #define STBSP__ASAN __attribute__((__no_sanitize_address__))
+ #endif
+#endif
+
+#ifndef STBSP__ASAN // nothing selected above: the marker expands to nothing
+#define STBSP__ASAN
+#endif
+
+#ifdef STB_SPRINTF_STATIC // opt-in: every stbsp function gets internal linkage
+#define STBSP__PUBLICDEC static
+#define STBSP__PUBLICDEF static STBSP__ASAN
+#else
+#ifdef __cplusplus // C++ build: declare and define the API with C linkage
+#define STBSP__PUBLICDEC extern "C"
+#define STBSP__PUBLICDEF extern "C" STBSP__ASAN
+#else // plain C: PUBLICDEC prefixes declarations, PUBLICDEF prefixes definitions
+#define STBSP__PUBLICDEC extern
+#define STBSP__PUBLICDEF STBSP__ASAN
+#endif
+#endif
+
+#if defined(__has_attribute) // enable printf-style argument checking where available
+ #if __has_attribute(format)
+   #define STBSP__ATTRIBUTE_FORMAT(fmt,va) __attribute__((format(printf,fmt,va)))
+ #endif
+#endif
+
+#ifndef STBSP__ATTRIBUTE_FORMAT // not available: the annotation expands to nothing
+#define STBSP__ATTRIBUTE_FORMAT(fmt,va)
+#endif
+
+#ifdef _MSC_VER // STBSP__NOTUSED(v): mark v as deliberately unused
+#define STBSP__NOTUSED(v)  (void)(v)
+#else
+#define STBSP__NOTUSED(v)  (void)sizeof(v)
+#endif
+
+#include <stdarg.h> // for va_arg(), va_list()
+#include <stddef.h> // size_t, ptrdiff_t
+
+#ifndef STB_SPRINTF_MIN // overridable: define before including to change the chunk size
+#define STB_SPRINTF_MIN 512 // how many characters per callback
+#endif
+typedef char *STBSP_SPRINTFCB(const char *buf, void *user, int len); // returns the next buffer to use, or 0 to stop converting
+
+#ifndef STB_SPRINTF_DECORATE // default: public names get the stbsp_ prefix
+#define STB_SPRINTF_DECORATE(name) stbsp_##name // define this before including if you want to change the names
+#endif
+
+STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(vsprintf)(char *buf, char const *fmt, va_list va);
+STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(vsnprintf)(char *buf, int count, char const *fmt, va_list va); // documented above as always zero-terminating
+STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(sprintf)(char *buf, char const *fmt, ...) STBSP__ATTRIBUTE_FORMAT(2,3);
+STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(snprintf)(char *buf, int count, char const *fmt, ...) STBSP__ATTRIBUTE_FORMAT(3,4); // documented above as always zero-terminating
+
+STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(vsprintfcb)(STBSP_SPRINTFCB *callback, void *user, char *buf, char const *fmt, va_list va); // callback-driven workhorse
+STBSP__PUBLICDEC void STB_SPRINTF_DECORATE(set_separators)(char comma, char period); // set the comma and period characters to use
+
+#endif // STB_SPRINTF_H_INCLUDE
+
+#ifdef STB_SPRINTF_IMPLEMENTATION
+
+#define stbsp__uint32 unsigned int // NOTE(review): presumes int is 32 bits wide; not checked here
+#define stbsp__int32 signed int
+
+#ifdef _MSC_VER // MSVC spells the 64-bit types with __int64
+#define stbsp__uint64 unsigned __int64
+#define stbsp__int64 signed __int64
+#else
+#define stbsp__uint64 unsigned long long
+#define stbsp__int64 signed long long
+#endif
+#define stbsp__uint16 unsigned short
+
+#ifndef stbsp__uintptr // may be predefined by the includer
+#if defined(__ppc64__) || defined(__powerpc64__) || defined(__aarch64__) || defined(_M_X64) || defined(__x86_64__) || defined(__x86_64) || defined(__s390x__)
+#define stbsp__uintptr stbsp__uint64
+#else // every other target falls back to the 32-bit integer alias
+#define stbsp__uintptr stbsp__uint32
+#endif
+#endif
+
+#ifndef STB_SPRINTF_MSVC_MODE // used for MSVC2013 and earlier (MSVC2015 matches GCC)
+#if defined(_MSC_VER) && (_MSC_VER < 1900) // auto-enable for the older MSVC compilers
+#define STB_SPRINTF_MSVC_MODE
+#endif
+#endif
+
+#ifdef STB_SPRINTF_NOUNALIGNED // define this before inclusion to force stbsp_sprintf to always use aligned accesses
+#define STBSP__UNALIGNED(code) // wrapped code is dropped
+#else
+#define STBSP__UNALIGNED(code) code // wrapped code is kept as written
+#endif
+
+#ifndef STB_SPRINTF_NOFLOAT
+// internal float utility functions (callers treat a nonzero return as "value is negative")
+static stbsp__int32 stbsp__real_to_str(char const **start, stbsp__uint32 *len, char *out, stbsp__int32 *decimal_pos, double value, stbsp__uint32 frac_digits);
+static stbsp__int32 stbsp__real_to_parts(stbsp__int64 *bits, stbsp__int32 *expo, double value);
+#define STBSP__SPECIAL 0x7000 // decimal_pos value for which callers copy the returned text verbatim
+#endif
+
+static char stbsp__period = '.'; // decimal point; changed by set_separators
+static char stbsp__comma = ','; // thousands separator; changed by set_separators
+static struct
+{
+   short temp; // force next field to be 2-byte aligned
+   char pair[201]; // 100 two-character entries "00".."99" plus the terminating 0
+} stbsp__digitpair =
+{
+  0,
+   "00010203040506070809101112131415161718192021222324"
+   "25262728293031323334353637383940414243444546474849"
+   "50515253545556575859606162636465666768697071727374"
+   "75767778798081828384858687888990919293949596979899"
+};
+
+STBSP__PUBLICDEF void STB_SPRINTF_DECORATE(set_separators)(char pcomma, char pperiod)
+{
+   stbsp__comma = pcomma;   // character emitted between digit triplets
+   stbsp__period = pperiod; // character emitted as the decimal point
+}
+
+#define STBSP__LEFTJUST 1 // '-' flag
+#define STBSP__LEADINGPLUS 2 // '+' flag
+#define STBSP__LEADINGSPACE 4 // ' ' flag
+#define STBSP__LEADING_0X 8 // '#' flag
+#define STBSP__LEADINGZERO 16 // '0' flag
+#define STBSP__INTMAX 32 // integer argument is 64 bits wide (set by the length modifiers)
+#define STBSP__TRIPLET_COMMA 64 // apostrophe flag: thousands separators
+#define STBSP__NEGATIVE 128 // set during conversion when the value is negative
+#define STBSP__METRIC_SUFFIX 256 // first '$': print with a metric suffix
+#define STBSP__HALFWIDTH 512 // 'h' length modifier
+#define STBSP__METRIC_NOSPACE 1024 // '_' flag: no space before the metric suffix
+#define STBSP__METRIC_1024 2048 // second '$': divide by 1024 instead of 1000
+#define STBSP__METRIC_JEDEC 4096 // third '$': JEDEC style, no 'i' after the suffix
+
+static void stbsp__lead_sign(stbsp__uint32 fl, char *sign)
+{
+   // sign[0] = prefix length (0 or 1), sign[1] = prefix char; '-' beats ' ' beats '+'
+   char ch = 0;
+   if (fl & STBSP__NEGATIVE)
+      ch = '-';
+   else if (fl & STBSP__LEADINGSPACE)
+      ch = ' ';
+   else if (fl & STBSP__LEADINGPLUS)
+      ch = '+';
+   sign[0] = ch ? 1 : 0;
+   if (ch)
+      sign[1] = ch;
+}
+
+static STBSP__ASAN stbsp__uint32 stbsp__strlen_limited(char const *s, stbsp__uint32 limit)
+{
+   char const * sn = s; // scan cursor; every return value is sn - s, never more than limit
+
+   // step byte-wise until sn is 4-byte aligned (or the string / limit ends first)
+   for (;;) {
+      if (((stbsp__uintptr)sn & 3) == 0)
+         break;
+
+      if (!limit || *sn == 0)
+         return (stbsp__uint32)(sn - s);
+
+      ++sn;
+      --limit;
+   }
+
+   // scan over 4 bytes at a time to find terminating 0
+   // this will intentionally scan up to 3 bytes past the end of buffers,
+   // but because it works 4B aligned, it will never cross page boundaries
+   // (hence the STBSP__ASAN markup; the over-read here is intentional
+   // and harmless)
+   while (limit >= 4) {
+      stbsp__uint32 v = *(stbsp__uint32 *)sn;
+      // bit hack to find if there's a 0 byte in there
+      if ((v - 0x01010101) & (~v) & 0x80808080UL)
+         break;
+
+      sn += 4;
+      limit -= 4;
+   }
+
+   // handle the last few characters to find actual size
+   while (limit && *sn) {
+      ++sn;
+      --limit;
+   }
+
+   return (stbsp__uint32)(sn - s);
+}
+
+STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsprintfcb)(STBSP_SPRINTFCB *callback, void *user, char *buf, char const *fmt, va_list va)
+{
+   static char hex[] = "0123456789abcdefxp";
+   static char hexu[] = "0123456789ABCDEFXP";
+   char *bf;
+   char const *f;
+   int tlen = 0;
+
+   bf = buf;
+   f = fmt;
+   for (;;) {
+      stbsp__int32 fw, pr, tz;
+      stbsp__uint32 fl;
+
+      // macros for the callback buffer stuff
+      #define stbsp__chk_cb_bufL(bytes)                        \
+         {                                                     \
+            int len = (int)(bf - buf);                         \
+            if ((len + (bytes)) >= STB_SPRINTF_MIN) {          \
+               tlen += len;                                    \
+               if (0 == (bf = buf = callback(buf, user, len))) \
+                  goto done;                                   \
+            }                                                  \
+         }
+      #define stbsp__chk_cb_buf(bytes)    \
+         {                                \
+            if (callback) {               \
+               stbsp__chk_cb_bufL(bytes); \
+            }                             \
+         }
+      #define stbsp__flush_cb()                      \
+         {                                           \
+            stbsp__chk_cb_bufL(STB_SPRINTF_MIN - 1); \
+         } // flush if there is even one byte in the buffer
+      #define stbsp__cb_buf_clamp(cl, v)                \
+         cl = v;                                        \
+         if (callback) {                                \
+            int lg = STB_SPRINTF_MIN - (int)(bf - buf); \
+            if (cl > lg)                                \
+               cl = lg;                                 \
+         }
+
+      // fast copy everything up to the next % (or end of string)
+      for (;;) {
+         while (((stbsp__uintptr)f) & 3) {
+         schk1:
+            if (f[0] == '%')
+               goto scandd;
+         schk2:
+            if (f[0] == 0)
+               goto endfmt;
+            stbsp__chk_cb_buf(1);
+            *bf++ = f[0];
+            ++f;
+         }
+         for (;;) {
+            // Check if the next 4 bytes contain %(0x25) or end of string.
+            // Using the 'hasless' trick:
+            // https://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
+            stbsp__uint32 v, c;
+            v = *(stbsp__uint32 *)f;
+            c = (~v) & 0x80808080;
+            if (((v ^ 0x25252525) - 0x01010101) & c)
+               goto schk1;
+            if ((v - 0x01010101) & c)
+               goto schk2;
+            if (callback)
+               if ((STB_SPRINTF_MIN - (int)(bf - buf)) < 4)
+                  goto schk1;
+            #ifdef STB_SPRINTF_NOUNALIGNED
+                if(((stbsp__uintptr)bf) & 3) {
+                    bf[0] = f[0];
+                    bf[1] = f[1];
+                    bf[2] = f[2];
+                    bf[3] = f[3];
+                } else
+            #endif
+            {
+                *(stbsp__uint32 *)bf = v;
+            }
+            bf += 4;
+            f += 4;
+         }
+      }
+   scandd:
+
+      ++f;
+
+      // ok, we have a percent, read the modifiers first
+      fw = 0;
+      pr = -1;
+      fl = 0;
+      tz = 0;
+
+      // flags
+      for (;;) {
+         switch (f[0]) {
+         // if we have left justify
+         case '-':
+            fl |= STBSP__LEFTJUST;
+            ++f;
+            continue;
+         // if we have leading plus
+         case '+':
+            fl |= STBSP__LEADINGPLUS;
+            ++f;
+            continue;
+         // if we have leading space
+         case ' ':
+            fl |= STBSP__LEADINGSPACE;
+            ++f;
+            continue;
+         // if we have leading 0x
+         case '#':
+            fl |= STBSP__LEADING_0X;
+            ++f;
+            continue;
+         // if we have thousand commas
+         case '\'':
+            fl |= STBSP__TRIPLET_COMMA;
+            ++f;
+            continue;
+         // if we have kilo marker (none->kilo->kibi->jedec)
+         case '$':
+            if (fl & STBSP__METRIC_SUFFIX) {
+               if (fl & STBSP__METRIC_1024) {
+                  fl |= STBSP__METRIC_JEDEC;
+               } else {
+                  fl |= STBSP__METRIC_1024;
+               }
+            } else {
+               fl |= STBSP__METRIC_SUFFIX;
+            }
+            ++f;
+            continue;
+         // if we don't want space between metric suffix and number
+         case '_':
+            fl |= STBSP__METRIC_NOSPACE;
+            ++f;
+            continue;
+         // if we have leading zero
+         case '0':
+            fl |= STBSP__LEADINGZERO;
+            ++f;
+            goto flags_done;
+         default: goto flags_done;
+         }
+      }
+   flags_done:
+
+      // get the field width
+      if (f[0] == '*') {
+         fw = va_arg(va, stbsp__uint32);
+         ++f;
+      } else {
+         while ((f[0] >= '0') && (f[0] <= '9')) {
+            fw = fw * 10 + f[0] - '0';
+            f++;
+         }
+      }
+      // get the precision
+      if (f[0] == '.') {
+         ++f;
+         if (f[0] == '*') {
+            pr = va_arg(va, stbsp__uint32);
+            ++f;
+         } else {
+            pr = 0;
+            while ((f[0] >= '0') && (f[0] <= '9')) {
+               pr = pr * 10 + f[0] - '0';
+               f++;
+            }
+         }
+      }
+
+      // handle integer size overrides
+      switch (f[0]) {
+      // are we halfwidth?
+      case 'h':
+         fl |= STBSP__HALFWIDTH;
+         ++f;
+         if (f[0] == 'h')
+            ++f;  // QUARTERWIDTH
+         break;
+      // are we 64-bit (unix style)
+      case 'l':
+         fl |= ((sizeof(long) == 8) ? STBSP__INTMAX : 0);
+         ++f;
+         if (f[0] == 'l') {
+            fl |= STBSP__INTMAX;
+            ++f;
+         }
+         break;
+      // are we 64-bit on intmax? (c99)
+      case 'j':
+         fl |= (sizeof(size_t) == 8) ? STBSP__INTMAX : 0;
+         ++f;
+         break;
+      // are we 64-bit on size_t or ptrdiff_t? (c99)
+      case 'z':
+         fl |= (sizeof(ptrdiff_t) == 8) ? STBSP__INTMAX : 0;
+         ++f;
+         break;
+      case 't':
+         fl |= (sizeof(ptrdiff_t) == 8) ? STBSP__INTMAX : 0;
+         ++f;
+         break;
+      // are we 64-bit (msft style)
+      case 'I':
+         if ((f[1] == '6') && (f[2] == '4')) {
+            fl |= STBSP__INTMAX;
+            f += 3;
+         } else if ((f[1] == '3') && (f[2] == '2')) {
+            f += 3;
+         } else {
+            fl |= ((sizeof(void *) == 8) ? STBSP__INTMAX : 0);
+            ++f;
+         }
+         break;
+      default: break;
+      }
+
+      // handle each replacement
+      switch (f[0]) {
+         #define STBSP__NUMSZ 512 // big enough for e308 (with commas) or e-307
+         char num[STBSP__NUMSZ];
+         char lead[8];
+         char tail[8];
+         char *s;
+         char const *h;
+         stbsp__uint32 l, n, cs;
+         stbsp__uint64 n64;
+#ifndef STB_SPRINTF_NOFLOAT
+         double fv;
+#endif
+         stbsp__int32 dp;
+         char const *sn;
+
+      case 's':
+         // get the string
+         s = va_arg(va, char *);
+         if (s == 0)
+            s = (char *)"null";
+         // get the length, limited to desired precision
+         // always limit to ~0u chars since our counts are 32b
+         l = stbsp__strlen_limited(s, (pr >= 0) ? pr : ~0u);
+         lead[0] = 0;
+         tail[0] = 0;
+         pr = 0;
+         dp = 0;
+         cs = 0;
+         // copy the string in
+         goto scopy;
+
+      case 'c': // char
+         // get the character
+         s = num + STBSP__NUMSZ - 1;
+         *s = (char)va_arg(va, int);
+         l = 1;
+         lead[0] = 0;
+         tail[0] = 0;
+         pr = 0;
+         dp = 0;
+         cs = 0;
+         goto scopy;
+
+      case 'n': // weird write-bytes specifier
+      {
+         int *d = va_arg(va, int *);
+         *d = tlen + (int)(bf - buf);
+      } break;
+
+#ifdef STB_SPRINTF_NOFLOAT
+      case 'A':              // float
+      case 'a':              // hex float
+      case 'G':              // float
+      case 'g':              // float
+      case 'E':              // float
+      case 'e':              // float
+      case 'f':              // float
+         va_arg(va, double); // eat it
+         s = (char *)"No float";
+         l = 8;
+         lead[0] = 0;
+         tail[0] = 0;
+         pr = 0;
+         cs = 0;
+         STBSP__NOTUSED(dp);
+         goto scopy;
+#else
+      case 'A': // hex float
+      case 'a': // hex float
+         h = (f[0] == 'A') ? hexu : hex;
+         fv = va_arg(va, double);
+         if (pr == -1)
+            pr = 6; // default is 6
+         // read the double into a string
+         if (stbsp__real_to_parts((stbsp__int64 *)&n64, &dp, fv))
+            fl |= STBSP__NEGATIVE;
+
+         s = num + 64;
+
+         stbsp__lead_sign(fl, lead);
+
+         if (dp == -1023)
+            dp = (n64) ? -1022 : 0;
+         else
+            n64 |= (((stbsp__uint64)1) << 52);
+         n64 <<= (64 - 56);
+         if (pr < 15)
+            n64 += ((((stbsp__uint64)8) << 56) >> (pr * 4));
+// add leading chars
+
+#ifdef STB_SPRINTF_MSVC_MODE
+         *s++ = '0';
+         *s++ = 'x';
+#else
+         lead[1 + lead[0]] = '0';
+         lead[2 + lead[0]] = 'x';
+         lead[0] += 2;
+#endif
+         *s++ = h[(n64 >> 60) & 15];
+         n64 <<= 4;
+         if (pr)
+            *s++ = stbsp__period;
+         sn = s;
+
+         // print the bits
+         n = pr;
+         if (n > 13)
+            n = 13;
+         if (pr > (stbsp__int32)n)
+            tz = pr - n;
+         pr = 0;
+         while (n--) {
+            *s++ = h[(n64 >> 60) & 15];
+            n64 <<= 4;
+         }
+
+         // print the expo
+         tail[1] = h[17];
+         if (dp < 0) {
+            tail[2] = '-';
+            dp = -dp;
+         } else
+            tail[2] = '+';
+         n = (dp >= 1000) ? 6 : ((dp >= 100) ? 5 : ((dp >= 10) ? 4 : 3));
+         tail[0] = (char)n;
+         for (;;) {
+            tail[n] = '0' + dp % 10;
+            if (n <= 3)
+               break;
+            --n;
+            dp /= 10;
+         }
+
+         dp = (int)(s - sn);
+         l = (int)(s - (num + 64));
+         s = num + 64;
+         cs = 1 + (3 << 24);
+         goto scopy;
+
+      case 'G': // float
+      case 'g': // float
+         h = (f[0] == 'G') ? hexu : hex;
+         fv = va_arg(va, double);
+         if (pr == -1)
+            pr = 6;
+         else if (pr == 0)
+            pr = 1; // default is 6
+         // read the double into a string
+         if (stbsp__real_to_str(&sn, &l, num, &dp, fv, (pr - 1) | 0x80000000))
+            fl |= STBSP__NEGATIVE;
+
+         // clamp the precision and delete extra zeros after clamp
+         n = pr;
+         if (l > (stbsp__uint32)pr)
+            l = pr;
+         while ((l > 1) && (pr) && (sn[l - 1] == '0')) {
+            --pr;
+            --l;
+         }
+
+         // should we use %e
+         if ((dp <= -4) || (dp > (stbsp__int32)n)) {
+            if (pr > (stbsp__int32)l)
+               pr = l - 1;
+            else if (pr)
+               --pr; // when using %e, there is one digit before the decimal
+            goto doexpfromg;
+         }
+         // this is the insane action to get the pr to match %g semantics for %f
+         if (dp > 0) {
+            pr = (dp < (stbsp__int32)l) ? l - dp : 0;
+         } else {
+            pr = -dp + ((pr > (stbsp__int32)l) ? (stbsp__int32) l : pr);
+         }
+         goto dofloatfromg;
+
+      case 'E': // float
+      case 'e': // float
+         h = (f[0] == 'E') ? hexu : hex;
+         fv = va_arg(va, double);
+         if (pr == -1)
+            pr = 6; // default is 6
+         // read the double into a string
+         if (stbsp__real_to_str(&sn, &l, num, &dp, fv, pr | 0x80000000))
+            fl |= STBSP__NEGATIVE;
+      doexpfromg:
+         tail[0] = 0;
+         stbsp__lead_sign(fl, lead);
+         if (dp == STBSP__SPECIAL) {
+            s = (char *)sn;
+            cs = 0;
+            pr = 0;
+            goto scopy;
+         }
+         s = num + 64;
+         // handle leading chars
+         *s++ = sn[0];
+
+         if (pr)
+            *s++ = stbsp__period;
+
+         // handle after decimal
+         if ((l - 1) > (stbsp__uint32)pr)
+            l = pr + 1;
+         for (n = 1; n < l; n++)
+            *s++ = sn[n];
+         // trailing zeros
+         tz = pr - (l - 1);
+         pr = 0;
+         // dump expo
+         tail[1] = h[0xe];
+         dp -= 1;
+         if (dp < 0) {
+            tail[2] = '-';
+            dp = -dp;
+         } else
+            tail[2] = '+';
+#ifdef STB_SPRINTF_MSVC_MODE
+         n = 5;
+#else
+         n = (dp >= 100) ? 5 : 4;
+#endif
+         tail[0] = (char)n;
+         for (;;) {
+            tail[n] = '0' + dp % 10;
+            if (n <= 3)
+               break;
+            --n;
+            dp /= 10;
+         }
+         cs = 1 + (3 << 24); // how many tens
+         goto flt_lead;
+
+      case 'f': // float
+         fv = va_arg(va, double);
+      doafloat:
+         // do kilos
+         if (fl & STBSP__METRIC_SUFFIX) {
+            double divisor;
+            divisor = 1000.0f;
+            if (fl & STBSP__METRIC_1024)
+               divisor = 1024.0;
+            while (fl < 0x4000000) {
+               if ((fv < divisor) && (fv > -divisor))
+                  break;
+               fv /= divisor;
+               fl += 0x1000000;
+            }
+         }
+         if (pr == -1)
+            pr = 6; // default is 6
+         // read the double into a string
+         if (stbsp__real_to_str(&sn, &l, num, &dp, fv, pr))
+            fl |= STBSP__NEGATIVE;
+      dofloatfromg:
+         tail[0] = 0;
+         stbsp__lead_sign(fl, lead);
+         if (dp == STBSP__SPECIAL) {
+            s = (char *)sn;
+            cs = 0;
+            pr = 0;
+            goto scopy;
+         }
+         s = num + 64;
+
+         // handle the three decimal varieties
+         if (dp <= 0) {
+            stbsp__int32 i;
+            // handle 0.000*000xxxx
+            *s++ = '0';
+            if (pr)
+               *s++ = stbsp__period;
+            n = -dp;
+            if ((stbsp__int32)n > pr)
+               n = pr;
+            i = n;
+            while (i) {
+               if ((((stbsp__uintptr)s) & 3) == 0)
+                  break;
+               *s++ = '0';
+               --i;
+            }
+            while (i >= 4) {
+               *(stbsp__uint32 *)s = 0x30303030;
+               s += 4;
+               i -= 4;
+            }
+            while (i) {
+               *s++ = '0';
+               --i;
+            }
+            if ((stbsp__int32)(l + n) > pr)
+               l = pr - n;
+            i = l;
+            while (i) {
+               *s++ = *sn++;
+               --i;
+            }
+            tz = pr - (n + l);
+            cs = 1 + (3 << 24); // how many tens did we write (for commas below)
+         } else {
+            cs = (fl & STBSP__TRIPLET_COMMA) ? ((600 - (stbsp__uint32)dp) % 3) : 0;
+            if ((stbsp__uint32)dp >= l) {
+               // handle xxxx000*000.0
+               n = 0;
+               for (;;) {
+                  if ((fl & STBSP__TRIPLET_COMMA) && (++cs == 4)) {
+                     cs = 0;
+                     *s++ = stbsp__comma;
+                  } else {
+                     *s++ = sn[n];
+                     ++n;
+                     if (n >= l)
+                        break;
+                  }
+               }
+               if (n < (stbsp__uint32)dp) {
+                  n = dp - n;
+                  if ((fl & STBSP__TRIPLET_COMMA) == 0) {
+                     while (n) {
+                        if ((((stbsp__uintptr)s) & 3) == 0)
+                           break;
+                        *s++ = '0';
+                        --n;
+                     }
+                     while (n >= 4) {
+                        *(stbsp__uint32 *)s = 0x30303030;
+                        s += 4;
+                        n -= 4;
+                     }
+                  }
+                  while (n) {
+                     if ((fl & STBSP__TRIPLET_COMMA) && (++cs == 4)) {
+                        cs = 0;
+                        *s++ = stbsp__comma;
+                     } else {
+                        *s++ = '0';
+                        --n;
+                     }
+                  }
+               }
+               cs = (int)(s - (num + 64)) + (3 << 24); // cs is how many tens
+               if (pr) {
+                  *s++ = stbsp__period;
+                  tz = pr;
+               }
+            } else {
+               // handle xxxxx.xxxx000*000
+               n = 0;
+               for (;;) {
+                  if ((fl & STBSP__TRIPLET_COMMA) && (++cs == 4)) {
+                     cs = 0;
+                     *s++ = stbsp__comma;
+                  } else {
+                     *s++ = sn[n];
+                     ++n;
+                     if (n >= (stbsp__uint32)dp)
+                        break;
+                  }
+               }
+               cs = (int)(s - (num + 64)) + (3 << 24); // cs is how many tens
+               if (pr)
+                  *s++ = stbsp__period;
+               if ((l - dp) > (stbsp__uint32)pr)
+                  l = pr + dp;
+               while (n < l) {
+                  *s++ = sn[n];
+                  ++n;
+               }
+               tz = pr - (l - dp);
+            }
+         }
+         pr = 0;
+
+         // handle k,m,g,t
+         if (fl & STBSP__METRIC_SUFFIX) {
+            char idx;
+            idx = 1;
+            if (fl & STBSP__METRIC_NOSPACE)
+               idx = 0;
+            tail[0] = idx;
+            tail[1] = ' ';
+            {
+               if (fl >> 24) { // SI kilo is 'k', JEDEC and SI kibits are 'K'.
+                  if (fl & STBSP__METRIC_1024)
+                     tail[idx + 1] = "_KMGT"[fl >> 24];
+                  else
+                     tail[idx + 1] = "_kMGT"[fl >> 24];
+                  idx++;
+                  // If printing kibits and not in jedec, add the 'i'.
+                  if (fl & STBSP__METRIC_1024 && !(fl & STBSP__METRIC_JEDEC)) {
+                     tail[idx + 1] = 'i';
+                     idx++;
+                  }
+                  tail[0] = idx;
+               }
+            }
+         };
+
+      flt_lead:
+         // get the length that we copied
+         l = (stbsp__uint32)(s - (num + 64));
+         s = num + 64;
+         goto scopy;
+#endif
+
+      case 'B': // upper binary
+      case 'b': // lower binary
+         h = (f[0] == 'B') ? hexu : hex;
+         lead[0] = 0;
+         if (fl & STBSP__LEADING_0X) {
+            lead[0] = 2;
+            lead[1] = '0';
+            lead[2] = h[0xb];
+         }
+         l = (8 << 4) | (1 << 8);
+         goto radixnum;
+
+      case 'o': // octal
+         h = hexu;
+         lead[0] = 0;
+         if (fl & STBSP__LEADING_0X) {
+            lead[0] = 1;
+            lead[1] = '0';
+         }
+         l = (3 << 4) | (3 << 8);
+         goto radixnum;
+
+      case 'p': // pointer
+         fl |= (sizeof(void *) == 8) ? STBSP__INTMAX : 0;
+         pr = sizeof(void *) * 2;
+         fl &= ~STBSP__LEADINGZERO; // 'p' only prints the pointer with zeros
+                                    // fall through - to X
+
+      case 'X': // upper hex
+      case 'x': // lower hex
+         h = (f[0] == 'X') ? hexu : hex;
+         l = (4 << 4) | (4 << 8);
+         lead[0] = 0;
+         if (fl & STBSP__LEADING_0X) {
+            lead[0] = 2;
+            lead[1] = '0';
+            lead[2] = h[16];
+         }
+      radixnum:
+         // get the number
+         if (fl & STBSP__INTMAX)
+            n64 = va_arg(va, stbsp__uint64);
+         else
+            n64 = va_arg(va, stbsp__uint32);
+
+         s = num + STBSP__NUMSZ;
+         dp = 0;
+         // clear tail, and clear leading if value is zero
+         tail[0] = 0;
+         if (n64 == 0) {
+            lead[0] = 0;
+            if (pr == 0) {
+               l = 0;
+               cs = 0;
+               goto scopy;
+            }
+         }
+         // convert to string
+         for (;;) {
+            *--s = h[n64 & ((1 << (l >> 8)) - 1)];
+            n64 >>= (l >> 8);
+            if (!((n64) || ((stbsp__int32)((num + STBSP__NUMSZ) - s) < pr)))
+               break;
+            if (fl & STBSP__TRIPLET_COMMA) {
+               ++l;
+               if ((l & 15) == ((l >> 4) & 15)) {
+                  l &= ~15;
+                  *--s = stbsp__comma;
+               }
+            }
+         };
+         // get the tens and the comma pos
+         cs = (stbsp__uint32)((num + STBSP__NUMSZ) - s) + ((((l >> 4) & 15)) << 24);
+         // get the length that we copied
+         l = (stbsp__uint32)((num + STBSP__NUMSZ) - s);
+         // copy it
+         goto scopy;
+
+      case 'u': // unsigned
+      case 'i':
+      case 'd': // integer
+         // get the integer and abs it
+         if (fl & STBSP__INTMAX) {
+            stbsp__int64 i64 = va_arg(va, stbsp__int64);
+            n64 = (stbsp__uint64)i64;
+            if ((f[0] != 'u') && (i64 < 0)) {
+               n64 = (stbsp__uint64)-i64;
+               fl |= STBSP__NEGATIVE;
+            }
+         } else {
+            stbsp__int32 i = va_arg(va, stbsp__int32);
+            n64 = (stbsp__uint32)i;
+            if ((f[0] != 'u') && (i < 0)) {
+               n64 = (stbsp__uint32)-i;
+               fl |= STBSP__NEGATIVE;
+            }
+         }
+
+#ifndef STB_SPRINTF_NOFLOAT
+         if (fl & STBSP__METRIC_SUFFIX) {
+            if (n64 < 1024)
+               pr = 0;
+            else if (pr == -1)
+               pr = 1;
+            fv = (double)(stbsp__int64)n64;
+            goto doafloat;
+         }
+#endif
+
+         // convert to string
+         s = num + STBSP__NUMSZ;
+         l = 0;
+
+         for (;;) {
+            // do in 32-bit chunks (avoid lots of 64-bit divides even with constant denominators)
+            char *o = s - 8;
+            if (n64 >= 100000000) {
+               n = (stbsp__uint32)(n64 % 100000000);
+               n64 /= 100000000;
+            } else {
+               n = (stbsp__uint32)n64;
+               n64 = 0;
+            }
+            if ((fl & STBSP__TRIPLET_COMMA) == 0) {
+               do {
+                  s -= 2;
+                  *(stbsp__uint16 *)s = *(stbsp__uint16 *)&stbsp__digitpair.pair[(n % 100) * 2];
+                  n /= 100;
+               } while (n);
+            }
+            while (n) {
+               if ((fl & STBSP__TRIPLET_COMMA) && (l++ == 3)) {
+                  l = 0;
+                  *--s = stbsp__comma;
+                  --o;
+               } else {
+                  *--s = (char)(n % 10) + '0';
+                  n /= 10;
+               }
+            }
+            if (n64 == 0) {
+               if ((s[0] == '0') && (s != (num + STBSP__NUMSZ)))
+                  ++s;
+               break;
+            }
+            while (s != o)
+               if ((fl & STBSP__TRIPLET_COMMA) && (l++ == 3)) {
+                  l = 0;
+                  *--s = stbsp__comma;
+                  --o;
+               } else {
+                  *--s = '0';
+               }
+         }
+
+         tail[0] = 0;
+         stbsp__lead_sign(fl, lead);
+
+         // get the length that we copied
+         l = (stbsp__uint32)((num + STBSP__NUMSZ) - s);
+         if (l == 0) {
+            *--s = '0';
+            l = 1;
+         }
+         cs = l + (3 << 24);
+         if (pr < 0)
+            pr = 0;
+
+      scopy:
+         // get fw=leading/trailing space, pr=leading zeros
+         if (pr < (stbsp__int32)l)
+            pr = l;
+         n = pr + lead[0] + tail[0] + tz;
+         if (fw < (stbsp__int32)n)
+            fw = n;
+         fw -= n;
+         pr -= l;
+
+         // handle right justify and leading zeros
+         if ((fl & STBSP__LEFTJUST) == 0) {
+            if (fl & STBSP__LEADINGZERO) // if leading zeros, everything is in pr
+            {
+               pr = (fw > pr) ? fw : pr;
+               fw = 0;
+            } else {
+               fl &= ~STBSP__TRIPLET_COMMA; // if no leading zeros, then no commas
+            }
+         }
+
+         // copy the spaces and/or zeros
+         if (fw + pr) {
+            stbsp__int32 i;
+            stbsp__uint32 c;
+
+            // copy leading spaces (or when doing %8.4d stuff)
+            if ((fl & STBSP__LEFTJUST) == 0)
+               while (fw > 0) {
+                  stbsp__cb_buf_clamp(i, fw);
+                  fw -= i;
+                  while (i) {
+                     if ((((stbsp__uintptr)bf) & 3) == 0)
+                        break;
+                     *bf++ = ' ';
+                     --i;
+                  }
+                  while (i >= 4) {
+                     *(stbsp__uint32 *)bf = 0x20202020;
+                     bf += 4;
+                     i -= 4;
+                  }
+                  while (i) {
+                     *bf++ = ' ';
+                     --i;
+                  }
+                  stbsp__chk_cb_buf(1);
+               }
+
+            // copy leader
+            sn = lead + 1;
+            while (lead[0]) {
+               stbsp__cb_buf_clamp(i, lead[0]);
+               lead[0] -= (char)i;
+               while (i) {
+                  *bf++ = *sn++;
+                  --i;
+               }
+               stbsp__chk_cb_buf(1);
+            }
+
+            // copy leading zeros
+            c = cs >> 24;
+            cs &= 0xffffff;
+            cs = (fl & STBSP__TRIPLET_COMMA) ? ((stbsp__uint32)(c - ((pr + cs) % (c + 1)))) : 0;
+            while (pr > 0) {
+               stbsp__cb_buf_clamp(i, pr);
+               pr -= i;
+               if ((fl & STBSP__TRIPLET_COMMA) == 0) {
+                  while (i) {
+                     if ((((stbsp__uintptr)bf) & 3) == 0)
+                        break;
+                     *bf++ = '0';
+                     --i;
+                  }
+                  while (i >= 4) {
+                     *(stbsp__uint32 *)bf = 0x30303030;
+                     bf += 4;
+                     i -= 4;
+                  }
+               }
+               while (i) {
+                  if ((fl & STBSP__TRIPLET_COMMA) && (cs++ == c)) {
+                     cs = 0;
+                     *bf++ = stbsp__comma;
+                  } else
+                     *bf++ = '0';
+                  --i;
+               }
+               stbsp__chk_cb_buf(1);
+            }
+         }
+
+         // copy leader if there is still one
+         sn = lead + 1;
+         while (lead[0]) {
+            stbsp__int32 i;
+            stbsp__cb_buf_clamp(i, lead[0]);
+            lead[0] -= (char)i;
+            while (i) {
+               *bf++ = *sn++;
+               --i;
+            }
+            stbsp__chk_cb_buf(1);
+         }
+
+         // copy the string
+         n = l;
+         while (n) {
+            stbsp__int32 i;
+            stbsp__cb_buf_clamp(i, n);
+            n -= i;
+            STBSP__UNALIGNED(while (i >= 4) {
+               *(stbsp__uint32 volatile *)bf = *(stbsp__uint32 volatile *)s;
+               bf += 4;
+               s += 4;
+               i -= 4;
+            })
+            while (i) {
+               *bf++ = *s++;
+               --i;
+            }
+            stbsp__chk_cb_buf(1);
+         }
+
+         // copy trailing zeros
+         while (tz) {
+            stbsp__int32 i;
+            stbsp__cb_buf_clamp(i, tz);
+            tz -= i;
+            while (i) {
+               if ((((stbsp__uintptr)bf) & 3) == 0)
+                  break;
+               *bf++ = '0';
+               --i;
+            }
+            while (i >= 4) {
+               *(stbsp__uint32 *)bf = 0x30303030;
+               bf += 4;
+               i -= 4;
+            }
+            while (i) {
+               *bf++ = '0';
+               --i;
+            }
+            stbsp__chk_cb_buf(1);
+         }
+
+         // copy tail if there is one
+         sn = tail + 1;
+         while (tail[0]) {
+            stbsp__int32 i;
+            stbsp__cb_buf_clamp(i, tail[0]);
+            tail[0] -= (char)i;
+            while (i) {
+               *bf++ = *sn++;
+               --i;
+            }
+            stbsp__chk_cb_buf(1);
+         }
+
+         // handle the left justify
+         if (fl & STBSP__LEFTJUST)
+            if (fw > 0) {
+               while (fw) {
+                  stbsp__int32 i;
+                  stbsp__cb_buf_clamp(i, fw);
+                  fw -= i;
+                  while (i) {
+                     if ((((stbsp__uintptr)bf) & 3) == 0)
+                        break;
+                     *bf++ = ' ';
+                     --i;
+                  }
+                  while (i >= 4) {
+                     *(stbsp__uint32 *)bf = 0x20202020;
+                     bf += 4;
+                     i -= 4;
+                  }
+                  while (i--)
+                     *bf++ = ' ';
+                  stbsp__chk_cb_buf(1);
+               }
+            }
+         break;
+
+      default: // unknown, just copy code
+         s = num + STBSP__NUMSZ - 1;
+         *s = f[0];
+         l = 1;
+         fw = fl = 0;
+         lead[0] = 0;
+         tail[0] = 0;
+         pr = 0;
+         dp = 0;
+         cs = 0;
+         goto scopy;
+      }
+      ++f;
+   }
+endfmt:
+
+   if (!callback)
+      *bf = 0;
+   else
+      stbsp__flush_cb();
+
+done:
+   return tlen + (int)(bf - buf);
+}
+
+// cleanup
+#undef STBSP__LEFTJUST
+#undef STBSP__LEADINGPLUS
+#undef STBSP__LEADINGSPACE
+#undef STBSP__LEADING_0X
+#undef STBSP__LEADINGZERO
+#undef STBSP__INTMAX
+#undef STBSP__TRIPLET_COMMA
+#undef STBSP__NEGATIVE
+#undef STBSP__METRIC_SUFFIX
+#undef STBSP__NUMSZ
+#undef stbsp__chk_cb_bufL
+#undef stbsp__chk_cb_buf
+#undef stbsp__flush_cb
+#undef stbsp__cb_buf_clamp
+
+// ============================================================================
+//   wrapper functions
+
+STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(sprintf)(char *buf, char const *fmt, ...)
+{
+   va_list args; // variadic front end for vsprintfcb; no size limit for buf is passed along
+   int written;
+   va_start(args, fmt);
+   written = STB_SPRINTF_DECORATE(vsprintfcb)(0, 0, buf, fmt, args); // null callback, null user pointer
+   va_end(args);
+   return written; // vsprintfcb's return value, passed through unchanged
+}
+
+typedef struct stbsp__context {
+   char *buf;   // next write position in the caller's destination (advanced by stbsp__clamp_callback)
+   int count;   // room still left at buf, in bytes (decremented by stbsp__clamp_callback)
+   int length;  // running total of bytes the formatter produced, including bytes that did not fit
+   char tmp[STB_SPRINTF_MIN]; // scratch the callbacks hand back when output cannot go directly to buf
+} stbsp__context;
+
+static char *stbsp__clamp_callback(const char *buf, void *user, int len)
+{
+   stbsp__context *ctx = (stbsp__context *)user;
+   int room = ctx->count;
+   int take = (len > room) ? room : len; // bytes of this chunk that are kept
+   ctx->length += len; // the full chunk is counted even when it is clipped
+
+   if (take) {
+      // if buf already is the destination cursor the bytes are in place; otherwise copy them over
+      if (buf != ctx->buf) {
+         char *dst = ctx->buf;
+         const char *src = buf;
+         const char *stop = buf + take;
+         do {
+            *dst++ = *src++;
+         } while (src < stop);
+      }
+      ctx->buf += take;
+      ctx->count -= take;
+   }
+
+   // pick where the formatter writes next: straight into the destination only while
+   // at least STB_SPRINTF_MIN bytes of room remain, otherwise into the scratch buffer
+   if ((ctx->count > 0) && (ctx->count >= STB_SPRINTF_MIN))
+      return ctx->buf;
+   return ctx->tmp;
+}
+
+static char * stbsp__count_clamp_callback( const char * buf, void * user, int len )
+{
+   stbsp__context * ctx = (stbsp__context*)user;
+
+   (void) buf; // the bytes themselves are never looked at, only their count
+   ctx->length += len;
+   return ctx->tmp; // always hand back the scratch buffer, so nothing is stored
+}
+
+STBSP__PUBLICDEF int STB_SPRINTF_DECORATE( vsnprintf )( char * buf, int count, char const * fmt, va_list va )
+{
+   stbsp__context c; // returns the untruncated output length (c.length), terminating zero not counted
+
+   if ( (count <= 0) || !buf ) // no usable destination: measure only (count == 0 would put the terminator at buf[-1])
+   {
+      c.length = 0;
+      // every chunk lands in c.tmp and is dropped; only c.length is accumulated
+      STB_SPRINTF_DECORATE( vsprintfcb )( stbsp__count_clamp_callback, &c, c.tmp, fmt, va );
+   }
+   else
+   {
+      int l;
+
+      c.buf = buf;
+      c.count = count;
+      c.length = 0;
+      // the priming callback call with len 0 only selects the first write target (buf itself or c.tmp)
+      STB_SPRINTF_DECORATE( vsprintfcb )( stbsp__clamp_callback, &c, stbsp__clamp_callback(0,&c,0), fmt, va );
+
+      // zero-terminate; when the output filled the buffer the last stored byte is overwritten
+      l = (int)( c.buf - buf );
+      if ( l >= count ) // should never be greater, only equal (or less) than count
+         l = count - 1;
+      buf[l] = 0;
+   }
+
+   return c.length;
+}
+
+STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(snprintf)(char *buf, int count, char const *fmt, ...)
+{
+   va_list args;
+   int total;
+
+   // gather the variadic arguments and let vsnprintf do the bounded formatting
+   va_start(args, fmt);
+   total = STB_SPRINTF_DECORATE(vsnprintf)(buf, count, fmt, args);
+   va_end(args);
+   return total; // vsnprintf's return value, unchanged
+}
+
+STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsprintf)(char *buf, char const *fmt, va_list va)
+{
+   return STB_SPRINTF_DECORATE(vsprintfcb)(0, 0, buf, fmt, va); // va_list form of sprintf: null callback and user pointer, no size passed for buf
+}
+
+// =======================================================================
+//   low level float utility functions
+
+#ifndef STB_SPRINTF_NOFLOAT
+
+// copies the 8 bytes of src into dest one char at a time, which stays within strict-aliasing rules (this compiles to nothing on /Ox)
+#define STBSP__COPYFP(dest, src)                   \
+   {                                               \
+      int cn;                                      \
+      for (cn = 0; cn < 8; cn++)                   \
+         ((char *)&dest)[cn] = ((char *)&src)[cn]; \
+   }
+
+// split a double (IEEE-754 binary64 layout assumed): *bits gets the 52-bit fraction field, *expo the unbiased exponent; returns the sign bit
+static stbsp__int32 stbsp__real_to_parts(stbsp__int64 *bits, stbsp__int32 *expo, double value)
+{
+   stbsp__int64 raw = 0;
+   double src = value;
+
+   // byte-wise copy of the double's representation into a 64-bit integer
+   STBSP__COPYFP(raw, src);
+
+   // low 52 bits: the stored fraction, exactly as encoded (no implicit leading 1 is added)
+   *bits = raw & ((((stbsp__uint64)1) << 52) - 1);
+   // next 11 bits: the biased exponent, with the bias of 1023 removed
+   *expo = (stbsp__int32)(((raw >> 52) & 2047) - 1023);
+
+   return (stbsp__int32)((stbsp__uint64) raw >> 63); // 1 when the sign bit is set, else 0
+}
+
+static double const stbsp__bot[23] = { // 10^0 .. 10^22, indexed by exponent
+   1e+000, 1e+001, 1e+002, 1e+003, 1e+004, 1e+005, 1e+006, 1e+007, 1e+008, 1e+009, 1e+010, 1e+011,
+   1e+012, 1e+013, 1e+014, 1e+015, 1e+016, 1e+017, 1e+018, 1e+019, 1e+020, 1e+021, 1e+022
+};
+static double const stbsp__negbot[22] = { // 10^-1 .. 10^-22; index k holds 10^-(k+1)
+   1e-001, 1e-002, 1e-003, 1e-004, 1e-005, 1e-006, 1e-007, 1e-008, 1e-009, 1e-010, 1e-011,
+   1e-012, 1e-013, 1e-014, 1e-015, 1e-016, 1e-017, 1e-018, 1e-019, 1e-020, 1e-021, 1e-022
+};
+static double const stbsp__negboterr[22] = { // companion to stbsp__negbot, used as the low-order (yl) factor in stbsp__raise_to_power10; presumably each entry's rounding error — TODO confirm
+   -5.551115123125783e-018,  -2.0816681711721684e-019, -2.0816681711721686e-020, -4.7921736023859299e-021, -8.1803053914031305e-022, 4.5251888174113741e-023,
+   4.5251888174113739e-024,  -2.0922560830128471e-025, -6.2281591457779853e-026, -3.6432197315497743e-027, 6.0503030718060191e-028,  2.0113352370744385e-029,
+   -3.0373745563400371e-030, 1.1806906454401013e-032,  -7.7705399876661076e-032, 2.0902213275965398e-033,  -7.1542424054621921e-034, -7.1542424054621926e-035,
+   2.4754073164739869e-036,  5.4846728545790429e-037,  9.2462547772103625e-038,  -4.8596774326570872e-039
+};
+static double const stbsp__top[13] = { // 10^23 .. 10^299 in steps of 23; index k holds 10^(23*(k+1))
+   1e+023, 1e+046, 1e+069, 1e+092, 1e+115, 1e+138, 1e+161, 1e+184, 1e+207, 1e+230, 1e+253, 1e+276, 1e+299
+};
+static double const stbsp__negtop[13] = { // 10^-23 .. 10^-299; index k holds 10^-(23*(k+1))
+   1e-023, 1e-046, 1e-069, 1e-092, 1e-115, 1e-138, 1e-161, 1e-184, 1e-207, 1e-230, 1e-253, 1e-276, 1e-299
+};
+static double const stbsp__toperr[13] = { // companion to stbsp__top, passed as the yl argument of stbsp__ddmultlo
+   8388608,
+   6.8601809640529717e+028,
+   -7.253143638152921e+052,
+   -4.3377296974619174e+075,
+   -1.5559416129466825e+098,
+   -3.2841562489204913e+121,
+   -3.7745893248228135e+144,
+   -1.7356668416969134e+167,
+   -3.8893577551088374e+190,
+   -9.9566444326005119e+213,
+   6.3641293062232429e+236,
+   -5.2069140800249813e+259,
+   -5.2504760255204387e+282
+};
+static double const stbsp__negtoperr[13] = { // companion to stbsp__negtop, passed as the yl argument of stbsp__ddmultlo
+   3.9565301985100693e-040,  -2.299904345391321e-063,  3.6506201437945798e-086,  1.1875228833981544e-109,
+   -5.0644902316928607e-132, -6.7156837247865426e-155, -2.812077463003139e-178,  -5.7778912386589953e-201,
+   7.4997100559334532e-224,  -4.6439668915134491e-247, -6.3691100762962136e-270, -9.436808465446358e-293,
+   8.0970921678014997e-317
+};
+
+#if defined(_MSC_VER) && (_MSC_VER <= 1200) // compilers matched here get the table without ULL suffixes
+static stbsp__uint64 const stbsp__powten[20] = { // 10^0 .. 10^19, indexed by exponent
+   1,
+   10,
+   100,
+   1000,
+   10000,
+   100000,
+   1000000,
+   10000000,
+   100000000,
+   1000000000,
+   10000000000,
+   100000000000,
+   1000000000000,
+   10000000000000,
+   100000000000000,
+   1000000000000000,
+   10000000000000000,
+   100000000000000000,
+   1000000000000000000,
+   10000000000000000000U
+};
+#define stbsp__tento19th ((stbsp__uint64)1000000000000000000) // value is 10^18 (same as stbsp__powten[18]): the smallest 19-digit integer
+#else
+static stbsp__uint64 const stbsp__powten[20] = { // 10^0 .. 10^19, indexed by exponent
+   1,
+   10,
+   100,
+   1000,
+   10000,
+   100000,
+   1000000,
+   10000000,
+   100000000,
+   1000000000,
+   10000000000ULL,
+   100000000000ULL,
+   1000000000000ULL,
+   10000000000000ULL,
+   100000000000000ULL,
+   1000000000000000ULL,
+   10000000000000000ULL,
+   100000000000000000ULL,
+   1000000000000000000ULL,
+   10000000000000000000ULL
+};
+#define stbsp__tento19th (1000000000000000000ULL) // value is 10^18 (same as stbsp__powten[18]): the smallest 19-digit integer
+#endif
+// stbsp__ddmulthi: oh = xh * yh; ol = what is left of the product after oh, built from operands split into a head (low 27 bits of the bit pattern cleared) and a tail
+#define stbsp__ddmulthi(oh, ol, xh, yh)                            \
+   {                                                               \
+      double ahi = 0, alo, bhi = 0, blo;                           \
+      stbsp__int64 bt;                                             \
+      oh = xh * yh;                                                \
+      STBSP__COPYFP(bt, xh);                                       \
+      bt &= ((~(stbsp__uint64)0) << 27);                           \
+      STBSP__COPYFP(ahi, bt);                                      \
+      alo = xh - ahi;                                              \
+      STBSP__COPYFP(bt, yh);                                       \
+      bt &= ((~(stbsp__uint64)0) << 27);                           \
+      STBSP__COPYFP(bhi, bt);                                      \
+      blo = yh - bhi;                                              \
+      ol = ((ahi * bhi - oh) + ahi * blo + alo * bhi) + alo * blo; \
+   }
+// stbsp__ddtoS64: ob = xh truncated to a 64-bit integer, plus the truncated value of (what that cast dropped from xh) + xl
+#define stbsp__ddtoS64(ob, xh, xl)          \
+   {                                        \
+      double ahi = 0, alo, vh, t;           \
+      ob = (stbsp__int64)xh;                \
+      vh = (double)ob;                      \
+      ahi = (xh - vh);                      \
+      t = (ahi - xh);                       \
+      alo = (xh - (ahi - t)) - (vh + t);    \
+      ob += (stbsp__int64)(ahi + alo + xl); \
+   }
+// stbsp__ddrenorm: replace the pair (oh, ol) by s = oh + ol in oh and the remainder ol - (s - oh) in ol
+#define stbsp__ddrenorm(oh, ol) \
+   {                            \
+      double s;                 \
+      s = oh + ol;              \
+      ol = ol - (s - oh);       \
+      oh = s;                   \
+   }
+// stbsp__ddmultlo: add the cross terms xh * yl + xl * yh into ol (oh is not touched)
+#define stbsp__ddmultlo(oh, ol, xh, xl, yh, yl) ol = ol + (xh * yl + xl * yh);
+// stbsp__ddmultlos: add the single cross term xh * yl into ol (oh is not touched)
+#define stbsp__ddmultlos(oh, ol, xh, yl) ol = ol + (xh * yl);
+
+static void stbsp__raise_to_power10(double *ohi, double *olo, double d, stbsp__int32 power) // stores d * 10^power as a high/low pair in *ohi / *olo; power can be -323 to +350
+{
+   double ph, pl;
+   if ((power >= 0) && (power <= 22)) {
+      stbsp__ddmulthi(ph, pl, d, stbsp__bot[power]); // small non-negative powers: one table multiply
+   } else {
+      stbsp__int32 e, et, eb;
+      double p2h, p2l;
+      // split |power| as e = 23 * et + eb: stbsp__top/stbsp__negtop cover the multiples of 23, stbsp__bot/stbsp__negbot the rest
+      e = power;
+      if (power < 0)
+         e = -e;
+      et = (e * 0x2c9) >> 14; /* multiply-shift form of e / 23 (0x2c9 / 2^14 is about 1 / 23) */
+      if (et > 13)
+         et = 13; // the top tables have only 13 entries; any excess stays in eb
+      eb = e - (et * 23);
+      // NOTE(review): the table indices below stay in bounds only for -321 <= power <= 343; the caller visible in this file passes 18 - tens — confirm that range
+      ph = d;
+      pl = 0.0;
+      if (power < 0) {
+         if (eb) {
+            --eb; // stbsp__negbot[k] holds 10^-(k+1)
+            stbsp__ddmulthi(ph, pl, d, stbsp__negbot[eb]);
+            stbsp__ddmultlos(ph, pl, d, stbsp__negboterr[eb]);
+         }
+         if (et) {
+            stbsp__ddrenorm(ph, pl);
+            --et; // stbsp__negtop[k] holds 10^-(23*(k+1))
+            stbsp__ddmulthi(p2h, p2l, ph, stbsp__negtop[et]);
+            stbsp__ddmultlo(p2h, p2l, ph, pl, stbsp__negtop[et], stbsp__negtoperr[et]);
+            ph = p2h;
+            pl = p2l;
+         }
+      } else {
+         if (eb) {
+            e = eb; // eb can exceed 22 only when et was clamped to 13 above
+            if (eb > 22)
+               eb = 22;
+            e -= eb; // e is now the exponent left over after the first stbsp__bot factor
+            stbsp__ddmulthi(ph, pl, d, stbsp__bot[eb]);
+            if (e) {
+               stbsp__ddrenorm(ph, pl);
+               stbsp__ddmulthi(p2h, p2l, ph, stbsp__bot[e]); // second stbsp__bot factor for the leftover exponent
+               stbsp__ddmultlos(p2h, p2l, stbsp__bot[e], pl);
+               ph = p2h;
+               pl = p2l;
+            }
+         }
+         if (et) {
+            stbsp__ddrenorm(ph, pl);
+            --et; // stbsp__top[k] holds 10^(23*(k+1))
+            stbsp__ddmulthi(p2h, p2l, ph, stbsp__top[et]);
+            stbsp__ddmultlo(p2h, p2l, ph, pl, stbsp__top[et], stbsp__toperr[et]);
+            ph = p2h;
+            pl = p2l;
+         }
+      }
+   }
+   stbsp__ddrenorm(ph, pl);
+   *ohi = ph;
+   *olo = pl;
+}
+
+// given a double, points *start at its decimal digits (*len of them, no sign or point) and stores the
+//   position of the decimal point in *decimal_pos; returns 1 when the sign bit is set, else 0.  +/-INF and
+//   NAN yield the text "Inf"/"NaN" with *decimal_pos set to STBSP__SPECIAL.  Digits are written below out + 64.
+// frac_digits counts digits after the decimal point; OR in 0x80000000 to count from the first significant digit instead (for %g and %e)
+static stbsp__int32 stbsp__real_to_str(char const **start, stbsp__uint32 *len, char *out, stbsp__int32 *decimal_pos, double value, stbsp__uint32 frac_digits)
+{
+   double d;
+   stbsp__int64 bits = 0;
+   stbsp__int32 expo, e, ng, tens;
+
+   d = value;
+   STBSP__COPYFP(bits, d);
+   expo = (stbsp__int32)((bits >> 52) & 2047); // biased exponent field
+   ng = (stbsp__int32)((stbsp__uint64) bits >> 63); // sign bit
+   if (ng)
+      d = -d; // work on the magnitude from here on
+
+   if (expo == 2047) // is nan or inf?
+   {
+      *start = (bits & ((((stbsp__uint64)1) << 52) - 1)) ? "NaN" : "Inf"; // a non-zero fraction field selects "NaN"
+      *decimal_pos = STBSP__SPECIAL;
+      *len = 3;
+      return ng;
+   }
+
+   if (expo == 0) // is zero or denormal
+   {
+      if (((stbsp__uint64) bits << 1) == 0) // do zero
+      {
+         *decimal_pos = 1;
+         *start = out;
+         out[0] = '0';
+         *len = 1;
+         return ng;
+      }
+      // find the right expo for denormals
+      {
+         stbsp__int64 v = ((stbsp__uint64)1) << 51;
+         while ((bits & v) == 0) { // one step down per leading zero bit of the fraction field
+            --expo;
+            v >>= 1;
+         }
+      }
+   }
+
+   // find the decimal exponent as well as the decimal bits of the value
+   {
+      double ph, pl;
+
+      // log10 estimate - very specifically tweaked to hit or undershoot by no more than 1 of log10 of all expos 1..2046
+      tens = expo - 1023;
+      tens = (tens < 0) ? ((tens * 617) / 2048) : (((tens * 1233) / 4096) + 1);
+
+      // move the significant bits into position and stick them into an int
+      stbsp__raise_to_power10(&ph, &pl, d, 18 - tens);
+
+      // get as much precision from the double-double as possible
+      stbsp__ddtoS64(bits, ph, pl);
+
+      // check if we undershot
+      if (((stbsp__uint64)bits) >= stbsp__tento19th)
+         ++tens;
+   }
+
+   // now do the rounding in integer land
+   frac_digits = (frac_digits & 0x80000000) ? ((frac_digits & 0x7ffffff) + 1) : (tens + frac_digits); // digits to keep; NOTE(review): mask is 0x7ffffff (27 bits), not 0x7fffffff — confirm whether intended
+   if ((frac_digits < 24)) { // 24 or more requested digits: no rounding is done
+      stbsp__uint32 dg = 1; // dg ends up as the number of decimal digits in bits
+      if ((stbsp__uint64)bits >= stbsp__powten[9])
+         dg = 10; // skip ahead for large values
+      while ((stbsp__uint64)bits >= stbsp__powten[dg]) {
+         ++dg;
+         if (dg == 20) // stbsp__powten has no entry 20
+            goto noround;
+      }
+      if (frac_digits < dg) {
+         stbsp__uint64 r;
+         // add 0.5 at the right position and round
+         e = dg - frac_digits; // number of low digits to drop
+         if ((stbsp__uint32)e >= 24)
+            goto noround;
+         r = stbsp__powten[e];
+         bits = bits + (r / 2);
+         if ((stbsp__uint64)bits >= stbsp__powten[dg]) // the rounding carried into one more digit
+            ++tens;
+         bits /= r;
+      }
+   noround:;
+   }
+
+   // kill long trailing runs of zeros (removed three at a time, so up to two zeros can remain)
+   if (bits) {
+      stbsp__uint32 n;
+      for (;;) {
+         if (bits <= 0xffffffff) // fits in 32 bits: finish with the 32-bit loop below
+            break;
+         if (bits % 1000)
+            goto donez;
+         bits /= 1000;
+      }
+      n = (stbsp__uint32)bits;
+      while ((n % 1000) == 0)
+         n /= 1000;
+      bits = n;
+   donez:;
+   }
+
+   // convert to string
+   out += 64; // digits are written backwards, ending at out + 64
+   e = 0; // e counts the characters produced
+   for (;;) {
+      stbsp__uint32 n;
+      char *o = out - 8; // where this 8-digit chunk must start if a higher chunk follows
+      // do the conversion in chunks of U32s (avoid most 64-bit divides, worth it, constant denominators be damned)
+      if (bits >= 100000000) {
+         n = (stbsp__uint32)(bits % 100000000);
+         bits /= 100000000;
+      } else {
+         n = (stbsp__uint32)bits;
+         bits = 0;
+      }
+      while (n) {
+         out -= 2;
+         *(stbsp__uint16 *)out = *(stbsp__uint16 *)&stbsp__digitpair.pair[(n % 100) * 2]; // two digit characters at once from the pair table
+         n /= 100;
+         e += 2;
+      }
+      if (bits == 0) {
+         if ((e) && (out[0] == '0')) { // digits are emitted in pairs, so drop a leading '0'
+            ++out;
+            --e;
+         }
+         break;
+      }
+      while (out != o) { // pad this chunk with zeros up to its full 8 digits
+         *--out = '0';
+         ++e;
+      }
+   }
+
+   *decimal_pos = tens;
+   *start = out;
+   *len = e;
+   return ng;
+}
+
+#undef stbsp__ddmulthi
+#undef stbsp__ddrenorm
+#undef stbsp__ddmultlo
+#undef stbsp__ddmultlos
+#undef STBSP__SPECIAL
+#undef STBSP__COPYFP
+
+#endif // STB_SPRINTF_NOFLOAT
+
+// clean up
+#undef stbsp__uint16
+#undef stbsp__uint32
+#undef stbsp__int32
+#undef stbsp__uint64
+#undef stbsp__int64
+#undef STBSP__UNALIGNED
+
+#endif // STB_SPRINTF_IMPLEMENTATION
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/include/aoc/string.h b/include/aoc/string.h
new file mode 100644
index 0000000..2ce47fd
--- /dev/null
+++ b/include/aoc/string.h
@@ -0,0 +1,3 @@
+#pragma once
+
+void strreverse(char *begin, char *end);
diff --git a/include/solve.h b/include/solve.h
new file mode 100644
index 0000000..52d0ed6
--- /dev/null
+++ b/include/solve.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <stddef.h>
+
+typedef struct Solution {
+    char part1[64]; // presumably the answer text for part 1 — confirm against the solve() implementations
+    char part2[64]; // presumably the answer text for part 2 — confirm against the solve() implementations
+} Solution;
+
+void solve(const char *buf, size_t buf_size, Solution *result);
+int solve_input(const char *fname, Solution *result);
diff --git a/input/README.md b/input/README.md
new file mode 100644
index 0000000..01f01f4
--- /dev/null
+++ b/input/README.md
@@ -0,0 +1,6 @@
+# Input Data
+
+Since sharing the puzzle input data is discouraged by AoC guidelines, the puzzle input data is **not part of this repository**.
+
+For the first day's puzzle, ensure that the input file is named `day01.txt` and stored in this directory.
+Follow the same pattern for subsequent days.
diff --git a/justfile b/justfile
new file mode 100644
index 0000000..319f0b7
--- /dev/null
+++ b/justfile
@@ -0,0 +1,36 @@
+YEAR := "2023"
+
+_default:
+    just --list
+
+# Re-run the debug test suite whenever a C source or header changes
+watch:
+    #!/usr/bin/env bash
+    fd '\.[ch]$' -t f | entr ninja -C build_debug test
+
+# Scaffold DAY: fetch puzzle and input, copy the template, register it in meson.build
+generate DAY:
+    #!/usr/bin/env bash
+    set -eu
+    just download-puzzle {{ DAY }}
+    [[ -d "src/day{{ DAY }}" ]] || {
+        cp -av src/template "src/day{{ DAY }}"
+        sed -i -e 's/dayXX/day{{ DAY }}/' "src/day{{ DAY }}/solve_test.c"
+    }
+    grep -q "day{{ DAY }}" meson.build || {
+        sed -i "/# XXX: marker/i\  'day{{ DAY }}': [ 'src/day{{ DAY }}/solve.c' ]," meson.build
+    }
+    [[ -e ./input/day{{ DAY }}.txt ]] || {
+        aoc download --year {{ YEAR }} --day {{ DAY }} --input-only --input-file ./input/day{{ DAY }}.txt
+    }
+
+# Submit ANSWER for PART of DAY
+submit DAY PART ANSWER:
+    aoc submit --year {{ YEAR }} --day {{ DAY }} {{ PART }} {{ ANSWER }}
+
+# Save the puzzle description of DAY to puzzle/day<DAY>.md
+download-puzzle DAY:
+    #!/usr/bin/env bash
+    set -eu
+    mkdir -p puzzle
+    aoc --puzzle-only --year {{ YEAR }} --day {{ DAY }} > puzzle/day{{ DAY }}.md
diff --git a/lib/aoc/io.c b/lib/aoc/io.c
new file mode 100644
index 0000000..faa1687
--- /dev/null
+++ b/lib/aoc/io.c
@@ -0,0 +1,48 @@
+#include <stdio.h>
+
+#include "aoc/io.h"
+#include "aoc/macros.h"
+
+/* Cleanup handler for _cleanup_(): closes the stream *fp unless it is NULL. */
+static inline void my_fclose(FILE **fp) {
+    FILE *f = *fp;
+    if (f) fclose(f);
+}
+
+/*
+ * Reads the whole file `fname` into `buf` and NUL-terminates it.
+ *
+ * Returns the number of bytes read, or a negative value on failure: -1 if the
+ * file cannot be opened, -2 if it does not fit into `buf` (one byte is kept
+ * for the terminator), -3 if its size cannot be determined or reading fails.
+ */
+int read_input(const char *fname, char *buf, size_t buf_size) {
+    _cleanup_(my_fclose) FILE *f = fopen(fname, "r");
+    if (!f) {
+        perror("Error opening input");
+        return -1;
+    }
+
+    /* ftell() signals failure with -1L; keep the size signed until checked so
+     * that a failure is not misreported as an oversized file. */
+    long fsize = fseek(f, 0, SEEK_END) == 0 ? ftell(f) : -1L;
+    if (fsize < 0) {
+        perror("Error determining input size");
+        return -3;
+    }
+    if ((size_t)fsize >= buf_size) {
+        fprintf(stderr, "File (%ld) too large for buffer (%zu)\n", fsize,
+                buf_size);
+        return -2;
+    }
+    rewind(f);
+
+    size_t end = fread(buf, sizeof(char), (size_t)fsize, f);
+    if (end != (size_t)fsize) {
+        perror("Error reading file");
+        return -3;
+    }
+    /* ensure buf is null-terminated */
+    buf[end] = '\0';
+    return (int)end;
+}
diff --git a/lib/aoc/log.c b/lib/aoc/log.c
new file mode 100644
index 0000000..c996c3d
--- /dev/null
+++ b/lib/aoc/log.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2017 rxi
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <time.h>
+#include <errno.h>
+
+#include "aoc/log.h"
+
+static struct {
+  void *udata;     /* opaque pointer handed back to the lock callback */
+  log_LockFn lock; /* optional lock callback; NULL disables locking */
+  FILE *fp;        /* optional log file sink; NULL disables it */
+  int level;       /* records below this level are dropped entirely */
+  int filelevel;   /* additional minimum level for the file sink */
+  int quiet;       /* non-zero suppresses the stderr sink */
+} L;
+
+/* Display name per log level; indexed directly by log_log's `level`. */
+static const char *level_names[] = {
+  "TRACE", "DEBUG", "INFO", "WARN", "ERROR", "FATAL"
+};
+
+#ifdef LOG_USE_COLOR
+static const char *level_colors[] = { /* ANSI color escape per level */
+  "\x1b[94m", "\x1b[36m", "\x1b[32m", "\x1b[33m", "\x1b[31m", "\x1b[35m"
+};
+#endif
+
+/* Invokes the installed lock callback with flag 1 (acquire); does nothing
+ * when no callback has been set. */
+static void lock(void) {
+  log_LockFn acquire = L.lock;
+  if (acquire != NULL) acquire(L.udata, 1);
+}
+
+/* Invokes the installed lock callback with flag 0 (release); does nothing
+ * when no callback has been set. */
+static void unlock(void) {
+  log_LockFn release = L.lock;
+  if (release != NULL) release(L.udata, 0);
+}
+
+/* Sets the opaque pointer that is passed to the lock callback. */
+void log_set_udata(void *udata) {
+  L.udata = udata;
+}
+
+/* Installs the lock callback; NULL disables locking. */
+void log_set_lock(log_LockFn fn) {
+  L.lock = fn;
+}
+
+/* Sets the log file sink; NULL disables file logging. */
+void log_set_fp(FILE *fp) {
+  L.fp = fp;
+}
+
+/* Sets the global minimum level; lower records are dropped for all sinks. */
+void log_set_level(int level) {
+  L.level = level;
+}
+
+void log_set_file_level(int level) { /* minimum level for the file sink */
+  L.filelevel = level;
+}
+
+/* Non-zero `enable` silences the stderr sink; the file sink is unaffected. */
+void log_set_quiet(int enable) {
+  L.quiet = enable ? 1 : 0;
+}
+
+/* Appends ": <errno text>" to `out`. NOTE(review): relies on the XSI
+ * strerror_r filling `msg`; `msg` starts empty in case it does not. */
+static void log_errno_suffix(FILE *out, int errnum) {
+  char msg[1024] = "";
+  strerror_r(errnum, msg, sizeof(msg));
+  fprintf(out, ": %s", msg);
+}
+
+/* Emits one record to stderr (unless quiet) and to the log file (if set and
+ * level >= file level); records below the global level are dropped entirely.
+ * With do_perror set and level >= LOG_ERROR the errno text is appended. */
+void log_log(int level, const int do_perror, const char *file, int line, const char *fmt, ...) {
+  if (level < L.level) {
+    return;
+  }
+  /* Snapshot errno first: the library calls below may overwrite it. */
+  const int errnum = errno;
+  const int with_errno = level >= LOG_ERROR && do_perror;
+
+  lock();
+
+  time_t t = time(NULL);
+  struct tm *lt = localtime(&t);
+
+  /* Log to stderr */
+  if (!L.quiet) {
+    va_list args;
+    char buf[16];
+    buf[strftime(buf, sizeof(buf), "%H:%M:%S", lt)] = '\0';
+#ifdef LOG_USE_COLOR
+    fprintf(
+      stderr, "%s %s%-5s\x1b[0m \x1b[90m%s:%d:\x1b[0m ",
+      buf, level_colors[level], level_names[level], file, line);
+#else
+    fprintf(stderr, "%s %-5s %s:%d: ", buf, level_names[level], file, line);
+#endif
+    va_start(args, fmt);
+    vfprintf(stderr, fmt, args);
+    va_end(args);
+    if (with_errno) log_errno_suffix(stderr, errnum);
+    fprintf(stderr, "\n");
+    fflush(stderr);
+  }
+
+  /* Log to file */
+  if (L.fp && level >= L.filelevel) {
+    va_list args;
+    char buf[32];
+    buf[strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", lt)] = '\0';
+    fprintf(L.fp, "%s %-5s %s:%d: ", buf, level_names[level], file, line);
+    va_start(args, fmt);
+    vfprintf(L.fp, fmt, args);
+    va_end(args);
+    if (with_errno) log_errno_suffix(L.fp, errnum);
+    fprintf(L.fp, "\n");
+    fflush(L.fp);
+  }
+
+  unlock();
+}
diff --git a/lib/aoc/math.c b/lib/aoc/math.c
new file mode 100644
index 0000000..18ec6d1
--- /dev/null
+++ b/lib/aoc/math.c
@@ -0,0 +1,69 @@
+#include <assert.h>
+#include <stdint.h>
+
+#include "aoc/macros.h"
+#include "aoc/math.h"
+#include "aoc/string.h"
+
+// Writes `value` in `base` (2..36, lowercase digits) to `str`, NUL-terminated,
+// and returns the string length. `str` must be large enough for the result.
+size_t itoa(int value, char *str, int base) {
+    static const char num[] = "0123456789abcdefghijklmnopqrstuvwxyz";
+    char *wstr = str;
+
+    // Validate base: `num` has 36 digits, so bases 2..36 are representable
+    assert(base >= 2 && base <= 36);
+    const unsigned int radix = (unsigned int)base;
+
+    // Work on the magnitude as unsigned: `-value` overflows for INT_MIN
+    unsigned int mag = (unsigned int)value;
+    if (value < 0) mag = 0u - mag;
+
+    // Conversion. Number is reversed.
+    do { *wstr++ = num[mag % radix]; } while (mag /= radix);
+    if (value < 0) *wstr++ = '-';
+    *wstr = '\0';
+    strreverse(str, wstr - 1);
+    return (size_t)(wstr - str);
+}
+
+// Extended Euclid. Returns gcd(a, b) and stores Bezout coefficients such that
+// gcd == (*s) * b + (*t) * a (note: *s pairs with b, *t pairs with a).
+int gcdx(int a, int b, int *s, int *t) {
+    int coef_b = 1, next_b = 0; // coefficients of the original b
+    int coef_a = 0, next_a = 1; // coefficients of the original a
+
+    while (a != 0) {
+        const int quot = b / a;
+        const int rem = b % a;
+        const int upd_b = coef_b - quot * next_b;
+        const int upd_a = coef_a - quot * next_a;
+
+        // Rotate (b, a) <- (a, rem) together with their coefficient pairs
+        b = a;
+        a = rem;
+        coef_b = next_b;
+        next_b = upd_b;
+        coef_a = next_a;
+        next_a = upd_a;
+    }
+
+    *s = coef_b;
+    *t = coef_a;
+    return b;
+}
+
+// Subtracts `mod` from `n` once when n >= mod (full reduction if n < 2 * mod).
+int fastmod(int n, int mod) {
+    return n >= mod ? n - mod : n;
+}
+
+// Modular inverse of `b` modulo `mod`; assumes mod > 0 and gcd(b, mod) == 1.
+int modinv(int b, int mod) {
+    int s, t;
+    gcdx(b, mod, &s, &t); // 1 == s * mod + t * b, hence t is the inverse of b
+    return t < 0 ? t + mod : t;
+}
+
+// Signum of `val`: -1 for negative, 0 for zero, 1 for positive.
+int sgn(int val) { return val > 0 ? 1 : (val < 0 ? -1 : 0); }
diff --git a/lib/aoc/md5.c b/lib/aoc/md5.c
new file mode 100644
index 0000000..b7666a7
--- /dev/null
+++ b/lib/aoc/md5.c
@@ -0,0 +1,227 @@
+/*
+ * Derived from the RSA Data Security, Inc. MD5 Message-Digest Algorithm
+ * and modified slightly to be functionally identical but condensed into control
+ * structures.
+ *
+ * Source: https://github.com/Zunawe/md5-c
+ */
+
+#include "aoc/md5.h"
+
+/*
+ * Constants defined by the MD5 algorithm
+ */
+#define A 0x67452301
+#define B 0xefcdab89
+#define C 0x98badcfe
+#define D 0x10325476
+
+static uint32_t S[] = {7,  12, 17, 22, 7,  12, 17, 22, 7,  12, 17, 22, 7,
+                       12, 17, 22, 5,  9,  14, 20, 5,  9,  14, 20, 5,  9,
+                       14, 20, 5,  9,  14, 20, 4,  11, 16, 23, 4,  11, 16,
+                       23, 4,  11, 16, 23, 4,  11, 16, 23, 6,  10, 15, 21,
+                       6,  10, 15, 21, 6,  10, 15, 21, 6,  10, 15, 21};
+
+static uint32_t K[] = {
+    0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a,
+    0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
+    0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340,
+    0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
+    0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8,
+    0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
+    0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa,
+    0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
+    0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92,
+    0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
+    0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391};
+
+/*
+ * Padding used to make the size (in bits) of the input congruent to 448 mod 512
+ */
+static uint8_t PADDING[] = {
+    0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+
+/*
+ * MD5 round functions; arguments are parenthesized so expressions expand safely
+ */
+#define F(X, Y, Z) (((X) & (Y)) | (~(X) & (Z)))
+#define G(X, Y, Z) (((X) & (Z)) | ((Y) & ~(Z)))
+#define H(X, Y, Z) ((X) ^ (Y) ^ (Z))
+#define I(X, Y, Z) ((Y) ^ ((X) | ~(Z)))
+
+/* Rotates a 32-bit word left by n bits. n is reduced modulo 32 so that neither
+ * shift count can reach 32, which would be undefined behaviour. */
+uint32_t rotateLeft(uint32_t x, uint32_t n) {
+    n &= 31;
+    return (x << n) | (x >> ((32 - n) & 31));
+}
+
+/*
+ * Reset a context: no bytes consumed yet, state words set to the MD5 constants
+ */
+void md5Init(MD5Context *ctx) {
+    const uint32_t initial_state[4] = {A, B, C, D};
+
+    ctx->size = 0;
+    for (unsigned int word = 0; word < 4; ++word) {
+        ctx->buffer[word] = initial_state[word];
+    }
+}
+
+/*
+ * Add some amount of input to the context
+ *
+ * If the input fills out a block of 512 bits, apply the algorithm (md5Step)
+ * and save the result in the buffer. Also updates the overall size.
+ */
+void md5Update(MD5Context *ctx, uint8_t *input_buffer, size_t input_len) {
+    uint32_t input[16];
+    unsigned int offset = ctx->size % 64;
+    ctx->size += (uint64_t)input_len;
+
+    // Copy each byte in input_buffer into the next space in our context input.
+    // The index is a size_t so that it can reach every value of input_len.
+    for (size_t i = 0; i < input_len; ++i) {
+        ctx->input[offset++] = input_buffer[i];
+
+        // Once the context input holds a full 64-byte chunk, convert it into
+        // the local word array, run it through the algorithm and start
+        // filling the chunk from offset 0 again.
+        if (offset % 64 == 0) {
+            for (unsigned int j = 0; j < 16; ++j) {
+                // Convert to little-endian
+                // The local variable `input` is our 512-bit chunk separated
+                // into 32-bit words we can use in calculations
+                input[j] = (uint32_t)(ctx->input[(j * 4) + 3]) << 24 |
+                           (uint32_t)(ctx->input[(j * 4) + 2]) << 16 |
+                           (uint32_t)(ctx->input[(j * 4) + 1]) << 8 |
+                           (uint32_t)(ctx->input[(j * 4)]);
+            }
+            md5Step(ctx->buffer, input);
+            offset = 0;
+        }
+    }
+}
+
+/*
+ * Pad the current input to 56 bytes modulo 64 (448 bits mod 512), append the
+ * size in bits to the very end, and save the final result into ctx->digest.
+ */
+void md5Finalize(MD5Context *ctx) {
+    uint32_t input[16];
+    unsigned int offset = ctx->size % 64;
+    unsigned int padding_length =
+        offset < 56 ? 56 - offset : (56 + 64) - offset;
+
+    // Fill in the padding and undo the changes to size that resulted from the
+    // update
+    md5Update(ctx, PADDING, padding_length);
+    ctx->size -= (uint64_t)padding_length;
+
+    // Do a final update (internal to this function)
+    // Last two 32-bit words are the two halves of the size (converted from
+    // bytes to bits)
+    for (unsigned int j = 0; j < 14; ++j) {
+        input[j] = (uint32_t)(ctx->input[(j * 4) + 3]) << 24 |
+                   (uint32_t)(ctx->input[(j * 4) + 2]) << 16 |
+                   (uint32_t)(ctx->input[(j * 4) + 1]) << 8 |
+                   (uint32_t)(ctx->input[(j * 4)]);
+    }
+    input[14] = (uint32_t)(ctx->size * 8);
+    input[15] = (uint32_t)((ctx->size * 8) >> 32);
+
+    md5Step(ctx->buffer, input);
+
+    // Move the result into digest (convert from little-endian)
+    for (unsigned int i = 0; i < 4; ++i) {
+        ctx->digest[(i * 4) + 0] = (uint8_t)((ctx->buffer[i] & 0x000000FF));
+        ctx->digest[(i * 4) + 1] =
+            (uint8_t)((ctx->buffer[i] & 0x0000FF00) >> 8);
+        ctx->digest[(i * 4) + 2] =
+            (uint8_t)((ctx->buffer[i] & 0x00FF0000) >> 16);
+        ctx->digest[(i * 4) + 3] =
+            (uint8_t)((ctx->buffer[i] & 0xFF000000) >> 24);
+    }
+}
+
+/*
+ * Mix one 16-word (512-bit) block `input` into the 4-word state `buffer`.
+ */
+void md5Step(uint32_t *buffer, uint32_t *input) {
+    uint32_t AA = buffer[0];
+    uint32_t BB = buffer[1];
+    uint32_t CC = buffer[2];
+    uint32_t DD = buffer[3];
+
+    uint32_t E; // round-function output for the current step
+
+    unsigned int j; // index of the input word consumed by the current step
+
+    for (unsigned int i = 0; i < 64; ++i) {
+        switch (i / 16) { // 64 steps in four groups of 16, one function each
+        case 0:
+            E = F(BB, CC, DD);
+            j = i;
+            break;
+        case 1:
+            E = G(BB, CC, DD);
+            j = ((i * 5) + 1) % 16;
+            break;
+        case 2:
+            E = H(BB, CC, DD);
+            j = ((i * 3) + 5) % 16;
+            break;
+        default:
+            E = I(BB, CC, DD);
+            j = (i * 7) % 16;
+            break;
+        }
+
+        uint32_t temp = DD;
+        DD = CC;
+        CC = BB;
+        BB = BB + rotateLeft(AA + E + K[i] + input[j], S[i]);
+        AA = temp;
+    }
+
+    buffer[0] += AA; // fold this block's result into the running state
+    buffer[1] += BB;
+    buffer[2] += CC;
+    buffer[3] += DD;
+}
+
+/*
+ * Hash the NUL-terminated string `input`; `result` must hold 16 bytes.
+ */
+void md5String(char *input, uint8_t *result) {
+    const size_t input_len = strlen(input);
+    MD5Context ctx;
+
+    md5Init(&ctx);
+    md5Update(&ctx, (uint8_t *)input, input_len);
+    md5Finalize(&ctx);
+    memcpy(result, ctx.digest, 16);
+}
+
+void md5File(FILE *file, uint8_t *result) {
+    // Fixed-size stack buffer: no allocation that could fail or leak
+    uint8_t chunk[1024];
+    size_t chunk_len;
+
+    MD5Context ctx;
+    md5Init(&ctx);
+
+    // Hash `file` from its current position until fread() yields no more data
+    while ((chunk_len = fread(chunk, 1, sizeof(chunk), file)) > 0) {
+        md5Update(&ctx, chunk, chunk_len);
+    }
+
+    md5Finalize(&ctx);
+
+    memcpy(result, ctx.digest, 16);
+}
diff --git a/lib/aoc/parser.c b/lib/aoc/parser.c
new file mode 100644
index 0000000..062cd58
--- /dev/null
+++ b/lib/aoc/parser.c
@@ -0,0 +1,37 @@
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "aoc/parser.h"
+
+// Advances *pos past any run of ' ' characters; stops at the first other byte.
+void aoc_parse_skip_ws(const char *buf, size_t *pos) {
+    size_t i = *pos; // size_t: an int would truncate large offsets
+    while (buf[i] == ' ') i++;
+    *pos = i;
+}
+
+// Parses a decimal number at *pos (leading spaces are skipped) and advances
+// *pos past it. Returns -1 if no digit is found or the value exceeds INT_MAX.
+int aoc_parse_nonnegative(const char *buf, size_t *pos) {
+    aoc_parse_skip_ws(buf, pos);
+    int result = 0;
+    size_t i = *pos;
+    for (; buf[i] >= '0' && buf[i] <= '9'; i++) {
+        const int digit = buf[i] - '0';
+        if (result > (INT_MAX - digit) / 10) return -1; // would overflow int
+        result = (result * 10) + digit;
+    }
+    if (i == *pos) return -1; // error, nothing parsed
+    *pos = i;
+    return result;
+}
+
+// Advances *pos to the next `needle`; asserts if the string ends before one.
+void aoc_parse_seek(const char *buf, size_t *pos, char needle) {
+    // Stop at the NUL terminator so a missing needle cannot overrun `buf`
+    while (buf[*pos] != needle && buf[*pos] != '\0') (*pos)++;
+    assert(buf[*pos] == needle);
+}
diff --git a/lib/aoc/point.c b/lib/aoc/point.c
new file mode 100644
index 0000000..e59dd89
--- /dev/null
+++ b/lib/aoc/point.c
@@ -0,0 +1,4 @@
+#include "aoc/point.h"
+/* Presumably the external definitions for point.h's inline functions (confirm). */
+extern int Point2D_equal(Point2D *lhs, Point2D *rhs);
+extern size_t Point2D_hash(Point2D *p);
diff --git a/lib/aoc/stb_sprintf.c b/lib/aoc/stb_sprintf.c
new file mode 100644
index 0000000..5385da3
--- /dev/null
+++ b/lib/aoc/stb_sprintf.c
@@ -0,0 +1,2 @@
+#define STB_SPRINTF_IMPLEMENTATION
+#include "aoc/stb_sprintf.h"
diff --git a/lib/aoc/string.c b/lib/aoc/string.c
new file mode 100644
index 0000000..5bdd03e
--- /dev/null
+++ b/lib/aoc/string.c
@@ -0,0 +1,11 @@
+#include "aoc/string.h"
+
+// Reverses the characters of the inclusive range [begin, end] in place.
+// Does nothing when end <= begin (e.g. an empty or one-character range).
+void strreverse(char *begin, char *end) {
+    while (begin < end) {
+        const char tmp = *begin;
+        *begin++ = *end;
+        *end-- = tmp;
+    }
+}
diff --git a/meson.build b/meson.build
new file mode 100644
index 0000000..1524018
--- /dev/null
+++ b/meson.build
@@ -0,0 +1,48 @@
+project('aoc-2023', 'c', version: '0.1', default_options: ['c_std=gnu99', 'warning_level=3', 'b_ndebug=if-release'])
+
+add_project_arguments(['-march=native'], language : 'c')
+
+if get_option('have-inputs')
+  add_project_arguments('-DHAVE_INPUTS', language : 'c')
+endif
+
+c = meson.get_compiler('c')
+
+inc_dirs = ['include', 'vendor/ctl', 'vendor/sort']
+
+aoc_lib = static_library(
+    'aoc_lib',
+    'lib/aoc/io.c',
+    'lib/aoc/log.c',
+    'lib/aoc/math.c',
+    'lib/aoc/md5.c',
+    'lib/aoc/parser.c',
+    'lib/aoc/point.c',
+    'lib/aoc/stb_sprintf.c',
+    'lib/aoc/string.c',
+    include_directories: include_directories(inc_dirs)
+)
+# Solver sources per day; `just generate` inserts new entries above the marker.
+days = {
+  'day01': [ 'src/day01/solve.c' ],
+  'day02': [ 'src/day02/solve.c' ],
+  'day03': [ 'src/day03/solve.c' ],
+  'day04': [ 'src/day04/solve.c', 'src/day04/sort.c' ],
+  # XXX: marker
+}
+
+foreach day, sources : days
+  executable(day,
+    [ 'src/main.c' ] + sources,
+    c_args: [f'-DDAY="@day@"'],
+    link_with: aoc_lib,
+    include_directories: include_directories(inc_dirs))
+
+  test(day,
+    executable(f'@day@_test',
+      [ f'src/@day@/solve_test.c'] + sources,
+      c_args: [f'-DDAY="@day@"'],
+      link_with: aoc_lib,
+      include_directories: include_directories(inc_dirs + ['vendor/ctest']))
+  )
+endforeach
diff --git a/meson.options b/meson.options
new file mode 100644
index 0000000..9815933
--- /dev/null
+++ b/meson.options
@@ -0,0 +1 @@
+option('have-inputs', type : 'boolean', description : 'Enable tests using the real input files', value : false)
diff --git a/meson_options.txt b/meson_options.txt
new file mode 120000
index 0000000..7b28df2
--- /dev/null
+++ b/meson_options.txt
@@ -0,0 +1 @@
+meson.options
\ No newline at end of file
diff --git a/puzzle/day01.md b/puzzle/day01.md
new file mode 100644
index 0000000..652d624
--- /dev/null
+++ b/puzzle/day01.md
@@ -0,0 +1,57 @@
+
+--- Day 1: Trebuchet?! ---
+
+Something is wrong with global snow production, and you've been selected to take a look. The Elves have even given you a map; on it, they've used stars to mark the top fifty locations that are likely to be having problems.
+
+You've been doing this long enough to know that to restore snow operations, you need to check all fifty stars by December 25th.
+
+Collect stars by solving puzzles. Two puzzles will be made available on each day in the Advent calendar; the second puzzle is unlocked when you complete the first. Each puzzle grants one star. Good luck!
+
+You try to ask why they can't just use a weather machine ("not powerful enough") and where they're even sending you ("the sky") and why your map looks mostly blank ("you sure ask a lot of questions") and hang on did you just say the sky ("of course, where do you think snow comes from") when you realize that the Elves are already loading you into a trebuchet
+("please hold still, we need to strap you in").
+
+As they're making the final adjustments, they discover that their calibration document (your puzzle input) has been amended by a very young Elf who was apparently just excited to show off her art skills. Consequently, the Elves are having trouble reading the values on the document.
+
+The newly-improved calibration document consists of lines of text; each line originally contained a specific calibration value that the Elves now need to recover. On each line, the calibration value can be found by combining the first digit and the last digit (in that order) to form a single two-digit number.
+
+For example:
+
+1abc2
+pqr3stu8vwx
+a1b2c3d4e5f
+treb7uchet
+
+In this example, the calibration values of these four lines are 12, 38, 15, and 77. Adding these together produces 142.
+
+Consider your entire calibration document. What is the sum of all of the calibration values?
+
+Your puzzle answer was 54081.
+
+--- Part Two ---
+
+Your calculation isn't quite right. It looks like some of the digits are actually spelled out with letters: one, two, three, four, five, six, seven, eight, and nine also count as valid "digits".
+
+Equipped with this new information, you now need to find the real first and last digit on each line. For example:
+
+two1nine
+eightwothree
+abcone2threexyz
+xtwone3four
+4nineeightseven2
+zoneight234
+7pqrstsixteen
+
+In this example, the calibration values are 29, 83, 13, 24, 42, 14, and 76. Adding these together produces 281.
+
+What is the sum of all of the calibration values?
+
+Your puzzle answer was 54649.
+
+Both parts of this puzzle are complete! They provide two gold stars: **
+
+At this point, you should return to your Advent calendar and try another puzzle.
+
+If you still want to see it, you can get your puzzle input.
+
+You can also [Share on Twitter Mastodon] this puzzle.
+
diff --git a/puzzle/day02.md b/puzzle/day02.md
new file mode 100644
index 0000000..6a7f555
--- /dev/null
+++ b/puzzle/day02.md
@@ -0,0 +1,67 @@
+
+--- Day 2: Cube Conundrum ---
+
+You're launched high into the atmosphere! The apex of your trajectory just barely reaches the surface of a large island floating in the sky. You gently land in a fluffy pile of leaves. It's quite cold, but you don't see much snow. An Elf runs over to greet you.
+
+The Elf explains that you've arrived at Snow Island and apologizes for the lack of snow. He'll be happy to explain the situation, but it's a bit of a walk, so you have some time. They don't get many visitors up here; would you like to play a game in the meantime?
+
+As you walk, the Elf shows you a small bag and some cubes which are either red, green, or blue. Each time you play this game, he will hide a secret number of cubes of each color in the bag, and your goal is to figure out information about the number of cubes.
+
+To get information, once a bag has been loaded with cubes, the Elf will reach into the bag, grab a handful of random cubes, show them to you, and then put them back in the bag. He'll do this a few times per game.
+
+You play several games and record the information from each game (your puzzle input). Each game is listed with its ID number (like the 11 in Game 11: ...) followed by a semicolon-separated list of subsets of cubes that were revealed from the bag (like 3 red, 5 green, 4
+blue).
+
+For example, the record of a few games might look like this:
+
+Game 1: 3 blue, 4 red; 1 red, 2 green, 6 blue; 2 green
+Game 2: 1 blue, 2 green; 3 green, 4 blue, 1 red; 1 green, 1 blue
+Game 3: 8 green, 6 blue, 20 red; 5 blue, 4 red, 13 green; 5 green, 1 red
+Game 4: 1 green, 3 red, 6 blue; 3 green, 6 red; 3 green, 15 blue, 14 red
+Game 5: 6 red, 1 blue, 3 green; 2 blue, 1 red, 2 green
+
+In game 1, three sets of cubes are revealed from the bag (and then put back again). The first set is 3 blue cubes and 4 red cubes; the second set is 1 red cube, 2 green cubes, and 6 blue cubes; the third set is only 2 green cubes.
+
+The Elf would first like to know which games would have been possible if the bag contained only 12 red cubes, 13 green cubes, and 14 blue cubes?
+
+In the example above, games 1, 2, and 5 would have been possible if the bag had been loaded with that configuration. However, game 3 would have been impossible because at one point the Elf showed you 20 red cubes at once; similarly, game 4 would also have been impossible
+because the Elf showed you 15 blue cubes at once. If you add up the IDs of the games that would have been possible, you get 8.
+
+Determine which games would have been possible if the bag had been loaded with only 12 red cubes, 13 green cubes, and 14 blue cubes. What is the sum of the IDs of those games?
+
+Your puzzle answer was 2716.
+
+--- Part Two ---
+
+The Elf says they've stopped producing snow because they aren't getting any water! He isn't sure why the water stopped; however, he can show you how to get to the water source to check it out for yourself. It's just up ahead!
+
+As you continue your walk, the Elf poses a second question: in each game you played, what is the fewest number of cubes of each color that could have been in the bag to make the game possible?
+
+Again consider the example games from earlier:
+
+Game 1: 3 blue, 4 red; 1 red, 2 green, 6 blue; 2 green
+Game 2: 1 blue, 2 green; 3 green, 4 blue, 1 red; 1 green, 1 blue
+Game 3: 8 green, 6 blue, 20 red; 5 blue, 4 red, 13 green; 5 green, 1 red
+Game 4: 1 green, 3 red, 6 blue; 3 green, 6 red; 3 green, 15 blue, 14 red
+Game 5: 6 red, 1 blue, 3 green; 2 blue, 1 red, 2 green
+
+In game 1, the game could have been played with as few as 4 red, 2 green, and 6 blue cubes. If any color had even one fewer cube, the game would have been impossible.
+Game 2 could have been played with a minimum of 1 red, 3 green, and 4 blue cubes.
+Game 3 must have been played with at least 20 red, 13 green, and 6 blue cubes.
+Game 4 required at least 14 red, 3 green, and 15 blue cubes.
+Game 5 needed no fewer than 6 red, 3 green, and 2 blue cubes in the bag.
+
+The power of a set of cubes is equal to the numbers of red, green, and blue cubes multiplied together. The power of the minimum set of cubes in game 1 is 48. In games 2-5 it was 12, 1560, 630, and 36, respectively. Adding up these five powers produces the sum 2286.
+
+For each game, find the minimum set of cubes that must have been present. What is the sum of the power of these sets?
+
+Your puzzle answer was 72227.
+
+Both parts of this puzzle are complete! They provide two gold stars: **
+
+At this point, you should return to your Advent calendar and try another puzzle.
+
+If you still want to see it, you can get your puzzle input.
+
+You can also [Share on Twitter Mastodon] this puzzle.
+
diff --git a/puzzle/day03.md b/puzzle/day03.md
new file mode 100644
index 0000000..53a2f9c
--- /dev/null
+++ b/puzzle/day03.md
@@ -0,0 +1,76 @@
+
+--- Day 3: Gear Ratios ---
+
+You and the Elf eventually reach a gondola lift station; he says the gondola lift will take you up to the water source, but this is as far as he can bring you. You go inside.
+
+It doesn't take long to find the gondolas, but there seems to be a problem: they're not moving.
+
+"Aaah!"
+
+You turn around to see a slightly-greasy Elf with a wrench and a look of surprise. "Sorry, I wasn't expecting anyone! The gondola lift isn't working right now; it'll still be a while before I can fix it." You offer to help.
+
+The engineer explains that an engine part seems to be missing from the engine, but nobody can figure out which one. If you can add up all the part numbers in the engine schematic, it should be easy to work out which part is missing.
+
+The engine schematic (your puzzle input) consists of a visual representation of the engine. There are lots of numbers and symbols you don't really understand, but apparently any number adjacent to a symbol, even diagonally, is a "part
+number" and should be included in your sum. (Periods (.) do not count as a symbol.)
+
+Here is an example engine schematic:
+
+467..114..
+...*......
+..35..633.
+......#...
+617*......
+.....+.58.
+..592.....
+......755.
+...$.*....
+.664.598..
+
+In this schematic, two numbers are not part numbers because they are not adjacent to a symbol: 114 (top right) and 58 (middle right). Every other number is adjacent to a symbol and so is a part number; their sum is 4361.
+
+Of course, the actual engine schematic is much larger. What is the sum of all of the part numbers in the engine schematic?
+
+Your puzzle answer was 520135.
+
+--- Part Two ---
+
+The engineer finds the missing part and installs it in the engine! As the engine springs to life, you jump in the closest gondola, finally ready to ascend to the water source.
+
+You don't seem to be going very fast, though. Maybe something is still wrong? Fortunately, the gondola has a phone labeled "help", so you pick it up and the engineer answers.
+
+Before you can explain the situation, she suggests that you look out the window. There stands the engineer, holding a phone in one hand and waving with the other. You're going so slowly that you haven't even left the station. You exit the
+gondola.
+
+The missing part wasn't the only issue - one of the gears in the engine is wrong. A gear is any * symbol that is adjacent to exactly two part numbers. Its gear ratio is the result of multiplying those two numbers together.
+
+This time, you need to find the gear ratio of every gear and add them all up so that the engineer can figure out which gear needs to be replaced.
+
+Consider the same engine schematic again:
+
+467..114..
+...*......
+..35..633.
+......#...
+617*......
+.....+.58.
+..592.....
+......755.
+...$.*....
+.664.598..
+
+In this schematic, there are two gears. The first is in the top left; it has part numbers 467 and 35, so its gear ratio is 16345. The second gear is in the lower right; its gear ratio is 451490. (The * adjacent to 617 is not a gear because
+it is only adjacent to one part number.) Adding up all of the gear ratios produces 467835.
+
+What is the sum of all of the gear ratios in your engine schematic?
+
+Your puzzle answer was 72514855.
+
+Both parts of this puzzle are complete! They provide two gold stars: **
+
+At this point, you should return to your Advent calendar and try another puzzle.
+
+If you still want to see it, you can get your puzzle input.
+
+You can also [Share on Twitter Mastodon] this puzzle.
+
diff --git a/puzzle/day04.md b/puzzle/day04.md
new file mode 100644
index 0000000..0635a9c
--- /dev/null
+++ b/puzzle/day04.md
@@ -0,0 +1,87 @@
+
+--- Day 4: Scratchcards ---
+
+The gondola takes you up. Strangely, though, the ground doesn't seem to be coming with you; you're not climbing a mountain. As the circle of Snow Island recedes below you, an entire new landmass suddenly appears above you! The gondola carries you to the surface of the new
+island and lurches into the station.
+
+As you exit the gondola, the first thing you notice is that the air here is much warmer than it was on Snow Island. It's also quite humid. Is this where the water source is?
+
+The next thing you notice is an Elf sitting on the floor across the station in what seems to be a pile of colorful square cards.
+
+"Oh! Hello!" The Elf excitedly runs over to you. "How may I be of service?" You ask about water sources.
+
+"I'm not sure; I just operate the gondola lift. That does sound like something we'd have, though - this is Island Island, after all! I bet the gardener would know. He's on a different island, though - er, the small kind surrounded by water, not the floating kind. We really
+need to come up with a better naming scheme. Tell you what: if you can help me with something quick, I'll let you borrow my boat and you can go visit the gardener. I got all these scratchcards as a gift, but I can't figure out what I've won."
+
+The Elf leads you over to the pile of colorful cards. There, you discover dozens of scratchcards, all with their opaque covering already scratched off. Picking one up, it looks like each card has two lists of numbers separated by a vertical bar (|): a list of winning
+numbers and then a list of numbers you have. You organize the information into a table (your puzzle input).
+
+As far as the Elf has been able to figure out, you have to figure out which of the numbers you have appear in the list of winning numbers. The first match makes the card worth one point and each match after the first doubles the point value of that card.
+
+For example:
+
+Card 1: 41 48 83 86 17 | 83 86  6 31 17  9 48 53
+Card 2: 13 32 20 16 61 | 61 30 68 82 17 32 24 19
+Card 3:  1 21 53 59 44 | 69 82 63 72 16 21 14  1
+Card 4: 41 92 73 84 69 | 59 84 76 51 58  5 54 83
+Card 5: 87 83 26 28 32 | 88 30 70 12 93 22 82 36
+Card 6: 31 18 13 56 72 | 74 77 10 23 35 67 36 11
+
+In the above example, card 1 has five winning numbers (41, 48, 83, 86, and 17) and eight numbers you have (83, 86, 6, 31, 17, 9, 48, and 53). Of the numbers you have, four of them (48, 83, 17, and 86) are winning numbers! That means card 1 is worth 8 points (1 for the first
+match, then doubled three times for each of the three matches after the first).
+
+Card 2 has two winning numbers (32 and 61), so it is worth 2 points.
+Card 3 has two winning numbers (1 and 21), so it is worth 2 points.
+Card 4 has one winning number (84), so it is worth 1 point.
+Card 5 has no winning numbers, so it is worth no points.
+Card 6 has no winning numbers, so it is worth no points.
+
+So, in this example, the Elf's pile of scratchcards is worth 13 points.
+
+Take a seat in the large pile of colorful cards. How many points are they worth in total?
+
+Your puzzle answer was 23847.
+
+--- Part Two ---
+
+Just as you're about to report your findings to the Elf, one of you realizes that the rules have actually been printed on the back of every card this whole time.
+
+There's no such thing as "points". Instead, scratchcards only cause you to win more scratchcards equal to the number of winning numbers you have.
+
+Specifically, you win copies of the scratchcards below the winning card equal to the number of matches. So, if card 10 were to have 5 matching numbers, you would win one copy each of cards 11, 12, 13, 14, and 15.
+
+Copies of scratchcards are scored like normal scratchcards and have the same card number as the card they copied. So, if you win a copy of card 10 and it has 5 matching numbers, it would then win a copy of the same cards that the original card 10 won: cards 11, 12, 13, 14,
+and 15. This process repeats until none of the copies cause you to win any more cards. (Cards will never make you copy a card past the end of the table.)
+
+This time, the above example goes differently:
+
+Card 1: 41 48 83 86 17 | 83 86  6 31 17  9 48 53
+Card 2: 13 32 20 16 61 | 61 30 68 82 17 32 24 19
+Card 3:  1 21 53 59 44 | 69 82 63 72 16 21 14  1
+Card 4: 41 92 73 84 69 | 59 84 76 51 58  5 54 83
+Card 5: 87 83 26 28 32 | 88 30 70 12 93 22 82 36
+Card 6: 31 18 13 56 72 | 74 77 10 23 35 67 36 11
+
+Card 1 has four matching numbers, so you win one copy each of the next four cards: cards 2, 3, 4, and 5.
+Your original card 2 has two matching numbers, so you win one copy each of cards 3 and 4.
+Your copy of card 2 also wins one copy each of cards 3 and 4.
+Your four instances of card 3 (one original and three copies) have two matching numbers, so you win four copies each of cards 4 and 5.
+Your eight instances of card 4 (one original and seven copies) have one matching number, so you win eight copies of card 5.
+Your fourteen instances of card 5 (one original and thirteen copies) have no matching numbers and win no more cards.
+Your one instance of card 6 (one original) has no matching numbers and wins no more cards.
+
+Once all of the originals and copies have been processed, you end up with 1 instance of card 1, 2 instances of card 2, 4 instances of card 3, 8 instances of card 4, 14 instances of card 5, and 1 instance of card 6. In total, this example pile of scratchcards causes you to
+ultimately have 30 scratchcards!
+
+Process all of the original and copied scratchcards until no more scratchcards are won. Including the original set of scratchcards, how many total scratchcards do you end up with?
+
+Your puzzle answer was 8570000.
+
+Both parts of this puzzle are complete! They provide two gold stars: **
+
+At this point, you should return to your Advent calendar and try another puzzle.
+
+If you still want to see it, you can get your puzzle input.
+
+You can also [Share on Twitter Mastodon] this puzzle.
+
diff --git a/src/day01/solve.c b/src/day01/solve.c
new file mode 100644
index 0000000..2a447ce
--- /dev/null
+++ b/src/day01/solve.c
@@ -0,0 +1,86 @@
+/*
+ * Author: Michael Adler
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "aoc/all.h"
+#include "solve.h"
+
+// Spelled-out digits for part 2; the word at index i stands for digit i + 1.
+static const char *const DIGIT_WORDS[] = {"one",   "two",   "three",
+                                          "four",  "five",  "six",
+                                          "seven", "eight", "nine"};
+
+// Returns the digit (1-9) spelled out at the start of `s`, or 0 if there is
+// none. `len` is the number of readable bytes in `s` (no terminator needed).
+static int spelled_digit(const char *s, size_t len) {
+    const size_t words = sizeof(DIGIT_WORDS) / sizeof(DIGIT_WORDS[0]);
+    for (size_t i = 0; i < words; i++) {
+        size_t word_len = strlen(DIGIT_WORDS[i]);
+        if (word_len <= len && memcmp(s, DIGIT_WORDS[i], word_len) == 0) {
+            return (int)i + 1;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Day 1: adds up the calibration value (first digit * 10 + last digit) of
+ * every line in `buf`. Part 1 only looks at the characters '0'-'9'; part 2
+ * also accepts the spelled-out words "one".."nine", which may overlap
+ * ("twone" yields 2 and 1). Both sums are written to `result` as decimal
+ * strings.
+ *
+ * Only the first and the most recent digit of a line are kept, so a line may
+ * hold any number of digits. A line without any digit adds nothing, and a
+ * final line that lacks its trailing newline is still counted. `buf` does not
+ * have to be NUL-terminated; exactly `buf_size` bytes are inspected.
+ */
+void solve(const char *buf, size_t buf_size, Solution *result) {
+    int part1 = 0, part2 = 0;
+    // first/last digit of the current line per part; -1 means "none seen yet"
+    int first1 = -1, last1 = -1, first2 = -1, last2 = -1;
+    for (size_t pos = 0; pos <= buf_size; pos++) {
+        // the end of the input terminates the current line like '\n' does
+        int value = pos < buf_size ? buf[pos] : '\n';
+        if (value >= '0' && value <= '9') {
+            // a digit character counts for both parts
+            int digit = value - '0';
+            if (first1 < 0) first1 = digit;
+            if (first2 < 0) first2 = digit;
+            last1 = last2 = digit;
+        } else if (value >= 'a' && value <= 'z') {
+            // words may overlap, so pos still advances by a single byte
+            int digit = spelled_digit(&buf[pos], buf_size - pos);
+            if (digit > 0) {
+                if (first2 < 0) first2 = digit;
+                last2 = digit;
+            }
+        } else if (value == '\n') {
+            // end of line: add what was found, if anything
+            if (first1 >= 0) part1 += first1 * 10 + last1;
+            if (first2 >= 0) part2 += first2 * 10 + last2;
+            // reset state
+            first1 = last1 = first2 = last2 = -1;
+        }
+    }
+    stbsp_snprintf(result->part1, sizeof(result->part1), "%d", part1);
+    stbsp_snprintf(result->part2, sizeof(result->part2), "%d", part2);
+}
+
+// Reads the puzzle input stored in `fname` into a 32 KiB stack buffer and
+// hands it to solve(). Returns 0 on success; if read_input() reports no data
+// (a result <= 0) an error is printed to stderr and -1 is returned.
+int solve_input(const char *fname, Solution *result) {
+    char input[1 << 15];
+    const int len = read_input(fname, input, sizeof(input));
+    if (len > 0) solve(input, len, result);
+    else fprintf(stderr, "Failed to read %s\n", fname);
+    return len > 0 ? 0 : -1;
+}
diff --git a/src/day01/solve_test.c b/src/day01/solve_test.c
new file mode 100644
index 0000000..36f95f2
--- /dev/null
+++ b/src/day01/solve_test.c
@@ -0,0 +1,45 @@
+/*
+ * Author: Michael Adler
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#define CTEST_MAIN
+
+#include "ctest.h"
+#include "solve.h"
+
+CTEST(day01, example1) { // digits only: both parts must report 142
+    const char *buf = "1abc2\n\
+pqr3stu8vwx\n\
+a1b2c3d4e5f\n\
+treb7uchet\n";
+    Solution solution;
+    solve(buf, strlen(buf), &solution);
+    ASSERT_STR("142", solution.part1);
+    ASSERT_STR("142", solution.part2);
+}
+
+CTEST(day01, example2) { // spelled-out digits: only part 2 is asserted
+    const char *buf = "two1nine\n\
+eightwothree\n\
+abcone2threexyz\n\
+xtwone3four\n\
+4nineeightseven2\n\
+zoneight234\n\
+7pqrstsixteen\n";
+    Solution solution;
+    solve(buf, strlen(buf), &solution);
+    ASSERT_STR("281", solution.part2);
+}
+
+#ifdef HAVE_INPUTS // compiled only when HAVE_INPUTS is defined
+CTEST(day01, real) { // solves input/<DAY>.txt and pins both answers
+    Solution solution;
+    solve_input("input/" DAY ".txt", &solution); // return value is not checked
+    ASSERT_STR("54081", solution.part1);
+    ASSERT_STR("54649", solution.part2);
+}
+#endif
+
+int main(int argc, const char *argv[]) { return ctest_main(argc, argv); }
diff --git a/src/day02/solve.c b/src/day02/solve.c
new file mode 100644
index 0000000..b7447c6
--- /dev/null
+++ b/src/day02/solve.c
@@ -0,0 +1,69 @@
+/*
+ * Author: Michael Adler
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "aoc/all.h"
+#include "solve.h"
+
+// Day 2: for every "Game N: ..." line, part 1 adds up the ids of the games in
+// which no draw exceeds 12 red, 13 green or 14 blue cubes; part 2 adds up the
+// product of the largest red, green and blue amounts of each game. Ids follow
+// the order of the lines (first game = 1); they are not parsed from the text.
+void solve(const char *buf, size_t buf_size, Solution *result) {
+    int part1 = 0, part2 = 0;
+    size_t pos = 0;
+    for (int game_id = 1; pos < buf_size; game_id++) {
+        while (pos < buf_size && buf[pos] != ':') pos++;
+        if (pos >= buf_size) break; // no further "Game N:" header
+        pos++;                      // step over ':'
+        log_debug("Game %d", game_id);
+        bool game_ok = true;
+        long red_min = 0, green_min = 0, blue_min = 0;
+        while (pos < buf_size && buf[pos] != '\n') {
+            aoc_parse_skip_ws(buf, &pos);
+            int amount = aoc_parse_nonnegative(buf, &pos);
+            aoc_parse_skip_ws(buf, &pos);
+            if (pos >= buf_size) break;
+            if (buf[pos] == 'r') {
+                log_debug("red %d", amount);
+                red_min = MAX(red_min, amount);
+                if (amount > 12) game_ok = false;
+            } else if (buf[pos] == 'g') {
+                log_debug("green %d", amount);
+                green_min = MAX(green_min, amount);
+                if (amount > 13) game_ok = false;
+            } else if (buf[pos] == 'b') {
+                log_debug("blue %d", amount);
+                blue_min = MAX(blue_min, amount);
+                if (amount > 14) game_ok = false;
+            }
+            // skip the rest of the color word, then the ',' or ';' after it
+            while (pos < buf_size && buf[pos] >= 'a' && buf[pos] <= 'z') pos++;
+            if (pos < buf_size && buf[pos] == ';') log_debug("next set");
+            if (pos < buf_size && buf[pos] != '\n') pos++;
+        }
+        if (game_ok) part1 += game_id;
+        log_debug("min rgb: %ld %ld %ld", red_min, green_min, blue_min);
+        part2 += red_min * green_min * blue_min;
+        pos++; // newline
+    }
+    stbsp_snprintf(result->part1, sizeof(result->part1), "%d", part1);
+    stbsp_snprintf(result->part2, sizeof(result->part2), "%d", part2);
+}
+
+// Reads the puzzle input stored in `fname` into a 16 KiB stack buffer and
+// hands it to solve(). Returns 0 on success; if read_input() reports no data
+// (a result <= 0) an error is printed to stderr and -1 is returned.
+int solve_input(const char *fname, Solution *result) {
+    char input[1 << 14];
+    const int len = read_input(fname, input, sizeof(input));
+    if (len > 0) solve(input, len, result);
+    else fprintf(stderr, "Failed to read %s\n", fname);
+    return len > 0 ? 0 : -1;
+}
diff --git a/src/day02/solve_test.c b/src/day02/solve_test.c
new file mode 100644
index 0000000..fef36a6
--- /dev/null
+++ b/src/day02/solve_test.c
@@ -0,0 +1,33 @@
+/*
+ * Author: Michael Adler
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#define CTEST_MAIN
+
+#include "ctest.h"
+#include "solve.h"
+
+CTEST(day02, example) { // five sample games: expects 8 and 2286
+    const char *buf = "Game 1: 3 blue, 4 red; 1 red, 2 green, 6 blue; 2 green\n\
+Game 2: 1 blue, 2 green; 3 green, 4 blue, 1 red; 1 green, 1 blue\n\
+Game 3: 8 green, 6 blue, 20 red; 5 blue, 4 red, 13 green; 5 green, 1 red\n\
+Game 4: 1 green, 3 red, 6 blue; 3 green, 6 red; 3 green, 15 blue, 14 red\n\
+Game 5: 6 red, 1 blue, 3 green; 2 blue, 1 red, 2 green\n";
+    Solution solution;
+    solve(buf, strlen(buf), &solution);
+    ASSERT_STR("8", solution.part1);
+    ASSERT_STR("2286", solution.part2);
+}
+
+#ifdef HAVE_INPUTS // compiled only when HAVE_INPUTS is defined
+CTEST(day02, real) { // solves input/<DAY>.txt and pins both answers
+    Solution solution;
+    solve_input("input/" DAY ".txt", &solution); // return value is not checked
+    ASSERT_STR("2716", solution.part1);
+    ASSERT_STR("72227", solution.part2);
+}
+#endif
+
+int main(int argc, const char *argv[]) { return ctest_main(argc, argv); }
diff --git a/src/day03/solve.c b/src/day03/solve.c
new file mode 100644
index 0000000..0d58c9f
--- /dev/null
+++ b/src/day03/solve.c
@@ -0,0 +1,107 @@
+/*
+ * Author: Michael Adler
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "solve.h"
+#include "aoc/all.h"
+
+#define MAX_NUMBERS 150  // side length of the square digit grid in solve()
+#define MAX_SYMBOLS 4096 // capacity of the symbol list in solve()
+#define EMPTY -1         // grid cell that holds no (or an already used) digit
+
+typedef struct {
+    int x;      // column of the symbol within its line
+    int y;      // line index, counted from 0
+    char value; // the symbol character itself
+} Symbol;
+
+static inline bool is_symbol(char s) { // anything but a digit or '.'
+    return s != '.' && (s < '0' || s > '9');
+}
+
+// Reads the whole number that has a digit at (candidate_x, candidate_y) and
+// returns it. Every digit cell of that number is overwritten with EMPTY, so a
+// number touching several neighbours of a symbol is only reported once.
+// `cols` is the row width; the cell passed in is expected to hold a digit.
+static inline int collect(int8_t numbers[MAX_NUMBERS][MAX_NUMBERS], int cols,
+                          int candidate_x, int candidate_y) {
+    int8_t *row = numbers[candidate_y];
+    int start = candidate_x;
+    while (start > 0 && row[start - 1] != EMPTY) start--;
+    // consume the leading digit unconditionally, then the digits after it
+    int total = row[start];
+    row[start] = EMPTY;
+    for (int x = start + 1; x < cols && row[x] != EMPTY; x++) {
+        total = total * 10 + row[x];
+        row[x] = EMPTY;
+    }
+    return total;
+}
+
+// Day 3: part 1 sums every number that touches a symbol (any character other
+// than a digit or '.'), diagonals included; part 2 sums the product of the two
+// numbers around each '*' that touches exactly two numbers.
+//
+// Lines may differ in length and the last one may lack its newline. Cells
+// outside the MAX_NUMBERS x MAX_NUMBERS grid and symbols beyond MAX_SYMBOLS are
+// ignored rather than stored out of bounds. collect() erases what it returns,
+// so a number is credited to the first symbol (in input order) touching it.
+void solve(const char *buf, size_t buf_size, Solution *result) {
+    int8_t numbers[MAX_NUMBERS][MAX_NUMBERS];
+    memset(numbers, EMPTY, sizeof(numbers));
+    Symbol symbols[MAX_SYMBOLS];
+    size_t symbol_count = 0;
+    // x/y is the grid cell of buf[pos]; cols/rows grow to the extent in use
+    int cols = 0, rows = 0, x = 0, y = 0;
+    for (size_t pos = 0; pos < buf_size; pos++) {
+        const char c = buf[pos];
+        if (c == '\n') {
+            y++, x = 0;
+            continue;
+        }
+        if (x < MAX_NUMBERS && y < MAX_NUMBERS) {
+            if (c >= '0' && c <= '9') {
+                numbers[y][x] = c - '0';
+            } else if (is_symbol(c) && symbol_count < MAX_SYMBOLS) {
+                symbols[symbol_count++] = (Symbol){.x = x, .y = y, .value = c};
+            }
+            if (x >= cols) cols = x + 1;
+            rows = y + 1;
+        }
+        x++;
+    }
+
+    const Point2D deltas[] = {{-1, -1}, {0, -1}, {1, -1}, {-1, 0},
+                              {1, 0},   {-1, 1}, {0, 1},  {1, 1}};
+    int part1 = 0, part2 = 0;
+    for (size_t s = 0; s < symbol_count; s++) {
+        int values[8]; // one slot per neighbouring cell
+        int value_count = 0;
+        for (size_t d = 0; d < ARRAY_LENGTH(deltas); d++) {
+            int nx = symbols[s].x + deltas[d].x, ny = symbols[s].y + deltas[d].y;
+            if (nx >= 0 && ny >= 0 && nx < cols && ny < rows &&
+                numbers[ny][nx] != EMPTY) {
+                values[value_count++] = collect(numbers, cols, nx, ny);
+            }
+        }
+        for (int v = 0; v < value_count; v++) part1 += values[v];
+        if (symbols[s].value == '*' && value_count == 2) {
+            part2 += values[0] * values[1];
+        }
+    }
+    stbsp_snprintf(result->part1, sizeof(result->part1), "%d", part1);
+    stbsp_snprintf(result->part2, sizeof(result->part2), "%d", part2);
+}
+
+// Reads the puzzle input stored in `fname` into a 32 KiB stack buffer and
+// hands it to solve(). Returns 0 on success; if read_input() reports no data
+// (a result <= 0) an error is printed to stderr and -1 is returned.
+int solve_input(const char *fname, Solution *result) {
+    char input[1 << 15];
+    const int len = read_input(fname, input, sizeof(input));
+    if (len > 0) solve(input, len, result);
+    else fprintf(stderr, "Failed to read %s\n", fname);
+    return len > 0 ? 0 : -1;
+}
diff --git a/src/day03/solve_test.c b/src/day03/solve_test.c
new file mode 100644
index 0000000..f390d00
--- /dev/null
+++ b/src/day03/solve_test.c
@@ -0,0 +1,38 @@
+/*
+ * Author: Michael Adler
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#define CTEST_MAIN
+
+#include "ctest.h"
+#include "solve.h"
+
+CTEST(day03, example) { // 10x10 sample schematic: expects 4361 and 467835
+    const char *buf = "467..114..\n\
+...*......\n\
+..35..633.\n\
+......#...\n\
+617*......\n\
+.....+.58.\n\
+..592.....\n\
+......755.\n\
+...$.*....\n\
+.664.598..\n";
+    Solution solution;
+    solve(buf, strlen(buf), &solution);
+    ASSERT_STR("4361", solution.part1);
+    ASSERT_STR("467835", solution.part2);
+}
+
+#ifdef HAVE_INPUTS // compiled only when HAVE_INPUTS is defined
+CTEST(day03, real) { // solves input/<DAY>.txt and pins both answers
+    Solution solution;
+    solve_input("input/" DAY ".txt", &solution); // return value is not checked
+    ASSERT_STR("520135", solution.part1);
+    ASSERT_STR("72514855", solution.part2);
+}
+#endif
+
+int main(int argc, const char *argv[]) { return ctest_main(argc, argv); }
diff --git a/src/day04/solve.c b/src/day04/solve.c
new file mode 100644
index 0000000..299098a
--- /dev/null
+++ b/src/day04/solve.c
@@ -0,0 +1,74 @@
+/*
+ * Author: Michael Adler
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "solve.h"
+#include "aoc/all.h"
+
+// forward decl (to speed up incremental compilation)
+void int_tim_sort(int *dst, const size_t size);
+
+// Day 4: part 1 scores every card 2^(matches - 1) points, where matches is the
+// number of own numbers that are also winning numbers; part 2 counts all cards
+// held once each card has won one copy of the `matches` cards following it.
+// Numbers or cards beyond the fixed capacities below are ignored rather than
+// written past the end of the local arrays.
+void solve(const char *buf, size_t buf_size, Solution *result) {
+    enum { MAX_CARDS = 256, MAX_WINNING = 16, MAX_MINE = 32 };
+    int part1 = 0, part2 = 0, tmp;
+    int copies[MAX_CARDS] = {0}; // copies[id] = instances of card id (1-based)
+    size_t pos = 0;
+
+    for (int card_id = 1; pos < buf_size && card_id < MAX_CARDS; card_id++) {
+        copies[card_id]++; // the original card
+        int wc[MAX_WINNING], mc[MAX_MINE], wc_count = 0, mc_count = 0;
+        aoc_parse_seek(buf, &pos, ':');
+        pos++;
+        while ((tmp = aoc_parse_nonnegative(buf, &pos)) >= 0) {
+            if (wc_count < MAX_WINNING) wc[wc_count++] = tmp;
+        }
+        aoc_parse_seek(buf, &pos, '|');
+        pos++;
+        while ((tmp = aoc_parse_nonnegative(buf, &pos)) >= 0) {
+            if (mc_count < MAX_MINE) mc[mc_count++] = tmp;
+        }
+
+        // with both lists sorted, one merge-style walk finds the common values
+        int_tim_sort(wc, wc_count);
+        int_tim_sort(mc, mc_count);
+        int match_count = 0;
+        for (int i = 0, j = 0; i < wc_count && j < mc_count;) {
+            if (wc[i] == mc[j]) {
+                match_count++, i++, j++;
+            } else if (wc[i] < mc[j]) {
+                i++;
+            } else {
+                j++;
+            }
+        }
+        if (match_count > 0) part1 += 1 << (match_count - 1);
+        // every instance of this card wins one copy of each following card
+        int last = card_id + match_count;
+        for (int c = card_id + 1; c <= last && c < MAX_CARDS; c++) {
+            copies[c] += copies[card_id];
+        }
+        pos++; // newline
+    }
+    for (size_t i = 0; i < ARRAY_LENGTH(copies); i++) { part2 += copies[i]; }
+
+    stbsp_snprintf(result->part1, sizeof(result->part1), "%d", part1);
+    stbsp_snprintf(result->part2, sizeof(result->part2), "%d", part2);
+}
+
+// Reads the puzzle input stored in `fname` into a 32 KiB stack buffer and
+// hands it to solve(). Returns 0 on success; if read_input() reports no data
+// (a result <= 0) an error is printed to stderr and -1 is returned.
+int solve_input(const char *fname, Solution *result) {
+    char input[1 << 15];
+    const int len = read_input(fname, input, sizeof(input));
+    if (len > 0) solve(input, len, result);
+    else fprintf(stderr, "Failed to read %s\n", fname);
+    return len > 0 ? 0 : -1;
+}
diff --git a/src/day04/solve_test.c b/src/day04/solve_test.c
new file mode 100644
index 0000000..c033a75
--- /dev/null
+++ b/src/day04/solve_test.c
@@ -0,0 +1,34 @@
+/*
+ * Author: Michael Adler
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#define CTEST_MAIN
+
+#include "ctest.h"
+#include "solve.h"
+
+CTEST(day04, example) { // six sample cards: expects 13 points and 30 cards
+    const char *buf = "Card 1: 41 48 83 86 17 | 83 86  6 31 17  9 48 53\n\
+Card 2: 13 32 20 16 61 | 61 30 68 82 17 32 24 19\n\
+Card 3:  1 21 53 59 44 | 69 82 63 72 16 21 14  1\n\
+Card 4: 41 92 73 84 69 | 59 84 76 51 58  5 54 83\n\
+Card 5: 87 83 26 28 32 | 88 30 70 12 93 22 82 36\n\
+Card 6: 31 18 13 56 72 | 74 77 10 23 35 67 36 11\n";
+    Solution solution;
+    solve(buf, strlen(buf), &solution);
+    ASSERT_STR("13", solution.part1);
+    ASSERT_STR("30", solution.part2);
+}
+
+#ifdef HAVE_INPUTS // compiled only when HAVE_INPUTS is defined
+CTEST(day04, real) { // solves input/<DAY>.txt and pins both answers
+    Solution solution;
+    solve_input("input/" DAY ".txt", &solution); // return value is not checked
+    ASSERT_STR("23847", solution.part1);
+    ASSERT_STR("8570000", solution.part2);
+}
+#endif
+
+int main(int argc, const char *argv[]) { return ctest_main(argc, argv); }
diff --git a/src/day04/sort.c b/src/day04/sort.c
new file mode 100644
index 0000000..ab71aa6
--- /dev/null
+++ b/src/day04/sort.c
@@ -0,0 +1,10 @@
+/*
+ * Author: Michael Adler
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#define SORT_NAME int // prefix of the generated functions, e.g. int_tim_sort()
+#define SORT_TYPE int // element type being sorted
+#define SORT_CMP(x, y) (((x) > (y)) - ((x) < (y))) // no overflow, unlike x - y
+#include "sort.h"
diff --git a/src/main.c b/src/main.c
new file mode 100644
index 0000000..f88ace6
--- /dev/null
+++ b/src/main.c
@@ -0,0 +1,23 @@
+/*
+ * Author: Michael Adler
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include <stdio.h>
+
+#include "solve.h"
+
+#ifndef DAY
+#error "Please define DAY"
+#endif
+
+// Solves DAY's puzzle using argv[1] or input/<DAY>.txt; exits 1 on failure.
+int main(int argc, char *argv[]) {
+    const char *path = "input/" DAY ".txt";
+    if (argc > 1) path = argv[1];
+    Solution answers;
+    const int failed = solve_input(path, &answers);
+    if (failed) fprintf(stderr, DAY ": no solution found!\n");
+    else printf("Part1: %s\nPart2: %s\n", answers.part1, answers.part2);
+    return failed ? 1 : 0;
+}
diff --git a/src/template/solve.c b/src/template/solve.c
new file mode 100644
index 0000000..6bf791e
--- /dev/null
+++ b/src/template/solve.c
@@ -0,0 +1,30 @@
+/*
+ * Author: Michael Adler
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "solve.h"
+#include "aoc/all.h"
+
+// Template solver: walks over every input byte without using it and reports
+// "0" for both parts. The loop body is the place for the actual solution.
+void solve(const char *buf, size_t buf_size, Solution *result) {
+    int part1 = 0, part2 = 0;
+    for (size_t pos = 0; pos < buf_size; pos++) {
+        (void)buf[pos]; // have fun
+    }
+    stbsp_snprintf(result->part1, sizeof(result->part1), "%d", part1);
+    stbsp_snprintf(result->part2, sizeof(result->part2), "%d", part2);
+}
+
+// Reads the puzzle input stored in `fname` into a 16 KiB stack buffer and
+// hands it to solve(). Returns 0 on success; if read_input() reports no data
+// (a result <= 0) an error is printed to stderr and -1 is returned.
+int solve_input(const char *fname, Solution *result) {
+    char input[1 << 14];
+    const int len = read_input(fname, input, sizeof(input));
+    if (len > 0) solve(input, len, result);
+    else fprintf(stderr, "Failed to read %s\n", fname);
+    return len > 0 ? 0 : -1;
+}
diff --git a/src/template/solve_test.c b/src/template/solve_test.c
new file mode 100644
index 0000000..9975997
--- /dev/null
+++ b/src/template/solve_test.c
@@ -0,0 +1,30 @@
+/*
+ * Author: Michael Adler
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#define CTEST_MAIN
+
+#include "ctest.h"
+#include "solve.h"
+
+CTEST(dayXX, example) { // placeholder input; the template solver yields "0"
+    const char *buf = "foo\n\
+bar\n";
+    Solution solution;
+    solve(buf, strlen(buf), &solution);
+    ASSERT_STR("0", solution.part1);
+    ASSERT_STR("0", solution.part2);
+}
+
+#ifdef HAVE_INPUTS // compiled only when HAVE_INPUTS is defined
+CTEST_SKIP(dayXX, real) { // declared with CTEST_SKIP; answers are placeholders
+    Solution solution;
+    solve_input("input/" DAY ".txt", &solution); // return value is not checked
+    ASSERT_STR("0", solution.part1);
+    ASSERT_STR("0", solution.part2);
+}
+#endif
+
+int main(int argc, const char *argv[]) { return ctest_main(argc, argv); }
diff --git a/vendor/ctest/ctest.h b/vendor/ctest/ctest.h
new file mode 100644
index 0000000..b9db8cd
--- /dev/null
+++ b/vendor/ctest/ctest.h
@@ -0,0 +1,610 @@
+/* Copyright 2011-2023 Bas van den Berg
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CTEST_H
+#define CTEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __GNUC__
+#define CTEST_IMPL_FORMAT_PRINTF(a, b) __attribute__ ((format(printf, a, b)))
+#else
+#define CTEST_IMPL_FORMAT_PRINTF(a, b)
+#endif
+
+#include <inttypes.h> /* intmax_t, uintmax_t, PRI* */
+#include <stdbool.h> /* bool, true, false */
+#include <stddef.h> /* size_t */
+
+typedef void (*ctest_nullary_run_func)(void);
+typedef void (*ctest_unary_run_func)(void*);
+typedef void (*ctest_setup_func)(void*);
+typedef void (*ctest_teardown_func)(void*);
+
+union ctest_run_func_union {
+    ctest_nullary_run_func nullary;
+    ctest_unary_run_func unary;
+};
+
+#define CTEST_IMPL_PRAGMA(x) _Pragma (#x)
+
+#if defined(__GNUC__)
+#if defined(__clang__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+/* the GCC argument will work for both gcc and clang  */
+#define CTEST_IMPL_DIAG_PUSH_IGNORED(w) \
+    CTEST_IMPL_PRAGMA(GCC diagnostic push) \
+    CTEST_IMPL_PRAGMA(GCC diagnostic ignored "-W" #w)
+#define CTEST_IMPL_DIAG_POP() \
+    CTEST_IMPL_PRAGMA(GCC diagnostic pop)
+#else
+/* the push/pop functionality wasn't in gcc until 4.6, fallback to "ignored"  */
+#define CTEST_IMPL_DIAG_PUSH_IGNORED(w) \
+    CTEST_IMPL_PRAGMA(GCC diagnostic ignored "-W" #w)
+#define CTEST_IMPL_DIAG_POP()
+#endif
+#else
+/* leave them out entirely for non-GNUC compilers  */
+#define CTEST_IMPL_DIAG_PUSH_IGNORED(w)
+#define CTEST_IMPL_DIAG_POP()
+#endif
+
+struct ctest {
+    const char* ssname;  // suite name
+    const char* ttname;  // test name
+    union ctest_run_func_union run;  // test body; invoked as unary when data is set, otherwise as nullary
+
+    void* data;                      // fixture handed to setup, run and teardown; NULL for plain CTEST tests
+    ctest_setup_func* setup;         // address of the suite's setup function pointer (either level may be NULL)
+    ctest_teardown_func* teardown;   // address of the suite's teardown function pointer (either level may be NULL)
+
+    int skip;                        // non-zero: reported as [SKIPPED] and not executed
+
+    unsigned int magic;              // CTEST_IMPL_MAGIC; ctest_main compares it to find neighbouring entries
+};
+
+#define CTEST_IMPL_NAME(name) ctest_##name
+#define CTEST_IMPL_FNAME(sname, tname) CTEST_IMPL_NAME(sname##_##tname##_run)
+#define CTEST_IMPL_TNAME(sname, tname) CTEST_IMPL_NAME(sname##_##tname)
+#define CTEST_IMPL_DATA_SNAME(sname) CTEST_IMPL_NAME(sname##_data)
+#define CTEST_IMPL_DATA_TNAME(sname, tname) CTEST_IMPL_NAME(sname##_##tname##_data)
+#define CTEST_IMPL_SETUP_FNAME(sname) CTEST_IMPL_NAME(sname##_setup)
+#define CTEST_IMPL_SETUP_FPNAME(sname) CTEST_IMPL_NAME(sname##_setup_ptr)
+#define CTEST_IMPL_SETUP_TPNAME(sname, tname) CTEST_IMPL_NAME(sname##_##tname##_setup_ptr)
+#define CTEST_IMPL_TEARDOWN_FNAME(sname) CTEST_IMPL_NAME(sname##_teardown)
+#define CTEST_IMPL_TEARDOWN_FPNAME(sname) CTEST_IMPL_NAME(sname##_teardown_ptr)
+#define CTEST_IMPL_TEARDOWN_TPNAME(sname, tname) CTEST_IMPL_NAME(sname##_##tname##_teardown_ptr)
+
+#define CTEST_IMPL_MAGIC (0xdeadbeef)
+#ifdef __APPLE__
+#define CTEST_IMPL_SECTION __attribute__ ((used, section ("__DATA, .ctest"), aligned(1)))
+#else
+#define CTEST_IMPL_SECTION __attribute__ ((used, section (".ctest"), aligned(1)))
+#endif
+
+#define CTEST_IMPL_STRUCT(sname, tname, tskip, tdata, tsetup, tteardown) \
+    static struct ctest CTEST_IMPL_TNAME(sname, tname) CTEST_IMPL_SECTION = { \
+        #sname, \
+        #tname, \
+        { (ctest_nullary_run_func) CTEST_IMPL_FNAME(sname, tname) }, \
+        tdata, \
+        (ctest_setup_func*) tsetup, \
+        (ctest_teardown_func*) tteardown, \
+        tskip, \
+        CTEST_IMPL_MAGIC, \
+    }
+
+#ifdef __cplusplus
+
+#define CTEST_SETUP(sname) \
+    template <> void CTEST_IMPL_SETUP_FNAME(sname)(struct CTEST_IMPL_DATA_SNAME(sname)* data)
+
+#define CTEST_TEARDOWN(sname) \
+    template <> void CTEST_IMPL_TEARDOWN_FNAME(sname)(struct CTEST_IMPL_DATA_SNAME(sname)* data)
+
+#define CTEST_DATA(sname) \
+    template <typename T> void CTEST_IMPL_SETUP_FNAME(sname)(T* data) { } \
+    template <typename T> void CTEST_IMPL_TEARDOWN_FNAME(sname)(T* data) { } \
+    struct CTEST_IMPL_DATA_SNAME(sname)
+
+#define CTEST_IMPL_CTEST(sname, tname, tskip) \
+    static void CTEST_IMPL_FNAME(sname, tname)(void); \
+    CTEST_IMPL_STRUCT(sname, tname, tskip, NULL, NULL, NULL); \
+    static void CTEST_IMPL_FNAME(sname, tname)(void)
+
+#define CTEST_IMPL_CTEST2(sname, tname, tskip) \
+    static struct CTEST_IMPL_DATA_SNAME(sname) CTEST_IMPL_DATA_TNAME(sname, tname); \
+    static void CTEST_IMPL_FNAME(sname, tname)(struct CTEST_IMPL_DATA_SNAME(sname)* data); \
+    static void (*CTEST_IMPL_SETUP_TPNAME(sname, tname))(struct CTEST_IMPL_DATA_SNAME(sname)*) = &CTEST_IMPL_SETUP_FNAME(sname)<struct CTEST_IMPL_DATA_SNAME(sname)>; \
+    static void (*CTEST_IMPL_TEARDOWN_TPNAME(sname, tname))(struct CTEST_IMPL_DATA_SNAME(sname)*) = &CTEST_IMPL_TEARDOWN_FNAME(sname)<struct CTEST_IMPL_DATA_SNAME(sname)>; \
+    CTEST_IMPL_STRUCT(sname, tname, tskip, &CTEST_IMPL_DATA_TNAME(sname, tname), &CTEST_IMPL_SETUP_TPNAME(sname, tname), &CTEST_IMPL_TEARDOWN_TPNAME(sname, tname)); \
+    static void CTEST_IMPL_FNAME(sname, tname)(struct CTEST_IMPL_DATA_SNAME(sname)* data)
+
+#else
+
+#define CTEST_SETUP(sname) \
+    static void CTEST_IMPL_SETUP_FNAME(sname)(struct CTEST_IMPL_DATA_SNAME(sname)* data); \
+    static void (*CTEST_IMPL_SETUP_FPNAME(sname))(struct CTEST_IMPL_DATA_SNAME(sname)*) = &CTEST_IMPL_SETUP_FNAME(sname); \
+    static void CTEST_IMPL_SETUP_FNAME(sname)(struct CTEST_IMPL_DATA_SNAME(sname)* data)
+
+#define CTEST_TEARDOWN(sname) \
+    static void CTEST_IMPL_TEARDOWN_FNAME(sname)(struct CTEST_IMPL_DATA_SNAME(sname)* data); \
+    static void (*CTEST_IMPL_TEARDOWN_FPNAME(sname))(struct CTEST_IMPL_DATA_SNAME(sname)*) = &CTEST_IMPL_TEARDOWN_FNAME(sname); \
+    static void CTEST_IMPL_TEARDOWN_FNAME(sname)(struct CTEST_IMPL_DATA_SNAME(sname)* data)
+
+#define CTEST_DATA(sname) \
+    struct CTEST_IMPL_DATA_SNAME(sname); \
+    static void (*CTEST_IMPL_SETUP_FPNAME(sname))(struct CTEST_IMPL_DATA_SNAME(sname)*); \
+    static void (*CTEST_IMPL_TEARDOWN_FPNAME(sname))(struct CTEST_IMPL_DATA_SNAME(sname)*); \
+    struct CTEST_IMPL_DATA_SNAME(sname)
+
+#define CTEST_IMPL_CTEST(sname, tname, tskip) \
+    static void CTEST_IMPL_FNAME(sname, tname)(void); \
+    CTEST_IMPL_STRUCT(sname, tname, tskip, NULL, NULL, NULL); \
+    static void CTEST_IMPL_FNAME(sname, tname)(void)
+
+#define CTEST_IMPL_CTEST2(sname, tname, tskip) \
+    static struct CTEST_IMPL_DATA_SNAME(sname) CTEST_IMPL_DATA_TNAME(sname, tname); \
+    static void CTEST_IMPL_FNAME(sname, tname)(struct CTEST_IMPL_DATA_SNAME(sname)* data); \
+    CTEST_IMPL_STRUCT(sname, tname, tskip, &CTEST_IMPL_DATA_TNAME(sname, tname), &CTEST_IMPL_SETUP_FPNAME(sname), &CTEST_IMPL_TEARDOWN_FPNAME(sname)); \
+    static void CTEST_IMPL_FNAME(sname, tname)(struct CTEST_IMPL_DATA_SNAME(sname)* data)
+
+#endif
+
+void CTEST_LOG(const char* fmt, ...) CTEST_IMPL_FORMAT_PRINTF(1, 2);
+void CTEST_ERR(const char* fmt, ...) CTEST_IMPL_FORMAT_PRINTF(1, 2);  // doesn't return
+
+#define CTEST(sname, tname) CTEST_IMPL_CTEST(sname, tname, 0)
+#define CTEST_SKIP(sname, tname) CTEST_IMPL_CTEST(sname, tname, 1)
+
+#define CTEST2(sname, tname) CTEST_IMPL_CTEST2(sname, tname, 0)
+#define CTEST2_SKIP(sname, tname) CTEST_IMPL_CTEST2(sname, tname, 1)
+
+
+void assert_str(const char* cmp, const char* exp, const char* real, const char* caller, int line);
+#define ASSERT_STR(exp, real) assert_str("==", exp, real, __FILE__, __LINE__)
+#define ASSERT_NOT_STR(exp, real) assert_str("!=", exp, real, __FILE__, __LINE__)
+#define ASSERT_STRSTR(str, substr) assert_str("=~", str, substr, __FILE__, __LINE__)
+#define ASSERT_NOT_STRSTR(str, substr) assert_str("!~", str, substr, __FILE__, __LINE__)
+
+void assert_wstr(const char* cmp, const wchar_t *exp, const wchar_t *real, const char* caller, int line);
+#define ASSERT_WSTR(exp, real) assert_wstr("==", exp, real, __FILE__, __LINE__)
+#define ASSERT_NOT_WSTR(exp, real) assert_wstr("!=", exp, real, __FILE__, __LINE__)
+#define ASSERT_WSTRSTR(str, substr) assert_wstr("=~", str, substr, __FILE__, __LINE__)
+#define ASSERT_NOT_WSTRSTR(str, substr) assert_wstr("!~", str, substr, __FILE__, __LINE__)
+
+void assert_data(const unsigned char* exp, size_t expsize,
+                 const unsigned char* real, size_t realsize,
+                 const char* caller, int line);
+#define ASSERT_DATA(exp, expsize, real, realsize) \
+    assert_data(exp, expsize, real, realsize, __FILE__, __LINE__)
+
+#define CTEST_FLT_EPSILON 1e-5
+#define CTEST_DBL_EPSILON 1e-12
+
+void assert_compare(const char* cmp, intmax_t exp, intmax_t real, const char* caller, int line);
+#define ASSERT_EQUAL(exp, real) assert_compare("==", exp, real, __FILE__, __LINE__)
+#define ASSERT_NOT_EQUAL(exp, real) assert_compare("!=", exp, real, __FILE__, __LINE__)
+
+#define ASSERT_LT(v1, v2) assert_compare("<", v1, v2, __FILE__, __LINE__)
+#define ASSERT_LE(v1, v2) assert_compare("<=", v1, v2, __FILE__, __LINE__)
+#define ASSERT_GT(v1, v2) assert_compare(">", v1, v2, __FILE__, __LINE__)
+#define ASSERT_GE(v1, v2) assert_compare(">=", v1, v2, __FILE__, __LINE__)
+
+void assert_compare_u(const char* cmp, uintmax_t exp, uintmax_t real, const char* caller, int line);
+#define ASSERT_EQUAL_U(exp, real) assert_compare_u("==", exp, real, __FILE__, __LINE__)
+#define ASSERT_NOT_EQUAL_U(exp, real) assert_compare_u("!=", exp, real, __FILE__, __LINE__)
+
+#define ASSERT_LT_U(v1, v2) assert_compare_u("<", v1, v2, __FILE__, __LINE__)
+#define ASSERT_LE_U(v1, v2) assert_compare_u("<=", v1, v2, __FILE__, __LINE__)
+#define ASSERT_GT_U(v1, v2) assert_compare_u(">", v1, v2, __FILE__, __LINE__)
+#define ASSERT_GE_U(v1, v2) assert_compare_u(">=", v1, v2, __FILE__, __LINE__)
+
+void assert_interval(intmax_t exp1, intmax_t exp2, intmax_t real, const char* caller, int line);
+#define ASSERT_INTERVAL(exp1, exp2, real) assert_interval(exp1, exp2, real, __FILE__, __LINE__)
+
+void assert_null(void* real, const char* caller, int line);
+#define ASSERT_NULL(real) assert_null((void*)real, __FILE__, __LINE__)
+
+void assert_not_null(const void* real, const char* caller, int line);
+#define ASSERT_NOT_NULL(real) assert_not_null(real, __FILE__, __LINE__)
+
+void assert_true(int real, const char* caller, int line);
+#define ASSERT_TRUE(real) assert_true(real, __FILE__, __LINE__)
+
+void assert_false(int real, const char* caller, int line);
+#define ASSERT_FALSE(real) assert_false(real, __FILE__, __LINE__)
+
+void assert_fail(const char* caller, int line);
+#define ASSERT_FAIL() assert_fail(__FILE__, __LINE__)
+
+void assert_dbl_compare(const char* cmp, double exp, double real, double tol, const char* caller, int line);
+#define ASSERT_DBL_NEAR(exp, real) assert_dbl_compare("==", exp, real, -CTEST_DBL_EPSILON, __FILE__, __LINE__)
+#define ASSERT_DBL_NEAR_TOL(exp, real, tol) assert_dbl_compare("==", exp, real, tol, __FILE__, __LINE__)
+#define ASSERT_DBL_FAR(exp, real) assert_dbl_compare("!=", exp, real, -CTEST_DBL_EPSILON, __FILE__, __LINE__)
+#define ASSERT_DBL_FAR_TOL(exp, real, tol) assert_dbl_compare("!=", exp, real, tol, __FILE__, __LINE__)
+
+#define ASSERT_FLT_NEAR(v1, v2) assert_dbl_compare("==", v1, v2, -CTEST_FLT_EPSILON, __FILE__, __LINE__)
+#define ASSERT_FLT_FAR(v1, v2) assert_dbl_compare("!=", v1, v2, -CTEST_FLT_EPSILON, __FILE__, __LINE__)
+#define ASSERT_DBL_LT(v1, v2) assert_dbl_compare("<", v1, v2, 0.0, __FILE__, __LINE__)
+#define ASSERT_DBL_GT(v1, v2) assert_dbl_compare(">", v1, v2, 0.0, __FILE__, __LINE__)
+
+#ifdef CTEST_MAIN
+
+#include <setjmp.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#if !defined(_WIN32) || defined(__GNUC__)
+#include <unistd.h>
+#elif defined(_WIN32)
+#include <io.h>
+#endif
+#include <stdint.h>
+#include <stdlib.h>
+#include <wchar.h>
+
+static size_t ctest_errorsize;
+static char* ctest_errormsg;
+#define MSG_SIZE 4096
+static char ctest_errorbuffer[MSG_SIZE];
+static jmp_buf ctest_err;
+static int color_output = 1;
+static const char* suite_name;
+
+typedef int (*ctest_filter_func)(struct ctest*);
+
+#define ANSI_BLACK    "\033[0;30m"
+#define ANSI_RED      "\033[0;31m"
+#define ANSI_GREEN    "\033[0;32m"
+#define ANSI_YELLOW   "\033[0;33m"
+#define ANSI_BLUE     "\033[0;34m"
+#define ANSI_MAGENTA  "\033[0;35m"
+#define ANSI_CYAN     "\033[0;36m"
+#define ANSI_GREY     "\033[0;37m"
+#define ANSI_DARKGREY "\033[01;30m"
+#define ANSI_BRED     "\033[01;31m"
+#define ANSI_BGREEN   "\033[01;32m"
+#define ANSI_BYELLOW  "\033[01;33m"
+#define ANSI_BBLUE    "\033[01;34m"
+#define ANSI_BMAGENTA "\033[01;35m"
+#define ANSI_BCYAN    "\033[01;36m"
+#define ANSI_WHITE    "\033[01;37m"
+#define ANSI_NORMAL   "\033[0m"
+
+CTEST(suite, test) { }  // built-in anchor entry: ctest_main starts its section scan here and skips it when running
+
+static void vprint_errormsg(const char* const fmt, va_list ap) CTEST_IMPL_FORMAT_PRINTF(1, 0);
+static void print_errormsg(const char* const fmt, ...) CTEST_IMPL_FORMAT_PRINTF(1, 2);
+
+static void vprint_errormsg(const char* const fmt, va_list ap) {
+    // Appends at the error-buffer cursor; (v)snprintf returns the length it wanted to write.
+    const int ret = vsnprintf(ctest_errormsg, ctest_errorsize, fmt, ap);
+    if (ret < 0) {
+        ctest_errormsg[0] = 0x00;  // formatting failed: leave an empty string at the cursor
+    } else {
+        const size_t size = (size_t) ret;
+        // Truncated output stored only ctest_errorsize-1 chars: advance by what was stored, so nothing can wrap.
+        const size_t s = (size < ctest_errorsize ? size : (ctest_errorsize ? ctest_errorsize - 1 : 0));
+        ctest_errorsize -= s;
+        ctest_errormsg += s;
+    }
+}
+
+static void print_errormsg(const char* const fmt, ...) {  // variadic front end for vprint_errormsg
+    va_list args;
+    va_start(args, fmt);
+    vprint_errormsg(fmt, args);
+    va_end(args);
+}
+
+static void msg_start(const char* color, const char* title) {
+    if (color_output) {
+        print_errormsg("%s", color);
+    }
+    print_errormsg("  %s: ", title);
+}
+
+static void msg_end(void) {
+    if (color_output) {
+        print_errormsg(ANSI_NORMAL);
+    }
+    print_errormsg("\n");
+}
+
+void CTEST_LOG(const char* fmt, ...)
+{
+    va_list argp;
+    msg_start(ANSI_BLUE, "LOG");
+
+    va_start(argp, fmt);
+    vprint_errormsg(fmt, argp);
+    va_end(argp);
+
+    msg_end();
+}
+
+CTEST_IMPL_DIAG_PUSH_IGNORED(missing-noreturn)
+
+void CTEST_ERR(const char* fmt, ...)
+{
+    va_list argp;
+    msg_start(ANSI_YELLOW, "ERR");
+
+    va_start(argp, fmt);
+    vprint_errormsg(fmt, argp);
+    va_end(argp);
+
+    msg_end();
+    longjmp(ctest_err, 1);
+}
+
+CTEST_IMPL_DIAG_POP()
+
+void assert_str(const char* cmp, const char* exp, const char*  real, const char* caller, int line) {
+    if ((!exp ^ !real) || (exp && (  // exactly one NULL always fails; two NULLs always pass
+        (cmp[1] == '=' && ((cmp[0] == '=') ^ (strcmp(exp, real) == 0))) ||  // "==" and "!=": whole-string compare
+        (cmp[1] == '~' && ((cmp[0] == '=') ^ (strstr(exp, real) != NULL)))  // "=~" and "!~": real searched inside exp
+    ))) {
+        CTEST_ERR("%s:%d  assertion failed, '%s' %s '%s'", caller, line, exp ? exp : "(null)", cmp, real ? real : "(null)");  // NULL must not reach %s
+    }
+}
+
+void assert_wstr(const char* cmp, const wchar_t *exp, const wchar_t *real, const char* caller, int line) {
+    if ((!exp ^ !real) || (exp && (  // exactly one NULL always fails; two NULLs always pass
+        (cmp[1] == '=' && ((cmp[0] == '=') ^ (wcscmp(exp, real) == 0))) ||  // "==" and "!=": whole-string compare
+        (cmp[1] == '~' && ((cmp[0] == '=') ^ (wcsstr(exp, real) != NULL)))  // "=~" and "!~": real searched inside exp
+    ))) {
+        CTEST_ERR("%s:%d  assertion failed, '%ls' %s '%ls'", caller, line, exp ? exp : L"(null)", cmp, real ? real : L"(null)");  // NULL must not reach %ls
+    }
+}
+
+void assert_data(const unsigned char* exp, size_t expsize,
+                 const unsigned char* real, size_t realsize,
+                 const char* caller, int line) {
+    // A length mismatch is reported first; CTEST_ERR does not return.
+    if (expsize != realsize) {
+        CTEST_ERR("%s:%d  expected %" PRIuMAX " bytes, got %" PRIuMAX, caller, line, (uintmax_t) expsize, (uintmax_t) realsize);
+    }
+    // Equal lengths: report the first byte that differs.
+    for (size_t off = 0; off < expsize; off++) {
+        if (exp[off] == real[off]) continue;
+        CTEST_ERR("%s:%d expected 0x%02x at offset %" PRIuMAX " got 0x%02x",
+            caller, line, exp[off], (uintmax_t) off, real[off]);
+    }
+}
+
+static bool get_compare_result(const char* cmp, int c3, bool eq) {
+    switch (cmp[0]) {  // a second character '=' turns "<" / ">" into "<=" / ">="
+    case '<': return c3 < 0 || (cmp[1] == '=' && eq);
+    case '>': return c3 > 0 || (cmp[1] == '=' && eq);
+    default:  return (cmp[0] == '=') == eq;  // "==" passes when eq holds, anything else when it does not
+    }
+}
+
+void assert_compare(const char* cmp, intmax_t exp, intmax_t real, const char* caller, int line) {
+    // Three-way compare without subtraction: -1 if exp < real, 0 if equal, +1 if exp > real.
+    const int order = (exp > real) - (real > exp);
+
+    if (get_compare_result(cmp, order, order == 0)) return;
+    CTEST_ERR("%s:%d  assertion failed, %" PRIdMAX " %s %" PRIdMAX "", caller, line, exp, cmp, real);
+}
+
+void assert_compare_u(const char* cmp, uintmax_t exp, uintmax_t real, const char* caller, int line) {
+    // Three-way compare of unsigned values: -1 if exp < real, 0 if equal, +1 if exp > real.
+    const int order = (exp > real) - (real > exp);
+
+    if (get_compare_result(cmp, order, order == 0)) return;
+    CTEST_ERR("%s:%d  assertion failed, %" PRIuMAX " %s %" PRIuMAX, caller, line, exp, cmp, real);
+}
+
+void assert_interval(intmax_t exp1, intmax_t exp2, intmax_t real, const char* caller, int line) {
+    const bool inside = exp1 <= real && real <= exp2;  // closed interval [exp1, exp2]
+    if (!inside)
+        CTEST_ERR("%s:%d  expected %" PRIdMAX "-%" PRIdMAX ", got %" PRIdMAX, caller, line, exp1, exp2, real);
+}
+
+static bool approximately_equal(double a, double b, double epsilon) {
+    // Relative test: |a - b| may be at most epsilon times the larger of |a| and |b|.
+    const double d = a - b, dist = d < 0 ? -d : d;
+    const double mag_a = a < 0 ? -a : a;
+    const double mag_b = b < 0 ? -b : b;
+    return dist <= (mag_a > mag_b ? mag_a : mag_b)*epsilon;     /* D.Knuth */
+}
+
+/* tol < 0 means it is an epsilon, else absolute error */
+void assert_dbl_compare(const char* cmp, double exp, double real, double tol, const char* caller, int line) {
+    double diff = exp - real;
+    double absdiff = diff < 0 ? -diff : diff;
+    int c3 = (real < exp) - (exp < real);
+    bool eq = tol < 0 ? approximately_equal(exp, real, -tol) : absdiff <= tol;
+
+    if (!get_compare_result(cmp, c3, eq)) {
+        const char* tolstr = "tol";
+        if (tol < 0) {
+            tolstr = "eps";
+            tol = -tol;
+        }
+        CTEST_ERR("%s:%d  assertion failed, %.8g %s %.8g (diff %.4g, %s %.4g)", caller, line, exp, cmp, real, diff, tolstr, tol);
+    }
+}
+
+// Fails the current test unless real is NULL.
+void assert_null(void* real, const char* caller, int line) {
+    if (real == NULL) return;
+    CTEST_ERR("%s:%d  should be NULL", caller, line);
+}
+
+// Fails the current test when real is NULL.
+void assert_not_null(const void* real, const char* caller, int line) {
+    if (real != NULL) return;
+    CTEST_ERR("%s:%d  should not be NULL", caller, line);
+}
+
+// Fails the current test when real is zero.
+void assert_true(int real, const char* caller, int line) {
+    if (real != 0) return;
+    CTEST_ERR("%s:%d  should be true", caller, line);
+}
+
+// Fails the current test when real is non-zero.
+void assert_false(int real, const char* caller, int line) {
+    if (real == 0) return;
+    CTEST_ERR("%s:%d  should be false", caller, line);
+}
+
+void assert_fail(const char* caller, int line) {  // unconditional failure, for code paths a test must not reach
+    CTEST_ERR("%s:%d  shouldn't come here", caller, line);
+}
+
+
+static int suite_all(struct ctest* t) {  // default filter used by ctest_main: accepts every test
+    (void) t; // fix unused parameter warning
+    return 1;
+}
+
+static int suite_filter(struct ctest* t) {  // accepts tests whose suite name starts with suite_name
+    return !strncmp(suite_name, t->ssname, strlen(suite_name));
+}
+
+static void color_print(const char* color, const char* text) {  // one stdout line, colored only when enabled
+    if (!color_output)
+        printf("%s\n", text);
+    else
+        printf("%s%s" ANSI_NORMAL "\n", color, text);
+}
+
+#ifdef CTEST_SEGFAULT
+#include <signal.h>
+static void sighandler(int signum)
+{
+    static const char msg_color[] = ANSI_BRED "[SIGSEGV: Segmentation fault]" ANSI_NORMAL "\n";
+    static const char msg_nocolor[] = "[SIGSEGV: Segmentation fault]\n";
+    const char* text = msg_nocolor;
+    if (color_output) text = msg_color;
+    write(STDOUT_FILENO, text, (unsigned int)strlen(text));
+
+    /* Restore the default disposition and send the signal to this process
+     * again, so it terminates the way an unhandled signal would. */
+    signal(signum, SIG_DFL);
+#if !defined(_WIN32) || defined(__CYGWIN__)
+    kill(getpid(), signum);
+#endif
+}
+#endif
+
+int ctest_main(int argc, const char *argv[]);
+
+__attribute__((no_sanitize_address)) int ctest_main(int argc, const char *argv[])
+{
+    static int total = 0;     // counters are static, so they keep their values across calls
+    static int num_ok = 0;
+    static int num_fail = 0;
+    static int num_skip = 0;
+    static int idx = 1;       // 1-based position printed as "TEST idx/total"
+    static ctest_filter_func filter = suite_all;
+
+#ifdef CTEST_SEGFAULT
+    signal(SIGSEGV, sighandler);
+#endif
+
+    if (argc == 2) {          // exactly one argument: run only suites whose name starts with it
+        suite_name = argv[1];
+        filter = suite_filter;
+    }
+#ifdef CTEST_NO_COLORS
+    color_output = 0;
+#else
+    color_output = isatty(1); // colors only when file descriptor 1 is a terminal
+#endif
+    clock_t t1 = clock();
+
+    struct ctest* ctest_begin = &CTEST_IMPL_TNAME(suite, test);
+    struct ctest* ctest_end = &CTEST_IMPL_TNAME(suite, test);
+    // find begin and end of section by comparing magics (reads one entry beyond each end of the run)
+    while (1) {
+        struct ctest* t = ctest_begin-1;
+        if (t->magic != CTEST_IMPL_MAGIC) break;
+        ctest_begin--;
+    }
+    while (1) {
+        struct ctest* t = ctest_end+1;
+        if (t->magic != CTEST_IMPL_MAGIC) break;
+        ctest_end++;
+    }
+    ctest_end++;    // end after last one
+
+    static struct ctest* test;
+    for (test = ctest_begin; test != ctest_end; test++) { // first pass: count the tests accepted by the filter
+        if (test == &CTEST_IMPL_TNAME(suite, test)) continue; // the anchor entry is never counted or run
+        if (filter(test)) total++;
+    }
+
+    for (test = ctest_begin; test != ctest_end; test++) { // second pass: run them
+        if (test == &CTEST_IMPL_TNAME(suite, test)) continue;
+        if (filter(test)) {
+            ctest_errorbuffer[0] = 0;            // reset the message buffer for this test
+            ctest_errorsize = MSG_SIZE-1;
+            ctest_errormsg = ctest_errorbuffer;
+            printf("TEST %d/%d %s:%s ", idx, total, test->ssname, test->ttname);
+            fflush(stdout);
+            if (test->skip) {
+                color_print(ANSI_BYELLOW, "[SKIPPED]");
+                num_skip++;
+            } else {
+                int result = setjmp(ctest_err);  // CTEST_ERR longjmps back here with value 1
+                if (result == 0) {
+                    if (test->setup && *test->setup) (*test->setup)(test->data);
+                    if (test->data)
+                        test->run.unary(test->data);
+                    else
+                        test->run.nullary();
+                    if (test->teardown && *test->teardown) (*test->teardown)(test->data); // not reached when the test longjmps out
+                    // if we got here it's ok
+#ifdef CTEST_COLOR_OK
+                    color_print(ANSI_BGREEN, "[OK]");
+#else
+                    printf("[OK]\n");
+#endif
+                    num_ok++;
+                } else {
+                    color_print(ANSI_BRED, "[FAIL]");
+                    num_fail++;
+                }
+                if (ctest_errorsize != MSG_SIZE-1) printf("%s", ctest_errorbuffer); // print only if something was logged
+            }
+            idx++;
+        }
+    }
+    clock_t t2 = clock();
+
+    const char* color = (num_fail) ? ANSI_BRED : ANSI_GREEN;
+    char results[80];
+    snprintf(results, sizeof(results), "RESULTS: %d tests (%d ok, %d failed, %d skipped) ran in %.1f ms",
+             total, num_ok, num_fail, num_skip, (double)(t2 - t1)*1000.0/CLOCKS_PER_SEC);
+    color_print(color, results);
+    return num_fail;          // non-zero when at least one test failed
+}
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/vendor/ctl/ctl.h b/vendor/ctl/ctl.h
new file mode 100644
index 0000000..1b5ecfd
--- /dev/null
+++ b/vendor/ctl/ctl.h
@@ -0,0 +1,19 @@
+#ifndef __CTL_H__
+#define __CTL_H__
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#define CAT(a, b) a##b // raw token paste; arguments are not macro-expanded first
+
+#define PASTE(a, b) CAT(a, b) // extra level so the arguments are expanded before pasting
+
+#define JOIN(prefix, name) PASTE(prefix, PASTE(_, name)) // yields prefix_name
+
+#define SWAP(TYPE, a, b) { TYPE temp = *(a); *(a) = *(b); *(b) = temp; } // a and b are pointers; each is evaluated twice
+
+#define foreach(a, b, c) for(JOIN(a, it) c = JOIN(JOIN(a, it), each) (b); !c.done; c.step(&c)) // c iterates container b of type a
+
+#define len(a) (sizeof(a) / sizeof(*(a))) // element count; meaningful for true arrays, not for pointers
+
+#endif
diff --git a/vendor/ctl/deq.h b/vendor/ctl/deq.h
new file mode 100644
index 0000000..fd0d338
--- /dev/null
+++ b/vendor/ctl/deq.h
@@ -0,0 +1,470 @@
+//
+// Double Ended Queue
+//
+
+#ifndef T
+#error "Template type T undefined for <deq.h>"
+#endif
+
+#include <ctl.h>
+
+#define A JOIN(deq, T)
+#define B JOIN(A, bucket)
+#define Z JOIN(A, it)
+
+#define DEQ_BUCKET_SIZE (512)
+
+typedef struct B // one fixed-size page of elements
+{
+    T value[DEQ_BUCKET_SIZE];
+    int16_t a; // index of the first used slot (push_front decrements it, pop_front increments it)
+    int16_t b; // one past the last used slot (push_back increments it, pop_back decrements it)
+}
+B;
+
+typedef struct A // the deque itself: a table of page pointers plus bookkeeping
+{
+    void (*free)(T*);  // element destructor; may be NULL, every call site checks first
+    T (*copy)(T*);     // element copy function
+    B** pages;         // table of page pointers, capacity entries long
+    size_t mark_a;     // table index of the front page
+    size_t mark_b;     // one past the table index of the back page
+    size_t capacity;   // number of entries allocated in pages
+    size_t size;       // number of elements stored
+}
+A;
+
+typedef struct Z // iterator state used by foreach
+{
+    void (*step)(struct Z*); // advance function; left NULL on an iterator created already done
+    A* container;
+    T* ref;                  // element at the current position
+    size_t index;            // current logical position
+    size_t index_next;       // position the next step moves to
+    size_t index_last;       // position at which step sets done
+    int done;                // non-zero once iteration is finished
+}
+Z;
+
+// Address of the page-table entry that holds the front page.
+static inline B** JOIN(A, first)(A* self)
+{
+    return self->pages + self->mark_a;
+}
+
+// Address of the page-table entry that holds the back page (mark_b is one past it).
+static inline B** JOIN(A, last)(A* self)
+{
+    return self->pages + (self->mark_b - 1);
+}
+
+// Pointer to the element at logical position index, or NULL when the deque is
+// empty. The index is not range-checked against size.
+static inline T* JOIN(A, at)(A* self, size_t index)
+{
+    if(self->size == 0)
+        return NULL;
+
+    // Logical positions are counted from the first used slot of the front page.
+    B* front_page = *JOIN(A, first)(self);
+    size_t slot = index + front_page->a;
+    size_t page_no = slot / DEQ_BUCKET_SIZE;
+    size_t offset = slot % DEQ_BUCKET_SIZE;
+    B* page = self->pages[self->mark_a + page_no];
+    return page->value + offset;
+}
+
+// Pointer to the first element, or NULL when the deque is empty.
+static inline T* JOIN(A, front)(A* self)
+{
+    return JOIN(A, at)(self, (size_t) 0);
+}
+
+// Pointer to the last element, or NULL when the deque is empty.
+static inline T* JOIN(A, back)(A* self)
+{
+    return JOIN(A, at)(self, self->size - 1);
+}
+
+// Iteration start: same pointer as front.
+static inline T* JOIN(A, begin)(A* self)
+{
+    return JOIN(A, front)(self);
+}
+
+// Iteration end: the address one element past back.
+static inline T* JOIN(A, end)(A* self)
+{
+    return 1 + JOIN(A, back)(self);
+}
+
+// Advances the iterator by one element; sets done once index reaches index_last.
+static inline void JOIN(Z, step)(Z* self)
+{
+    self->index = self->index_next;
+    if(self->index == self->index_last)
+    {
+        self->done = 1;
+        return;
+    }
+    self->ref = JOIN(A, at)(self->container, self->index);
+    self->index_next += 1;
+}
+
+// Builds an iterator over [begin, end) of container; a NULL bound yields an
+// iterator that is already done. Positions come from pointer subtraction.
+static inline Z JOIN(Z, range)(A* container, T* begin, T* end)
+{
+    static Z zero;
+    Z it = zero;
+    if(!begin || !end)
+    {
+        it.done = 1;
+        return it;
+    }
+    it.step = JOIN(Z, step);
+    it.container = container;
+    it.index = begin - JOIN(A, begin)(container);
+    it.index_next = it.index + 1;
+    it.index_last = container->size - (JOIN(A, end)(container) - end);
+    it.ref = JOIN(A, at)(container, it.index);
+    return it;
+}
+
+// Returns 1 when the deque holds no elements, 0 otherwise.
+static inline int JOIN(A, empty)(A* self)
+{
+    return !self->size;
+}
+
+// Iterator over the whole deque; already done when the deque is empty.
+static inline Z JOIN(Z, each)(A* a)
+{
+    if(JOIN(A, empty)(a))
+        return JOIN(Z, range)(a, NULL, NULL);
+    return JOIN(Z, range)(a, JOIN(A, begin)(a), JOIN(A, end)(a));
+}
+
+// Copy callback that returns the element by plain value assignment.
+static inline T JOIN(A, implicit_copy)(T* self)
+{
+    return self[0];
+}
+
+// Returns 1 when both deques have the same size and _equal accepts every pair
+// of elements at matching positions, 0 otherwise.
+static inline int JOIN(A, equal)(A* self, A* other, int _equal(T*, T*))
+{
+    if(self->size != other->size)
+        return 0;
+
+    Z lhs = JOIN(Z, each)(self);
+    Z rhs = JOIN(Z, each)(other);
+    for(; !(lhs.done || rhs.done); lhs.step(&lhs), rhs.step(&rhs))
+    {
+        if(!_equal(lhs.ref, rhs.ref))
+            return 0;
+    }
+    return 1;
+}
+
+// Exchanges the complete state of two deques by swapping the structs.
+static inline void JOIN(A, swap)(A* self, A* other)
+{
+    A saved_other = *other;
+    *other = *self;
+    *self = saved_other;
+}
+
+static inline A // returns an empty deque with its callbacks installed
+JOIN(A, init)(void)
+{
+    static A zero; // static storage, hence all fields zero/NULL
+    A self = zero;
+#ifdef P
+#undef P
+    self.copy = JOIN(A, implicit_copy); // P defined by the includer: value copy, free stays NULL
+#else
+    self.free = JOIN(T, free); // otherwise the includer must provide T_free and T_copy
+    self.copy = JOIN(T, copy);
+#endif
+    return self;
+}
+
+// Allocates a page whose used range is empty and sits at slot cut; malloc is not checked.
+static inline B* JOIN(B, init)(size_t cut)
+{
+    B* page = (B*) malloc(sizeof(*page));
+    page->a = page->b = cut;
+    return page;
+}
+
+// Overwrites the element at index with value, releasing the old element first
+// when a free callback is installed.
+static inline void JOIN(A, set)(A* self, size_t index, T value)
+{
+    T* slot = JOIN(A, at)(self, index);
+    if(self->free) self->free(slot);
+    *slot = value;
+}
+
+static inline void // resizes the page table to capacity entries and slides the live entries upward
+JOIN(A, alloc)(A* self, size_t capacity, size_t shift_from)
+{
+    self->capacity = capacity;
+    self->pages = (B**) realloc(self->pages, capacity * sizeof(B*)); // NOTE(review): result unchecked; on failure the old table pointer is lost
+    size_t shift = (self->capacity - shift_from) / 2; // half of the room above shift_from
+    size_t i = self->mark_b;
+    while(i != 0) // walk from the top down, so an entry is read before its slot is overwritten
+    {
+        i -= 1;
+        self->pages[i + shift] = self->pages[i];
+    }
+    self->mark_a += shift;
+    self->mark_b += shift;
+}
+
+static inline void // prepends value, adding a page (and growing the table) when the front page has no room
+JOIN(A, push_front)(A* self, T value)
+{
+    if(JOIN(A, empty)(self))
+    {
+        self->mark_a = 0; // start over with a one-entry table
+        self->mark_b = 1;
+        JOIN(A, alloc)(self, 1, 0);
+        *JOIN(A, last)(self) = JOIN(B, init)(DEQ_BUCKET_SIZE); // empty range at the top end, so it fills downward
+    }
+    else
+    {
+        B* page = *JOIN(A, first)(self);
+        if(page->a == 0) // front page is used down to slot 0
+        {
+            if(self->mark_a == 0) // no free table entry in front: double the table
+                JOIN(A, alloc)(self, 2 * self->capacity, self->mark_a);
+            self->mark_a -= 1;
+            *JOIN(A, first)(self) = JOIN(B, init)(DEQ_BUCKET_SIZE);
+        }
+    }
+    B* page = *JOIN(A, first)(self);
+    page->a -= 1;
+    self->size += 1;
+    page->value[page->a] = value; // value is stored as passed, no copy callback involved
+}
+
+// Removes the front element (destroying it when a free callback is set) and
+// releases the front page once it holds no more elements. No emptiness check
+// is performed.
+static inline void JOIN(A, pop_front)(A* self)
+{
+    B* front_page = *JOIN(A, first)(self);
+    if(self->free)
+        self->free(front_page->value + front_page->a);
+    front_page->a += 1;
+    self->size -= 1;
+
+    if(front_page->a != front_page->b)
+        return;
+    // Page drained: hand it back and move the front mark to the next entry.
+    free(front_page);
+    self->mark_a += 1;
+}
+
+static inline void // appends value, adding a page (and growing the table) when the back page is full
+JOIN(A, push_back)(A* self, T value)
+{
+    if(JOIN(A, empty)(self))
+    {
+        self->mark_a = 0; // start over with a one-entry table
+        self->mark_b = 1;
+        JOIN(A, alloc)(self, 1, 0);
+        *JOIN(A, last)(self) = JOIN(B, init)(0); // empty range at slot 0, so it fills upward
+    }
+    else
+    {
+        B* page = *JOIN(A, last)(self);
+        if(page->b == DEQ_BUCKET_SIZE) // back page is used up to its last slot
+        {
+            if(self->mark_b == self->capacity) // no free table entry behind: double the table
+                JOIN(A, alloc)(self, 2 * self->capacity, self->mark_b);
+            self->mark_b += 1;
+            *JOIN(A, last)(self) = JOIN(B, init)(0);
+        }
+    }
+    B* page = *JOIN(A, last)(self);
+    page->value[page->b] = value; // value is stored as passed, no copy callback involved
+    page->b += 1;
+    self->size += 1;
+}
+
+// Removes the back element (destroying it when a free callback is set) and
+// releases the back page once it holds no more elements. No emptiness check
+// is performed.
+static inline void JOIN(A, pop_back)(A* self)
+{
+    B* back_page = *JOIN(A, last)(self);
+    back_page->b -= 1;
+    self->size -= 1;
+    if(self->free)
+        self->free(back_page->value + back_page->b);
+
+    if(back_page->b != back_page->a)
+        return;
+    // Page drained: hand it back and pull the back mark in by one entry.
+    free(back_page);
+    self->mark_b -= 1;
+}
+
+static inline void // removes the element at index, shifting whichever side of it is shorter
+JOIN(A, erase)(A* self, size_t index)
+{
+    static T zero;
+    JOIN(A, set)(self, index, zero); // releases the old element (if a free callback is set) and blanks the slot
+    void (*saved)(T*) = self->free;
+    self->free = NULL; // the pop below must not call free on the leftover duplicate
+    if(index < self->size / 2)
+    {
+        for(size_t i = index; i > 0; i--) // move elements [0, index) one position towards the back
+            *JOIN(A, at)(self, i) = *JOIN(A, at)(self, i - 1);
+        JOIN(A, pop_front)(self);
+    }
+    else
+    {
+        for(size_t i = index; i < self->size - 1; i++) // move elements after index one position towards the front
+            *JOIN(A, at)(self, i) = *JOIN(A, at)(self, i + 1);
+        JOIN(A, pop_back)(self);
+    }
+    self->free = saved;
+}
+
+static inline void // inserts value at index, shifting whichever side of it is shorter
+JOIN(A, insert)(A* self, size_t index, T value)
+{
+    if(self->size > 0)
+    {
+        void (*saved)(T*) = self->free;
+        self->free = NULL; // callback is cleared while elements are shuffled and restored below
+        if(index < self->size / 2)
+        {
+            JOIN(A, push_front)(self, *JOIN(A, at)(self, 0)); // duplicate the first element to grow by one
+            for(size_t i = 0; i < index; i++) // move the leading elements one position towards the front
+                *JOIN(A, at)(self, i) = *JOIN(A, at)(self, i + 1);
+        }
+        else
+        {
+            JOIN(A, push_back)(self, *JOIN(A, at)(self, self->size - 1)); // duplicate the last element to grow by one
+            for(size_t i = self->size - 1; i > index; i--) // move the trailing elements one position towards the back
+                *JOIN(A, at)(self, i) = *JOIN(A, at)(self, i - 1);
+        }
+        *JOIN(A, at)(self, index) = value; // plain store: the slot holds a duplicate, nothing to release
+        self->free = saved;
+    }
+    else
+        JOIN(A, push_back)(self, value); // empty deque: index is ignored
+}
+
+// Grows (with copies of value) or shrinks the deque to exactly size elements,
+// then releases value itself when a free callback is set.
+static inline void JOIN(A, resize)(A* self, size_t size, T value)
+{
+    while(self->size != size)
+    {
+        if(self->size > size)
+            JOIN(A, pop_back)(self);
+        else
+            JOIN(A, push_back)(self, self->copy(&value));
+    }
+
+    if(self->free) self->free(&value);
+}
+
+// Makes the deque hold exactly size elements, each a fresh copy of value;
+// value itself is released afterwards when a free callback is set.
+static inline void JOIN(A, assign)(A* self, size_t size, T value)
+{
+    JOIN(A, resize)(self, size, self->copy(&value));
+    for(size_t pos = 0; pos != size; pos++)
+        JOIN(A, set)(self, pos, self->copy(&value));
+    if(self->free) self->free(&value);
+}
+
+// Pops from the back until no elements remain.
+static inline void JOIN(A, clear)(A* self)
+{
+    while(self->size != 0)
+        JOIN(A, pop_back)(self);
+}
+
+// Removes every element, releases the page table and resets self to the state init returns.
+static inline void JOIN(A, free)(A* self)
+{
+    JOIN(A, clear)(self);
+    free(self->pages);
+    *self = JOIN(A, init)();
+}
+
+// Returns a new deque holding copies of every element of self, in order. The
+// copies are made with the callback installed by init, not the one in self.
+static inline A JOIN(A, copy)(A* self)
+{
+    A dup = JOIN(A, init)();
+    for(size_t pos = 0; pos < self->size; pos++)
+    {
+        JOIN(A, push_back)(&dup, dup.copy(JOIN(A, at)(self, pos)));
+    }
+    return dup;
+}
+
+static inline void // recursive quicksort of the inclusive index range [a, b]
+JOIN(A, ranged_sort)(A* self, int64_t a, int64_t b, int _compare(T*, T*))
+{
+    if(a >= b) // zero or one element: nothing to do
+        return;
+    int64_t mid = (a + b) / 2;
+    SWAP(T, JOIN(A, at)(self, a), JOIN(A, at)(self, mid)); // middle element becomes the pivot, parked at a
+    int64_t z = a; // last index of the group for which _compare(pivot, element) is non-zero
+    for(int64_t i = a + 1; i <= b; i++)
+        if(_compare(JOIN(A, at)(self, a), JOIN(A, at)(self, i)))
+        {
+            z += 1;
+            SWAP(T, JOIN(A, at)(self, z), JOIN(A, at)(self, i));
+        }
+    SWAP(T, JOIN(A, at)(self, a), JOIN(A, at)(self, z)); // pivot moves to its final position z
+    JOIN(A, ranged_sort)(self, a, z - 1, _compare);
+    JOIN(A, ranged_sort)(self, z + 1, b, _compare);
+}
+
+// Sorts the whole deque in place using _compare; an empty deque is left untouched.
+static inline void JOIN(A, sort)(A* self, int _compare(T*, T*))
+{
+    JOIN(A, ranged_sort)(self, 0, (int64_t) self->size - 1, _compare); // signed math: size 0 gives -1 instead of a wrapped size_t
+}
+
+static inline size_t // erases every element for which _match is non-zero; returns how many were erased
+JOIN(A, remove_if)(A* self, int _match(T*))
+{
+    size_t erases = 0;
+    foreach(A, self, it)
+        if(_match(it.ref))
+        {
+            JOIN(A, erase)(self, it.index);
+            it.index_next = it.index; // the next step stays on this index, now holding the following element
+            it.index_last -= 1;       // one element fewer to visit
+            erases += 1;
+        }
+    return erases;
+}
+
+// Linear search: first element for which _equal(element, &key) is non-zero, else NULL.
+static inline T* JOIN(A, find)(A* self, T key, int _equal(T*, T*))
+{
+    for(Z it = JOIN(Z, each)(self); !it.done; it.step(&it))
+        if(_equal(it.ref, &key))
+            return it.ref;
+    return NULL;
+}
+
+#undef T
+#undef A
+#undef B
+#undef Z
+
+#undef DEQ_BUCKET_SIZE
diff --git a/vendor/ctl/lst.h b/vendor/ctl/lst.h
new file mode 100644
index 0000000..51192f9
--- /dev/null
+++ b/vendor/ctl/lst.h
@@ -0,0 +1,417 @@
+//
+// Doubly Linked List
+//
+
+#ifndef T
+#error "Template type T undefined for <lst.h>"
+#endif
+
+#include <ctl.h>
+
+#define A JOIN(lst, T) // Name of the list type for this T.
+#define B JOIN(A, node) // Name of the node type.
+#define Z JOIN(A, it) // Name of the iterator type.
+
+typedef struct B
+{
+    struct B* prev; // Previous node; NULL for the head.
+    struct B* next; // Next node; NULL for the tail.
+    T value; // Element stored in this node.
+}
+B;
+
+typedef struct A
+{
+    void (*free)(T*); // Element destructor; stays NULL when init() is compiled with P defined.
+    T (*copy)(T*); // Element copier used by copy(), resize() and assign().
+    B* head; // First node.
+    B* tail; // Last node.
+    size_t size; // Number of linked nodes.
+}
+A;
+
+typedef struct Z
+{
+    void (*step)(struct Z*); // Advances the iterator; invoked as it.step(&it).
+    T* ref; // Value of the current node.
+    B* begin; // First node of the range.
+    B* node; // Current node.
+    B* next; // Successor of the current node, cached ahead of time.
+    B* end; // Node at which iteration stops (exclusive).
+    int done; // Set to 1 once the range is exhausted.
+}
+Z;
+
+static inline T*
+JOIN(A, front)(A* self) // Address of the head node's value; goes through head, so the list must not be empty.
+{
+    return &self->head->value;
+}
+
+static inline T*
+JOIN(A, back)(A* self) // Address of the tail node's value; goes through tail, so the list must not be empty.
+{
+    return &self->tail->value;
+}
+
+static inline B*
+JOIN(A, begin)(A* self) // First node of the list (the head pointer).
+{
+    return self->head;
+}
+
+static inline B*
+JOIN(A, end)(A* self) // End marker for iteration: always NULL.
+{
+    (void) self; // Parameter kept for a uniform signature; unused.
+    return NULL;
+}
+
+static inline void
+JOIN(Z, step)(Z* self) // Advances to the cached successor, or marks the iterator done when that successor is the end node.
+{
+    if(self->next == self->end)
+        self->done = 1;
+    else
+    {
+        self->node = self->next;
+        self->ref = &self->node->value;
+        self->next = self->node->next; // Cache the following node before the caller touches the current one.
+    }
+}
+
+static inline Z
+JOIN(Z, range)(A* container, B* begin, B* end) // Builds an iterator that starts at `begin` and stops before `end`.
+{
+    (void) container; // Unused.
+    static Z zero; // Static storage is zero-initialised; used as an all-zero template.
+    Z self = zero;
+    if(begin)
+    {
+        self.step = JOIN(Z, step);
+        self.begin = begin;
+        self.end = end;
+        self.next = begin->next;
+        self.node = begin;
+        self.ref = &begin->value;
+    }
+    else
+        self.done = 1; // NULL begin: nothing to visit.
+    return self;
+}
+
+static inline int
+JOIN(A, empty)(A* self) // Non-zero when the list holds no elements.
+{
+    return self->size == 0;
+}
+
+static inline Z // Iterator over the whole list; it is already done when the list is empty.
+JOIN(Z, each)(A* a)
+{
+    if(JOIN(A, empty)(a))
+        return JOIN(Z, range)(a, NULL, NULL);
+    return JOIN(Z, range)(a, JOIN(A, begin)(a), JOIN(A, end)(a));
+}
+
+static inline T
+JOIN(A, implicit_copy)(T* self) // Copy callback that copies by plain assignment; installed by init() when P is defined.
+{
+    return *self;
+}
+
+static inline int // Returns 1 when both lists have equal size and _equal holds for every aligned pair, otherwise 0.
+JOIN(A, equal)(A* self, A* other, int _equal(T*, T*))
+{
+    if(self->size != other->size)
+        return 0;
+    Z lhs = JOIN(Z, each)(self);
+    Z rhs = JOIN(Z, each)(other);
+    // Walk both lists in lockstep until either iterator runs out;
+    // the first pair that _equal rejects decides the answer.
+    for(; !(lhs.done || rhs.done); lhs.step(&lhs), rhs.step(&rhs))
+    {
+        if(!_equal(lhs.ref, rhs.ref))
+            return 0;
+    }
+    return 1;
+}
+
+static inline void
+JOIN(A, swap)(A* self, A* other) // Exchanges the two list headers by value; no node is touched.
+{
+    A temp = *self;
+    *self = *other;
+    *other = temp;
+}
+
+static inline A
+JOIN(A, init)(void) // Returns an empty list with its element callbacks installed.
+{
+    static A zero; // Zero-initialised template: NULL head/tail/callbacks, size 0.
+    A self = zero;
+#ifdef P
+#undef P // Consumed here, so P must be defined again before the next instantiation that wants it.
+    self.copy = JOIN(A, implicit_copy); // Plain-assignment copy; free stays NULL.
+#else
+    self.free = JOIN(T, free); // Without P, free/copy functions named through JOIN(T, ...) must already be declared.
+    self.copy = JOIN(T, copy);
+#endif
+    return self;
+}
+
+static inline B*
+JOIN(B, init)(T value) // Allocates an unlinked node holding `value`.
+{
+    B* self = (B*) malloc(sizeof(B)); // NOTE(review): the result is not checked; a failed malloc is dereferenced on the next line.
+    self->prev = self->next = NULL;
+    self->value = value;
+    return self;
+}
+
+static inline void
+JOIN(A, disconnect)(A* self, B* node) // Unlinks `node` from the list without freeing it.
+{
+    if(node == self->tail) self->tail = self->tail->prev;
+    if(node == self->head) self->head = self->head->next;
+    if(node->prev) node->prev->next = node->next;
+    if(node->next) node->next->prev = node->prev;
+    node->prev = node->next = NULL; // The detached node keeps no links into the list.
+    self->size -= 1;
+}
+
+static inline void
+JOIN(A, connect)(A* self, B* position, B* node, int before) // Links `node` in front of `position` when `before` is non-zero, behind it otherwise.
+{
+    if(JOIN(A, empty)(self)) // First node: `position` is not used.
+        self->head = self->tail = node;
+    else
+    if(before)
+    {
+        node->next = position; // `position` is dereferenced below, so it must be a real node here.
+        node->prev = position->prev;
+        if(position->prev)
+            position->prev->next = node;
+        position->prev = node;
+        if(position == self->head)
+            self->head = node;
+    }
+    else
+    {
+        node->prev = position;
+        node->next = position->next;
+        if(position->next)
+            position->next->prev = node;
+        position->next = node;
+        if(position == self->tail)
+            self->tail = node;
+    }
+    self->size += 1;
+}
+
+static inline void
+JOIN(A, push_back)(A* self, T value) // Appends `value` in a new node after the current tail.
+{
+    B* node = JOIN(B, init)(value);
+    JOIN(A, connect)(self, self->tail, node, 0);
+}
+
+static inline void
+JOIN(A, push_front)(A* self, T value) // Prepends `value` in a new node before the current head.
+{
+    B* node = JOIN(B, init)(value);
+    JOIN(A, connect)(self, self->head, node, 1);
+}
+
+static inline void
+JOIN(A, transfer)(A* self, A* other, B* position, B* node, int before) // Moves `node` out of `other` and links it into `self` next to `position`; nothing is allocated or freed.
+{
+    JOIN(A, disconnect)(other, node);
+    JOIN(A, connect)(self, position, node, before);
+}
+
+static inline void
+JOIN(A, erase)(A* self, B* node) // Unlinks `node`, destroys its value when a free callback is set, and frees the node.
+{
+    JOIN(A, disconnect)(self, node);
+    if(self->free)
+        self->free(&node->value);
+    free(node);
+}
+
+static inline void
+JOIN(A, pop_back)(A* self) // Erases the tail node; erase() dereferences it, so the list must not be empty.
+{
+    JOIN(A, erase)(self, self->tail);
+}
+
+static inline void
+JOIN(A, pop_front)(A* self) // Erases the head node; erase() dereferences it, so the list must not be empty.
+{
+    JOIN(A, erase)(self, self->head);
+}
+
+static inline void // Inserts `value` in a new node before `position`; a NULL position (what end() returns) appends at the tail.
+JOIN(A, insert)(A* self, B* position, T value)
+{
+    B* node = JOIN(B, init)(value);
+    JOIN(A, connect)(self, position ? position : self->tail, node, position != NULL); // connect() dereferences its anchor on a non-empty list, so map end to "after tail".
+}
+
+static inline void
+JOIN(A, clear)(A* self) // Erases every node, from the tail backwards, until the list is empty.
+{
+    while(!JOIN(A, empty)(self))
+        JOIN(A, pop_back)(self);
+}
+
+static inline void
+JOIN(A, free)(A* self) // Erases every node and resets *self to a fresh init() value.
+{
+    JOIN(A, clear)(self);
+    *self = JOIN(A, init)();
+}
+
+static inline void // Shrinks or grows the list to `size` elements; new elements are copies of `value`, which is then destroyed.
+JOIN(A, resize)(A* self, size_t size, T value)
+{
+    while(size != self->size)
+        if(size < self->size)
+            JOIN(A, pop_back)(self);
+        else
+            JOIN(A, push_back)(self, self->copy(&value));
+    if(self->free) // `value` only served as the prototype for the copies above.
+        self->free(&value);
+}
+
+static inline A
+JOIN(A, copy)(A* self) // Returns a new list holding a copy (via self->copy) of every element, in order.
+{
+    A other = JOIN(A, init)();
+    for(B* node = self->head; node; node = node->next)
+        JOIN(A, push_back)(&other, self->copy(&node->value));
+    return other;
+}
+
+static inline void // Makes the list hold exactly `size` elements, each a copy of `value`; `value` itself is destroyed at the end.
+JOIN(A, assign)(A* self, size_t size, T value)
+{
+    // resize() destroys the value it is handed, so pass it a copy and keep
+    // `value` alive for the overwrite pass below.
+    JOIN(A, resize)(self, size, self->copy(&value));
+    foreach(A, self, it)
+    {
+        if(self->free)
+            self->free(it.ref);
+        *it.ref = self->copy(&value);
+    }
+    if(self->free)
+        self->free(&value);
+}
+
+static inline void
+JOIN(A, reverse)(A* self) // Reverses the list in place by swapping every node's prev/next and then head/tail.
+{
+    foreach(A, self, it)
+    {
+        B* next = it.node->next;
+        B* prev = it.node->prev;
+        it.node->prev = next; // The iterator advances through its own cached `next`, so flipping the links here is safe.
+        it.node->next = prev;
+    }
+    B* tail = self->tail;
+    B* head = self->head;
+    self->tail = head;
+    self->head = tail;
+}
+
+static inline size_t
+JOIN(A, remove_if)(A* self, int _equal(T*)) // Erases every element for which the predicate returns non-zero; returns the number erased.
+{
+    size_t erases = 0;
+    foreach(A, self, it)
+        if(_equal(it.ref))
+        {
+            JOIN(A, erase)(self, it.node); // The iterator already cached the successor, so freeing the current node is fine.
+            erases += 1;
+        }
+    return erases;
+}
+
+static inline void // Moves every node of `other` into `self` in front of `position`, keeping their order; NULL (end) appends.
+JOIN(A, splice)(A* self, B* position, A* other)
+{
+    if(self->size == 0 && position == NULL)
+        JOIN(A, swap)(self, other);
+    else
+        foreach(A, other, it) // A NULL position has no node to link in front of: attach after the current tail instead.
+            JOIN(A, transfer)(self, other, position ? position : self->tail, it.node, position != NULL);
+}
+
+static inline void
+JOIN(A, merge)(A* self, A* other, int _compare(T*, T*)) // Moves all nodes of `other` into `self`, interleaving them by _compare; `other` ends up empty.
+{
+    if(JOIN(A, empty)(self))
+        JOIN(A, swap)(self, other);
+    else
+    {
+        for(B* node = self->head; node; node = node->next)
+            while(!JOIN(A, empty)(other) && _compare(&node->value, &other->head->value)) // While _compare(node, other's head) holds, move that head in front of node.
+                JOIN(A, transfer)(self, other, node, other->head, 1);
+        // Remainder.
+        while(!JOIN(A, empty)(other))
+            JOIN(A, transfer)(self, other, self->tail, other->head, 0);
+    }
+}
+
+static inline void
+JOIN(A, sort)(A* self, int _compare(T*, T*)) // Sorts the list by repeatedly merging single nodes into an array of temporary lists; only links move.
+{
+    if(self->size > 1)
+    {
+        A carry = JOIN(A, init)(); // Receives one node from self per pass.
+        A temp[64]; // Bins of already-merged runs.
+        for(size_t i = 0; i < len(temp); i++)
+            temp[i] = JOIN(A, init)();
+        A* fill = temp; // One past the highest bin in use.
+        A* counter = NULL;
+        do
+        {
+            JOIN(A, transfer)(&carry, self, carry.head, self->head, 1);
+            for(counter = temp; counter != fill && !JOIN(A, empty)(counter); counter++) // Merge carry upward through every occupied bin.
+            {
+                JOIN(A, merge)(counter, &carry, _compare);
+                JOIN(A, swap)(&carry, counter);
+            }
+            JOIN(A, swap)(&carry, counter); // Park the merged run in the first free bin.
+            if(counter == fill)
+                fill++;
+        }
+        while(!JOIN(A, empty)(self));
+        for(counter = temp + 1; counter != fill; counter++) // Fold all bins into the highest one.
+            JOIN(A, merge)(counter, counter - 1, _compare);
+        JOIN(A, swap)(self, fill - 1);
+    }
+}
+
+static inline void
+JOIN(A, unique)(A* self, int _equal(T*, T*)) // Erases each element that _equal reports equal to its immediate successor.
+{
+    foreach(A, self, it)
+        if(it.next && _equal(it.ref, &it.next->value))
+            JOIN(A, erase)(self, it.node);
+}
+
+static inline B*
+JOIN(A, find)(A* self, T key, int _equal(T*, T*)) // Returns the first node whose value satisfies _equal(value, &key), or NULL.
+{
+    foreach(A, self, it)
+        if(_equal(it.ref, &key))
+            return it.node;
+    return NULL;
+}
+
+#undef T // Remove the template parameters so the header can be included again for another type.
+#undef A
+#undef B
+#undef Z
diff --git a/vendor/ctl/pqu.h b/vendor/ctl/pqu.h
new file mode 100644
index 0000000..c5f6ca1
--- /dev/null
+++ b/vendor/ctl/pqu.h
@@ -0,0 +1,139 @@
+//
+// Priority Queue
+//
+
+#ifndef T
+#error "Template type T undefined for <pqu.h>"
+#endif
+
+#define front         top // front is spelled top for the duration of this header.
+#define at            __AT // The renames from here to remove_if give these operations __-prefixed names — presumably to keep them out of the priority-queue API.
+#define back          __BACK
+#define begin         __BEGIN
+#define end           __END
+#define set           __SET
+#define pop_back      __POP_BACK
+#define wipe          __WIPE
+#define clear         __CLEAR
+#define fit           __FIT
+#define reserve       __RESERVE
+#define push_back     __PUSH_BACK
+#define resize        __RESIZE
+#define assign        __ASSIGN
+#define shrink_to_fit __SHRINK_TO_FIT
+#define data          __DATA
+#define insert        __INSERT
+#define erase         __ERASE
+#define sort          __SORT
+#define step          __STEP
+#define range         __RANGE
+#define each          __EACH
+#define remove_if     __REMOVE_IF
+
+#define vec pqu // Presumably makes <vec.h> emit pqu-prefixed names — vec.h is outside this chunk.
+#define HOLD // Presumably tells <vec.h> to leave T defined; T is undefined at the end of this header instead.
+#define COMPARE // Presumably adds the compare member that init() below assigns — confirm in vec.h.
+#define init __INIT // The init generated by <vec.h> is named __INIT; init() below wraps it.
+#include <vec.h>
+#undef init
+#undef vec
+
+#define A JOIN(pqu, T)
+
+static inline A
+JOIN(A, init)(int _compare(T*, T*)) // Returns an empty priority queue ordered by _compare.
+{
+    A self = JOIN(A, __INIT)();
+    self.compare = _compare; // up()/down() treat a non-zero compare(x, y) as "x goes above y".
+    return self;
+}
+
+static inline void // Sifts the element at index n towards the root while self->compare ranks it above its parent.
+JOIN(A, up)(A* self, size_t n)
+{
+    // Iterative climb: one level per pass, stopping at the root or at the first parent that stays on top.
+    while(n > 0)
+    {
+        size_t parent = (n - 1) / 2;
+        T* child = &self->value[n];
+        T* above = &self->value[parent];
+        if(!self->compare(child, above))
+            break;
+        SWAP(T, child, above);
+        n = parent;
+    }
+}
+
+static inline void
+JOIN(A, down)(A* self, size_t n)
+{
+    // Sifts the element at index n down until neither child outranks it.
+    //
+    // Parameters:
+    //   self - heap stored in self->value[0 .. self->size).
+    //   n    - index of the element to sift down.
+    //
+    // self->compare(x, y) != 0 means x belongs above y.
+    //
+    // A node can have no children, only a left child, or both. The
+    // left-only case (l is the last slot) must still be compared with its
+    // parent: skipping it leaves the heap out of order, and a later pop()
+    // can then surface the wrong top element.
+    size_t l = 2 * n + 1;
+    size_t r = 2 * n + 2;
+    if(l < self->size)
+    {
+        size_t index = l; // Child that should sit higher; l when r is absent.
+        if(r < self->size && self->compare(&self->value[r], &self->value[l]))
+            index = r;
+        T* x = &self->value[index];
+        T* y = &self->value[n];
+        if(self->compare(x, y))
+        {
+            SWAP(T, x, y);
+            JOIN(A, down)(self, index);
+        }
+    }
+}
+
+static inline void
+JOIN(A, push)(A* self, T value) // Adds `value` at the end of the array, then sifts it up to its place.
+{
+    JOIN(A, push_back)(self, value);
+    JOIN(A, up)(self, self->size - 1);
+}
+
+static inline void
+JOIN(A, pop)(A* self) // Removes the top element: swap it with the last one, drop the last slot, then sift the new root down.
+{
+    SWAP(T, JOIN(A, front)(self), JOIN(A, back)(self));
+    JOIN(A, pop_back)(self);
+    JOIN(A, down)(self, 0);
+}
+
+#undef front // Remove every rename installed at the top of this header so none of them leaks into the including file.
+#undef at
+#undef back
+#undef begin
+#undef end
+#undef set
+#undef pop_back
+#undef wipe
+#undef clear
+#undef fit
+#undef reserve
+#undef push_back
+#undef resize
+#undef assign
+#undef shrink_to_fit
+#undef data
+#undef insert
+#undef erase
+#undef sort
+#undef step
+#undef range
+#undef each
+#undef remove_if
+
+#undef T // See HOLD.
+#undef A
diff --git a/vendor/ctl/que.h b/vendor/ctl/que.h
new file mode 100644
index 0000000..c013c06
--- /dev/null
+++ b/vendor/ctl/que.h
@@ -0,0 +1,49 @@
+//
+// Queue
+//
+
+#ifndef T
+#error "Template type T undefined for <que.h>"
+#endif
+
+#define push_back   push // Queue vocabulary: push_back is spelled push ...
+#define pop_front   pop // ... and pop_front is spelled pop.
+#define at          __AT // The renames from here to remove_if give these operations __-prefixed names — presumably to keep them out of the queue API.
+#define begin       __BEGIN
+#define end         __END
+#define push_front  __PUSH_FRONT
+#define pop_back    __PUSH_BACK // NOTE(review): pop_back maps to a __PUSH_BACK spelling; unambiguous here because push_back is renamed to push, but the name is misleading.
+#define erase       __ERASE
+#define insert      __INSERT
+#define resize      __RESIZE
+#define assign      __ASSIGN
+#define clear       __CLEAR
+#define ranged_sort __RANGED_SORT
+#define sort        __SORT
+#define range       __RANGE
+#define each        __each // NOTE(review): lower-case, unlike the other renames in this list.
+#define step        __STEP
+#define remove_if   __REMOVE_IF
+
+#define deq que // Presumably makes <deq.h> emit que-prefixed names — the naming macro sits above this chunk.
+#include <deq.h>
+#undef deq
+
+#undef push_back // Remove every rename so none leaks into the including file.
+#undef pop_front
+#undef at
+#undef begin
+#undef end
+#undef push_front
+#undef pop_back
+#undef erase
+#undef insert
+#undef resize
+#undef assign
+#undef clear
+#undef ranged_sort
+#undef sort
+#undef range
+#undef each
+#undef step
+#undef remove_if
diff --git a/vendor/ctl/set.h b/vendor/ctl/set.h
new file mode 100644
index 0000000..8a0977a
--- /dev/null
+++ b/vendor/ctl/set.h
@@ -0,0 +1,761 @@
+//
+// Set
+//
+
+#ifndef T
+#error "Template type T undefined for <set.h>"
+#endif
+
+#include <ctl.h>
+
+#define A JOIN(set, T) // Name of the set type for this T.
+#define B JOIN(A, node) // Name of the tree node type.
+#define Z JOIN(A, it) // Name of the iterator type.
+
+typedef struct B
+{
+    struct B* l; // Left child.
+    struct B* r; // Right child.
+    struct B* p; // Parent; NULL for the root.
+    T key; // Element stored in this node.
+    int color; // Red = 0, Black = 1
+}
+B;
+
+typedef struct A
+{
+    B* root; // Root of the tree; NULL when empty.
+    int (*compare)(T*, T*); // Three-way ordering: find() reads 0 as equal and negative as "go left".
+    void (*free)(T*); // Key destructor; stays NULL when init() is compiled with P defined.
+    T (*copy)(T*); // Key copier.
+    size_t size; // Number of keys stored.
+}
+A;
+
+typedef struct Z
+{
+    void (*step)(struct Z*); // Advances the iterator; invoked as it.step(&it).
+    B* end; // Node at which iteration stops (exclusive).
+    B* node; // Current node.
+    T* ref; // Key of the current node.
+    B* next; // Successor of the current node, cached ahead of time.
+    int done; // Set to 1 once the range is exhausted.
+}
+Z;
+
+static inline B*
+JOIN(A, begin)(A* self) // Returns the root; range() walks down to the smallest key itself.
+{
+    return self->root;
+}
+
+static inline B*
+JOIN(A, end)(A* self) // End marker for iteration: always NULL.
+{
+    (void) self; // Unused.
+    return NULL;
+}
+
+static inline B*
+JOIN(B, min)(B* self) // Leftmost node of the subtree rooted at `self`; `self` must not be NULL.
+{
+    while(self->l)
+        self = self->l;
+    return self;
+}
+
+static inline B*
+JOIN(B, max)(B* self) // Rightmost node of the subtree rooted at `self`; `self` must not be NULL.
+{
+    while(self->r)
+        self = self->r;
+    return self;
+}
+
+static inline B*
+JOIN(B, next)(B* self) // In-order successor of `self`, or NULL when `self` is the last node.
+{
+    if(self->r)
+    {
+        self = self->r; // Successor is the leftmost node of the right subtree.
+        while(self->l)
+            self = self->l;
+    }
+    else
+    {
+        B* parent = self->p;
+        while(parent && self == parent->r) // Climb while we arrive from a right child.
+        {
+            self = parent;
+            parent = parent->p;
+        }
+        self = parent; // First ancestor reached from its left side, or NULL past the root.
+    }
+    return self;
+}
+
+static inline void
+JOIN(Z, step)(Z* self) // Advances to the cached successor, or marks the iterator done when that successor is the end node.
+{
+    if(self->next == self->end)
+        self->done = 1;
+    else
+    {
+        self->node = self->next;
+        self->ref = &self->node->key;
+        self->next = JOIN(B, next)(self->node); // Cache the following node before the caller touches the current one.
+    }
+}
+
+static inline Z
+JOIN(Z, range)(A* container, B* begin, B* end) // Builds an in-order iterator starting at the smallest key under `begin` and stopping before `end`.
+{
+    (void) container; // Unused.
+    static Z zero; // Static storage is zero-initialised; used as an all-zero template.
+    Z self = zero;
+    if(begin)
+    {
+        self.step = JOIN(Z, step);
+        self.node = JOIN(B, min)(begin);
+        self.ref = &self.node->key;
+        self.next = JOIN(B, next)(self.node);
+        self.end = end;
+    }
+    else
+        self.done = 1; // NULL begin: nothing to visit.
+    return self;
+}
+
+static inline int
+JOIN(A, empty)(A* self) // Non-zero when the set holds no keys.
+{
+    return self->size == 0;
+}
+
+static inline Z // Iterator over the whole set; it is already done when the set is empty.
+JOIN(Z, each)(A* a)
+{
+    if(JOIN(A, empty)(a))
+        return JOIN(Z, range)(a, NULL, NULL);
+    return JOIN(Z, range)(a, JOIN(A, begin)(a), JOIN(A, end)(a));
+}
+
+static inline T
+JOIN(A, implicit_copy)(T* self) // Copy callback that copies by plain assignment; installed by init() when P is defined.
+{
+    return *self;
+}
+
+static inline int
+JOIN(A, equal)(A* self, A* other, int _equal(T*, T*)) // Returns 1 when both sets have equal size and _equal holds for every pair taken in iteration order, otherwise 0.
+{
+    if(self->size != other->size)
+        return 0;
+    Z a = JOIN(Z, each)(self);
+    Z b = JOIN(Z, each)(other);
+    while(!a.done && !b.done) // Walk both sets in lockstep.
+    {
+        if(!_equal(a.ref, b.ref))
+            return 0;
+        a.step(&a);
+        b.step(&b);
+    }
+    return 1;
+}
+
+static inline void
+JOIN(A, swap)(A* self, A* other) // Exchanges the two set headers by value; no node is touched.
+{
+    A temp = *self;
+    *self = *other;
+    *other = temp;
+}
+
+static inline A
+JOIN(A, init)(int _compare(T*, T*)) // Returns an empty set ordered by _compare, with its key callbacks installed.
+{
+    static A zero; // Zero-initialised template: NULL root/callbacks, size 0.
+    A self = zero;
+    self.compare = _compare;
+#ifdef P
+#undef P // Consumed here, so P must be defined again before the next instantiation that wants it.
+    self.copy = JOIN(A, implicit_copy); // Plain-assignment copy; free stays NULL.
+#else
+    self.free = JOIN(T, free); // Without P, free/copy functions named through JOIN(T, ...) must already be declared.
+    self.copy = JOIN(T, copy);
+#endif
+    return self;
+}
+
+static inline void
+JOIN(A, free_node)(A* self, B* node) // Destroys the node's key when a free callback is set, then frees the node; does not unlink it.
+{
+    if(self->free)
+        self->free(&node->key);
+    free(node);
+}
+
+static inline int
+JOIN(B, color)(B* self) // Color of the node; a NULL node (leaf) counts as black (1).
+{
+    return self ? self->color : 1;
+}
+
+static inline int
+JOIN(B, is_blk)(B* self) // Non-zero when the node is black; true for NULL.
+{
+    return JOIN(B, color)(self) == 1;
+}
+
+static inline int
+JOIN(B, is_red)(B* self) // Non-zero when the node is red; false for NULL.
+{
+    return JOIN(B, color)(self) == 0;
+}
+
+static inline B*
+JOIN(B, grandfather)(B* self) // Parent of the parent; `self` must have a parent.
+{
+    return self->p->p;
+}
+
+static inline B*
+JOIN(B, sibling)(B* self) // The parent's other child (may be NULL); `self` must have a parent.
+{
+    if(self == self->p->l)
+        return self->p->r;
+    else
+        return self->p->l;
+}
+
+static inline B*
+JOIN(B, uncle)(B* self) // Sibling of the parent (may be NULL); `self` must have a grandparent.
+{
+    return JOIN(B, sibling)(self->p);
+}
+
+static inline B*
+JOIN(B, init)(T key, int color) // Allocates an unlinked node holding `key` with the given color.
+{
+    B* self = (B*) malloc(sizeof(B)); // NOTE(review): the result is not checked; a failed malloc is dereferenced on the next line.
+    self->key = key;
+    self->color = color;
+    self->l = self->r = self->p = NULL;
+    return self;
+}
+
+static inline B*
+JOIN(A, lower_bound)(A* self, T key) // Returns the first node in order with compare(&key, &node->key) <= 0, or NULL when there is none.
+{
+    B* node = self->root;
+    B* result = NULL;
+    while(node)
+    {
+        int diff = self->compare(&key, &node->key);
+        if(diff <= 0)
+        {
+            result = node; // Candidate; keep looking left for an earlier one.
+            node = node->l;
+        }
+        else
+            node = node->r;
+    }
+    return result;
+}
+
+static inline B*
+JOIN(A, upper_bound)(A* self, T key) // Returns the first node in order with compare(&key, &node->key) < 0, or NULL when there is none.
+{
+    B* node = self->root;
+    B* result = NULL;
+    while(node)
+    {
+        int diff = self->compare(&key, &node->key);
+        if(diff < 0)
+        {
+            result = node; // Candidate; keep looking left for an earlier one.
+            node = node->l;
+        }
+        else
+            node = node->r;
+    }
+    return result;
+}
+
+static inline B*
+JOIN(A, find)(A* self, T key) // Returns the node whose key compares equal (compare == 0) to `key`, or NULL.
+{
+    B* node = self->root;
+    while(node)
+    {
+        int diff = self->compare(&key, &node->key);
+        if(diff == 0)
+            return node;
+        else
+        if(diff < 0)
+            node = node->l;
+        else
+            node = node->r;
+    }
+    return NULL;
+}
+
+static inline int // Returns 1 when `key` is present and 0 otherwise.
+JOIN(A, count)(A* self, T key)
+{
+    return JOIN(A, find)(self, key) != NULL;
+}
+
+static inline void
+JOIN(B, replace)(A* self, B* a, B* b) // Hangs `b` (may be NULL) where `a` hangs: under a's parent, or as the root. a's own links are left as they are.
+{
+    if(a->p)
+    {
+        if(a == a->p->l)
+            a->p->l = b;
+        else
+            a->p->r = b;
+    }
+    else
+        self->root = b;
+    if(b)
+        b->p = a->p;
+}
+
+#ifdef USE_INTERNAL_VERIFY
+
+    #include <assert.h>
+
+    static inline void
+    JOIN(B, verify_property_1)(B* self) // Asserts, for every node of the subtree, that its color is red or black.
+    {
+        assert(JOIN(B, is_red)(self) || JOIN(B, is_blk)(self));
+        if(self)
+        {
+            JOIN(B, verify_property_1)(self->l);
+            JOIN(B, verify_property_1)(self->r);
+        }
+    }
+
+    static inline void
+    JOIN(B, verify_property_2)(B* self) // Asserts that the given node (the root, as called from verify) is black.
+    {
+        assert(JOIN(B, is_blk)(self));
+    }
+
+    static inline void
+    JOIN(B, verify_property_4)(B* self) // Asserts that every red node has black children and a black parent.
+    {
+        if(JOIN(B, is_red)(self))
+        {
+            assert(JOIN(B, is_blk)(self->l));
+            assert(JOIN(B, is_blk)(self->r));
+            assert(JOIN(B, is_blk)(self->p));
+        }
+        if(self)
+        {
+            JOIN(B, verify_property_4)(self->l);
+            JOIN(B, verify_property_4)(self->r);
+        }
+    }
+
+    static inline void
+    JOIN(B, count_blk)(B* self, int nodes, int* in_path) // Counts black nodes down each path; asserts every path to a NULL leaf has the same count.
+    {
+        if(JOIN(B, is_blk)(self))
+            nodes += 1;
+        if(self)
+        {
+            JOIN(B, count_blk)(self->l, nodes, in_path);
+            JOIN(B, count_blk)(self->r, nodes, in_path);
+        }
+        else
+        {
+            if(*in_path == -1) // First leaf reached: record its count as the reference.
+                *in_path = nodes;
+            else
+                assert(nodes == *in_path);
+        }
+    }
+
+    static inline void
+    JOIN(B, verify_property_5)(B* self) // Runs count_blk over the subtree with no reference count yet (-1).
+    {
+        int in_path = -1;
+        JOIN(B, count_blk)(self, 0, &in_path);
+    }
+
+    static inline void
+    JOIN(A, verify)(A* self)
+    {
+        JOIN(B, verify_property_1)(self->root); // Property 1: Each node is either red or black.
+        JOIN(B, verify_property_2)(self->root); // Property 2: The root node is black.
+        /* Implicit */                          // Property 3: Leaves are colored black
+        JOIN(B, verify_property_4)(self->root); // Property 4: Every red node has two black children.
+        JOIN(B, verify_property_5)(self->root); // Property 5: All paths from a node have the same number of black nodes.
+    }
+
+#endif
+
+static inline void
+JOIN(A, rotate_l)(A* self, B* node) // Left rotation: node's right child takes node's place and node becomes its left child. node->r must not be NULL.
+{
+    B* r = node->r;
+    JOIN(B, replace)(self, node, r);
+    node->r = r->l; // r's former left subtree moves under node.
+    if(r->l)
+        r->l->p = node;
+    r->l = node;
+    node->p = r;
+}
+
+static inline void
+JOIN(A, rotate_r)(A* self, B* node) // Right rotation: node's left child takes node's place and node becomes its right child. node->l must not be NULL.
+{
+    B* l = node->l;
+    JOIN(B, replace)(self, node, l);
+    node->l = l->r; // l's former right subtree moves under node.
+    if(l->r)
+        l->r->p = node;
+    l->r = node;
+    node->p = l;
+}
+
+static inline void // Forward declarations: insert() calls insert_1 before the rebalancing steps are defined, and the steps call each other.
+JOIN(A, insert_1)(A*, B*),
+JOIN(A, insert_2)(A*, B*),
+JOIN(A, insert_3)(A*, B*),
+JOIN(A, insert_4)(A*, B*),
+JOIN(A, insert_5)(A*, B*);
+
+static inline B*
+JOIN(A, insert)(A* self, T key) // Inserts `key` and returns its node; if an equal key exists, the new key is destroyed and the existing node is returned.
+{
+    B* insert = JOIN(B, init)(key, 0); // New nodes start red.
+    if(self->root)
+    {
+        B* node = self->root;
+        while(1)
+        {
+            int diff = self->compare(&key, &node->key);
+            if(diff == 0)
+            {
+                JOIN(A, free_node)(self, insert); // Duplicate: discard the freshly built node together with its key.
+                return node;
+            }
+            else
+            if(diff < 0)
+            {
+                if(node->l)
+                    node = node->l;
+                else
+                {
+                    node->l = insert;
+                    break;
+                }
+            }
+            else
+            {
+                if(node->r)
+                    node = node->r;
+                else
+                {
+                    node->r = insert;
+                    break;
+                }
+            }
+        }
+        insert->p = node;
+    }
+    else
+        self->root = insert;
+    JOIN(A, insert_1)(self, insert); // Restore the red-black properties starting at the new node.
+    self->size += 1;
+#ifdef USE_INTERNAL_VERIFY
+    JOIN(A, verify)(self);
+#endif
+    return insert;
+}
+
+static inline void
+JOIN(A, insert_1)(A* self, B* node) // Rebalance step 1: a node without a parent is the root and is painted black; otherwise continue with step 2.
+{
+    if(node->p)
+        JOIN(A, insert_2)(self, node);
+    else
+        node->color = 1;
+}
+
+static inline void
+JOIN(A, insert_2)(A* self, B* node) // Rebalance step 2: a black parent needs no repair; otherwise continue with step 3.
+{
+    if(JOIN(B, is_blk)(node->p))
+        return;
+    else
+       JOIN(A, insert_3)(self, node);
+}
+
+static inline void
+JOIN(A, insert_3)(A* self, B* node) // Rebalance step 3: with a red uncle, recolor and restart from the grandparent; otherwise continue with step 4.
+{
+    if(JOIN(B, is_red)(JOIN(B, uncle)(node)))
+    {
+        node->p->color = 1;
+        JOIN(B, uncle)(node)->color = 1;
+        JOIN(B, grandfather)(node)->color = 0;
+        JOIN(A, insert_1)(self, JOIN(B, grandfather)(node)); // The grandparent is red now and may clash with its own parent.
+    }
+    else
+        JOIN(A, insert_4)(self, node);
+}
+
+static inline void
+JOIN(A, insert_4)(A* self, B* node) // Rebalance step 4: if node is the inner grandchild, rotate at the parent so the red pair lies on the outside, then run step 5.
+{
+    if(node == node->p->r && node->p == JOIN(B, grandfather)(node)->l)
+    {
+        JOIN(A, rotate_l)(self, node->p);
+        node = node->l; // Continue with the former parent, which is now the lower node.
+    }
+    else
+    if(node == node->p->l && node->p == JOIN(B, grandfather)(node)->r)
+    {
+        JOIN(A, rotate_r)(self, node->p);
+        node = node->r; // Continue with the former parent, which is now the lower node.
+    }
+    JOIN(A, insert_5)(self, node);
+}
+
+static inline void
+JOIN(A, insert_5)(A* self, B* node) // Rebalance step 5: paint the parent black and the grandparent red, then rotate the grandparent away from node's side.
+{
+    node->p->color = 1;
+    JOIN(B, grandfather)(node)->color = 0;
+    if(node == node->p->l && node->p == JOIN(B, grandfather)(node)->l)
+        JOIN(A, rotate_r)(self, JOIN(B, grandfather)(node));
+    else
+        JOIN(A, rotate_l)(self, JOIN(B, grandfather)(node));
+}
+
+static inline void // Forward declarations: erase_node() calls erase_1 before the rebalancing steps are defined, and the steps call each other.
+JOIN(A, erase_1)(A*, B*),
+JOIN(A, erase_2)(A*, B*),
+JOIN(A, erase_3)(A*, B*),
+JOIN(A, erase_4)(A*, B*),
+JOIN(A, erase_5)(A*, B*),
+JOIN(A, erase_6)(A*, B*);
+
+static inline void
+JOIN(A, erase_node)(A* self, B* node) // Removes the key held by `node` from the tree, rebalances, and frees one node.
+{
+    if(node->l && node->r)
+    {
+        B* pred = JOIN(B, max)(node->l); // Two children: trade keys with the in-order predecessor ...
+        SWAP(T, &node->key, &pred->key);
+        node = pred; // ... and delete that node instead (it has no right child); the node passed in stays in the tree.
+    }
+    B* child = node->r ? node->r : node->l;
+    if(JOIN(B, is_blk)(node))
+    {
+        node->color = JOIN(B, color)(child);
+        JOIN(A, erase_1)(self, node);
+    }
+    JOIN(B, replace)(self, node, child);
+    if(node->p == NULL && child) // The child became the root: the root is always black.
+        child->color = 1;
+    JOIN(A, free_node)(self, node);
+    self->size -= 1;
+#ifdef USE_INTERNAL_VERIFY
+    JOIN(A, verify)(self);
+#endif
+}
+
+static inline void
+JOIN(A, erase)(A* self, T key) // Removes the key equal to `key` if present; does nothing otherwise. `key` itself is not freed here.
+{
+    B* node = JOIN(A, find)(self, key);
+    if(node)
+        JOIN(A, erase_node)(self, node);
+}
+
+static inline void
+JOIN(A, erase_1)(A* self, B* node) // Erase rebalance step 1: nothing to repair at the root; otherwise continue with step 2.
+{
+    if(node->p)
+        JOIN(A, erase_2)(self, node);
+}
+
+static inline void
+JOIN(A, erase_2)(A* self, B* node) // Erase rebalance step 2: a red sibling is rotated above the parent so that node gets a black sibling; then step 3.
+{
+    if(JOIN(B, is_red)(JOIN(B, sibling)(node)))
+    {
+        node->p->color = 0;
+        JOIN(B, sibling)(node)->color = 1;
+        if(node == node->p->l)
+            JOIN(A, rotate_l)(self, node->p);
+        else
+            JOIN(A, rotate_r)(self, node->p);
+    }
+    JOIN(A, erase_3)(self, node);
+}
+
+static inline void
+JOIN(A, erase_3)(A* self, B* node) // Erase rebalance step 3: parent, sibling and the sibling's children all black: paint the sibling red and repair from the parent; otherwise step 4.
+{
+    if(JOIN(B, is_blk)(node->p)
+    && JOIN(B, is_blk)(JOIN(B, sibling)(node))
+    && JOIN(B, is_blk)(JOIN(B, sibling)(node)->l)
+    && JOIN(B, is_blk)(JOIN(B, sibling)(node)->r))
+    {
+        JOIN(B, sibling)(node)->color = 0;
+        JOIN(A, erase_1)(self, node->p);
+    }
+    else
+        JOIN(A, erase_4)(self, node);
+}
+
+static inline void
+JOIN(A, erase_4)(A* self, B* node) // Erase rebalance step 4: red parent with a black sibling that has black children: exchange their colors and stop; otherwise step 5.
+{
+    if(JOIN(B, is_red)(node->p)
+    && JOIN(B, is_blk)(JOIN(B, sibling)(node))
+    && JOIN(B, is_blk)(JOIN(B, sibling)(node)->l)
+    && JOIN(B, is_blk)(JOIN(B, sibling)(node)->r))
+    {
+        JOIN(B, sibling)(node)->color = 0;
+        node->p->color = 1;
+    }
+    else
+        JOIN(A, erase_5)(self, node);
+}
+
+static inline void
+JOIN(A, erase_5)(A* self, B* node) // Erase rebalance step 5: if the black sibling's only red child is the one nearer to node, rotate at the sibling so the red child ends up on the far side; then step 6.
+{
+    if(node == node->p->l
+    && JOIN(B, is_blk)(JOIN(B, sibling)(node))
+    && JOIN(B, is_red)(JOIN(B, sibling)(node)->l)
+    && JOIN(B, is_blk)(JOIN(B, sibling)(node)->r))
+    {
+        JOIN(B, sibling)(node)->color = 0;
+        JOIN(B, sibling)(node)->l->color = 1;
+        JOIN(A, rotate_r)(self, JOIN(B, sibling)(node));
+    }
+    else
+    if(node == node->p->r
+    && JOIN(B, is_blk)(JOIN(B, sibling)(node))
+    && JOIN(B, is_red)(JOIN(B, sibling)(node)->r)
+    && JOIN(B, is_blk)(JOIN(B, sibling)(node)->l))
+    {
+        JOIN(B, sibling)(node)->color = 0;
+        JOIN(B, sibling)(node)->r->color = 1;
+        JOIN(A, rotate_l)(self, JOIN(B, sibling)(node));
+    }
+    JOIN(A, erase_6)(self, node);
+}
+
+static inline void
+JOIN(A, erase_6)(A* self, B* node) // Erase rebalance step 6: the sibling takes the parent's color, the parent and the sibling's far child turn black, and the parent is rotated towards node.
+{
+    JOIN(B, sibling)(node)->color = JOIN(B, color)(node->p);
+    node->p->color = 1;
+    if(node == node->p->l)
+    {
+        JOIN(B, sibling)(node)->r->color = 1; // Dereferences the far child, so it must exist when this step runs.
+        JOIN(A, rotate_l)(self, node->p);
+    }
+    else
+    {
+        JOIN(B, sibling)(node)->l->color = 1; // Dereferences the far child, so it must exist when this step runs.
+        JOIN(A, rotate_r)(self, node->p);
+    }
+}
+
+static inline void
+JOIN(A, clear)(A* self) // Removes every key by repeatedly erasing the root's key until the set is empty.
+{
+    while(!JOIN(A, empty)(self))
+        JOIN(A, erase)(self, self->root->key);
+}
+
+static inline void
+JOIN(A, free)(A* self) // Removes every key and resets *self to a fresh init() value that keeps the same comparator.
+{
+    JOIN(A, clear)(self);
+    *self = JOIN(A, init)(self->compare);
+}
+
+static inline A
+JOIN(A, copy)(A* self) // Returns a new set with the same comparator holding a copy (via self->copy) of every key.
+{
+    Z it = JOIN(Z, each)(self);
+    A copy =  JOIN(A, init)(self->compare);
+    while(!it.done)
+    {
+        JOIN(A, insert)(&copy, self->copy(&it.node->key));
+        it.step(&it);
+    }
+    return copy;
+}
+
+static inline size_t
+JOIN(A, remove_if)(A* self, int _match(T*)) // Erases every key for which _match returns non-zero; returns the number erased.
+{
+    size_t erases = 0;
+    foreach(A, self, it)
+        if(_match(&it.node->key))
+        {
+            JOIN(A, erase_node)(self, it.node); // The iterator continues from its cached successor, computed before this call.
+            erases += 1;
+        }
+    return erases;
+}
+
+static inline A
+JOIN(A, intersection)(A* a, A* b) // Returns a new set (ordered by a's comparator) with copies of the keys of `a` that are also found in `b`.
+{
+    A self = JOIN(A, init)(a->compare);
+    foreach(A, a, i)
+        if(JOIN(A, find)(b, *i.ref))
+            JOIN(A, insert)(&self, self.copy(i.ref));
+    return self;
+}
+
+static inline A
+JOIN(A, union)(A* a, A* b) // Returns a new set (ordered by a's comparator) with copies of every key of `a` and `b`; insert() discards duplicates.
+{
+    A self = JOIN(A, init)(a->compare);
+    foreach(A, a, i) JOIN(A, insert)(&self, self.copy(i.ref));
+    foreach(A, b, i) JOIN(A, insert)(&self, self.copy(i.ref));
+    return self;
+}
+
+static inline A
+JOIN(A, difference)(A* a, A* b) // Returns a copy of `a` with every key that is also in `b` erased.
+{
+    A self = JOIN(A, copy)(a);
+    foreach(A, b, i)
+        JOIN(A, erase)(&self, *i.ref);
+    return self;
+}
+
+static inline A
+JOIN(A, symmetric_difference)(A* a, A* b) // Returns a new set with the keys found in exactly one of `a` and `b`: the union minus the intersection.
+{
+    A self = JOIN(A, union)(a, b);
+    A intersection = JOIN(A, intersection)(a, b);
+    foreach(A, &intersection, i)
+        JOIN(A, erase)(&self, *i.ref);
+    JOIN(A, free)(&intersection); // The temporary set is released before returning.
+    return self;
+}
+
+#undef T // Remove the template parameters so the header can be included again for another type.
+#undef A
+#undef B
+#undef Z
+
+#ifdef USE_INTERNAL_VERIFY
+#undef USE_INTERNAL_VERIFY // The verification switch covers one inclusion only.
+#endif
diff --git a/vendor/ctl/stk.h b/vendor/ctl/stk.h
new file mode 100644
index 0000000..aaf285f
--- /dev/null
+++ b/vendor/ctl/stk.h
@@ -0,0 +1,53 @@
+//
+// Stack
+//
+
+#ifndef T
+#error "Template type T undefined for <stk.h>"
+#endif
+
+#define push_back   push // Stack vocabulary: push_back is spelled push ...
+#define pop_back    pop // ... pop_back is spelled pop ...
+#define back        top // ... and back is spelled top.
+#define at          __AT // The renames from here to remove_if give these operations __-prefixed names — presumably to keep them out of the stack API.
+#define front       __FRONT
+#define begin       __BEGIN
+#define end         __END
+#define push_front  __PUSH_FRONT
+#define pop_front   __POP_FRONT
+#define erase       __ERASE
+#define insert      __INSERT
+#define resize      __RESIZE
+#define assign      __ASSIGN
+#define clear       __CLEAR
+#define ranged_sort __RANGED_SORT
+#define sort        __SORT
+#define range       __RANGE
+#define each        __EACH
+#define step        __STEP
+#define remove_if   __REMOVE_IF
+
+#define deq stk // Presumably makes <deq.h> emit stk-prefixed names — the naming macro sits above this chunk.
+#include <deq.h>
+#undef deq
+
+#undef push_back // Remove every rename so none leaks into the including file.
+#undef pop_back
+#undef back
+#undef at
+#undef front
+#undef begin
+#undef end
+#undef push_front
+#undef pop_front
+#undef erase
+#undef insert
+#undef resize
+#undef assign
+#undef clear
+#undef ranged_sort
+#undef sort
+#undef range
+#undef each
+#undef step
+#undef remove_if
diff --git a/vendor/ctl/str.h b/vendor/ctl/str.h
new file mode 100644
index 0000000..cdb3eab
--- /dev/null
+++ b/vendor/ctl/str.h
@@ -0,0 +1,195 @@
+//
+// String
+//
+
+#ifndef __STR__H__
+#define __STR__H__
+
+#ifdef T
+#error "Template type T defined for <str.h>"
+#endif
+
+#define vec_char str
+#define P
+#define T char
+#define str_init str___INIT
+#define str_equal str___EQUAL
+#define str_find str___FIND
+#define str_copy str___COPY
+#include <vec.h>
+#undef str_init
+#undef str_copy
+#undef str_equal
+#undef str_find
+#undef vec_char
+
+#include <stdint.h>
+#include <string.h>
+
+static inline str // Builds a new str holding the characters of the NUL-terminated c_str.
+str_init(const char* c_str)
+{
+    const size_t floor = 15; // smallest capacity reserved up front
+    const size_t length = strlen(c_str);
+    str self = str___INIT();
+    str_reserve(&self, length >= floor ? length : floor);
+    for(size_t i = 0; i < length; i++)
+        str_push_back(&self, c_str[i]);
+    return self;
+}
+
+static inline void // Appends the NUL-terminated string s to the end of self.
+str_append(str* self, const char* s)
+{
+    size_t start = self->size; // index where the appended text begins
+    size_t len = strlen(s);
+    str_resize(self, self->size + len, '\0'); // grow by len; the new slots are filled with '\0'
+    for(size_t i = 0; i < len; i++)
+        self->value[start + i] = s[i]; // overwrite the filler with the characters of s
+}
+
+static inline str // Returns a new str built from the C string stored in s.
+str_copy(str* s)
+{
+    str other = str_init("");
+    str_append(&other, s->value); // s->value is read as a NUL-terminated string
+    return other;
+}
+
+static inline void // Inserts the NUL-terminated string s into self in front of position index.
+str_insert_str(str* self, size_t index, const char* s)
+{
+    size_t start = self->size;
+    size_t len = strlen(s);
+    str_resize(self, self->size + len, '\0'); // make room for len more characters up front
+    self->size = start; // restore the old size; the str_insert calls below grow it again
+    while(len != 0)
+    {
+        len -= 1;
+        str_insert(self, index, s[len]); // last character first, so s ends up in order
+    }
+}
+
+static inline void // Replaces up to size characters of self, starting at index, with the string s.
+str_replace(str* self, size_t index, size_t size, const char* s)
+{
+    size_t tail = index < self->size ? self->size - index : 0; // characters from index to the end
+    if(size > tail)
+        size = tail; // clamp without forming index + size, which wraps for very large sizes
+    for(size_t i = 0; i < size; i++)
+        str_erase(self, index); // each erase shifts the rest down, so index stays put
+    str_insert_str(self, index, s);
+}
+
+static inline char* // Returns the character buffer of self, as given by str_data.
+str_c_str(str* self)
+{
+    return str_data(self);
+}
+
+static inline size_t // Index of the first occurrence of s in self, or SIZE_MAX if there is none.
+str_find(str* self, const char* s)
+{
+    char* c_str = self->value;
+    char* found = strstr(c_str, s);
+    if(found)
+        return found - c_str; // pointer difference is the offset of the match
+    return SIZE_MAX;
+}
+
+static inline int // Number of characters among the first self->size that are equal to c.
+str_count(str* self, char c)
+{
+    size_t count = 0;
+    for(size_t i = 0; i < self->size; i++)
+        if(self->value[i] == c)
+            count += 1;
+    return count; // the size_t count is converted to the int return type
+}
+
+static inline size_t // Index of the last occurrence of s in self, or SIZE_MAX if there is none.
+str_rfind(str* self, const char* s)
+{
+    char* base = self->value;
+    for(size_t i = self->size + 1; i-- != 0; ) // i runs from size down to 0 inside the body
+    {
+        char* hit = strstr(base + i, s); // first suffix, from the right, that contains s
+        if(hit)
+            return hit - base;
+    }
+    return SIZE_MAX;
+}
+
+static inline size_t // Index of the first character of self that appears in s, or SIZE_MAX.
+str_find_first_of(str* self, const char* s)
+{
+    for(size_t i = 0; i < self->size; i++)
+    for(const char* p = s; *p; p++) // try every character of the set s
+        if(self->value[i] == *p)
+            return i;
+    return SIZE_MAX;
+}
+
+static inline size_t // Index of the last character of self that appears in s, or SIZE_MAX.
+str_find_last_of(str* self, const char* s)
+{
+    for(size_t i = self->size - 1; i != SIZE_MAX; i--) // start on the last character; an empty str wraps to SIZE_MAX and skips the loop
+    for(const char* p = s; *p; p++) // try every character of the set s
+        if(self->value[i] == *p)
+            return i;
+    return SIZE_MAX;
+}
+
+static inline size_t // Index of the first character of self that does not appear in s, or SIZE_MAX.
+str_find_first_not_of(str* self, const char* s)
+{
+    for(size_t i = 0; i < self->size; i++)
+    {
+        size_t count = 0; // how many characters of s equal value[i]
+        for(const char* p = s; *p; p++)
+            if(self->value[i] == *p)
+                count += 1;
+        if(count == 0)
+            return i;
+    }
+    return SIZE_MAX;
+}
+
+static inline size_t // Index of the last character of self that does not appear in s, or SIZE_MAX.
+str_find_last_not_of(str* self, const char* s)
+{
+    for(size_t i = self->size - 1; i != SIZE_MAX; i--) // counts down; stops when i wraps past 0
+    {
+        size_t count = 0; // how many characters of s equal value[i]
+        for(const char* p = s; *p; p++)
+            if(self->value[i] == *p)
+                count += 1;
+        if(count == 0)
+            return i;
+    }
+    return SIZE_MAX;
+}
+
+static inline str // Returns a new str with up to size characters of self, starting at index.
+str_substr(str* self, size_t index, size_t size)
+{
+    size_t rest = index < self->size ? self->size - index : 0; // characters available from index
+    str substr = str_init("");
+    str_resize(&substr, size < rest ? size : rest, '\0'); // clamp so the copy never reads past the end of self
+    for(size_t i = 0; i < substr.size; i++) substr.value[i] = self->value[index + i];
+    return substr;
+}
+
+static inline int // strcmp of the buffer of self against the C string s.
+str_compare(str* self, const char* s)
+{
+    return strcmp(self->value, s);
+}
+
+static inline int // strcmp of the buffers of two str values.
+str_key_compare(str* self, str* s)
+{
+    return strcmp(self->value, s->value);
+}
+
+#endif
diff --git a/vendor/ctl/ust.h b/vendor/ctl/ust.h
new file mode 100644
index 0000000..895b43f
--- /dev/null
+++ b/vendor/ctl/ust.h
@@ -0,0 +1,455 @@
+//
+// Unordered Set
+//
+
+#ifndef T
+#error "Template type T undefined for <ust.h>"
+#endif
+
+#include <ctl.h>
+
+#define A JOIN(ust, T)
+#define B JOIN(A, node)
+#define Z JOIN(A, it)
+
+typedef struct B // One node of a bucket's singly linked chain.
+{
+    T key;
+    struct B* next; // next node in the same bucket; NULL at the end of the chain
+}
+B;
+
+typedef struct A // Unordered set: an array of bucket chains plus per-type callbacks.
+{
+    void (*free)(T*); // element destructor; left NULL by init when P is defined
+    T (*copy)(T*); // element copy function
+    size_t (*hash)(T*); // supplied to init; reduced modulo bucket_count to choose a bucket
+    int (*equal)(T*, T*); // supplied to init; non-zero is treated as "keys match"
+    B** bucket; // array of bucket_count chain heads
+    size_t size; // number of stored nodes
+    size_t bucket_count;
+}
+A;
+
+typedef struct Z // Iterator state for walking every node of a set.
+{
+    void (*step)(struct Z*); // advances to the next element or sets done
+    B* end; // stored by range; step does not read it
+    B* node; // current node
+    T* ref; // points at node->key
+    B* next; // node->next captured in advance, so the current node may be freed or relinked
+    A* container;
+    size_t index; // bucket index of the current node
+    int done; // non-zero once iteration has finished
+}
+Z;
+
+static inline B* // Head of the lowest-indexed non-empty bucket, or NULL when every bucket is empty.
+JOIN(A, begin)(A* self)
+{
+    for(size_t i = 0; i < self->bucket_count; i++)
+    {
+        B* node = self->bucket[i];
+        if(node)
+            return node;
+    }
+    return NULL;
+}
+
+static inline B* // End marker for iteration: always NULL.
+JOIN(A, end)(A* self)
+{
+    (void) self; // the parameter is unused
+    return NULL;
+}
+
+static inline size_t // Bucket index for value: its hash reduced modulo bucket_count.
+JOIN(Z, index)(A* self, T value)
+{
+    return self->hash(&value) % self->bucket_count; // bucket_count must be non-zero here
+}
+
+static inline void // Moves the iterator onto self->next, which must be non-NULL.
+JOIN(Z, update)(Z* self)
+{
+    self->node = self->next;
+    self->ref = &self->node->key;
+    self->next = self->node->next; // remember the successor of the new current node
+}
+
+static inline int // Moves to the head of the next non-empty bucket after index; returns 1 if found, else 0.
+JOIN(Z, scan)(Z* self)
+{
+    for(size_t i = self->index + 1; i < self->container->bucket_count; i++)
+    {
+        self->next = self->container->bucket[i];
+        if(self->next)
+        {
+            self->index = i;
+            JOIN(Z, update)(self); // make that chain head the current node
+            return 1;
+        }
+    }
+    return 0;
+}
+
+static inline void // Advances one element: next node in the chain, else next non-empty bucket, else done.
+JOIN(Z, step)(Z* self)
+{
+    if(self->next == JOIN(A, end)(self->container)) // current chain is exhausted
+    {
+        if(!JOIN(Z, scan)(self))
+            self->done = 1; // no later bucket holds a node
+    }
+    else
+        JOIN(Z, update)(self);
+}
+
+static inline Z // Builds an iterator positioned on begin; a NULL begin yields a finished iterator.
+JOIN(Z, range)(A* container, B* begin, B* end)
+{
+    static Z zero; // zero-initialized template
+    Z self = zero;
+    if(begin)
+    {
+        self.step = JOIN(Z, step);
+        self.node = begin;
+        self.ref = &self.node->key;
+        self.next = self.node->next;
+        self.end = end;
+        self.container = container;
+        self.index = JOIN(Z, index)(container, *self.ref); // bucket of begin, found by hashing its key
+    }
+    else
+        self.done = 1;
+    return self;
+}
+
+static inline B** // Address of the chain head of the bucket that value hashes to.
+JOIN(A, bucket)(A* self, T value)
+{
+    size_t index = JOIN(Z, index)(self, value);
+    return &self->bucket[index];
+}
+
+static inline int // Returns 1 when the set holds no elements, 0 otherwise.
+JOIN(A, empty)(A* self)
+{
+    return self->size == 0;
+}
+
+static inline Z // Iterator over the whole set; already finished when the set is empty.
+JOIN(Z, each)(A* a)
+{
+    return JOIN(A, empty)(a)
+         ? JOIN(Z, range)(a, NULL, NULL)
+         : JOIN(Z, range)(a, JOIN(A, begin)(a), JOIN(A, end)(a));
+}
+
+static inline T // Copy callback installed by init when P is defined: returns *self by value.
+JOIN(A, implicit_copy)(T* self)
+{
+    return *self;
+}
+
+static inline B* // Returns the node whose key equals value, or NULL if there is none.
+JOIN(A, find)(A* self, T value)
+{
+    if(!JOIN(A, empty)(self)) // also avoids hashing into a table with no buckets
+    {
+        B** bucket = JOIN(A, bucket)(self, value);
+        for(B* n = *bucket; n; n = n->next)
+            if(self->equal(&value, &n->key))
+                return n;
+    }
+    return NULL;
+}
+
+static inline int // Returns 1 when a and b hold the same elements, 0 otherwise.
+JOIN(A, equal)(A* a, A* b)
+{
+    // With equal sizes, a being a subset of b is enough to prove equality.
+    if(a->size != b->size)
+        return 0;
+    foreach(A, a, it) if(!JOIN(A, find)(b, *it.ref)) return 0;
+    return 1;
+}
+
+static inline void // Exchanges the complete contents of two sets by struct assignment.
+JOIN(A, swap)(A* self, A* other)
+{
+    A temp = *self;
+    *self = *other;
+    *other = temp;
+}
+
+static inline size_t // Maps number onto an entry of the prime table below.
+JOIN(A, closest_prime)(size_t number)
+{
+    static uint32_t primes[] = {
+        2, 3, 5, 7, 11,
+        13, 17, 19, 23, 29, 31,
+        37, 41, 43, 47, 53, 59,
+        61, 67, 71, 73, 79, 83,
+        89, 97, 103, 109, 113, 127,
+        137, 139, 149, 157, 167, 179,
+        193, 199, 211, 227, 241, 257,
+        277, 293, 313, 337, 359, 383,
+        409, 439, 467, 503, 541, 577,
+        619, 661, 709, 761, 823, 887,
+        953, 1031, 1109, 1193, 1289, 1381,
+        1493, 1613, 1741, 1879, 2029, 2179,
+        2357, 2549, 2753, 2971, 3209, 3469,
+        3739, 4027, 4349, 4703, 5087, 5503,
+        5953, 6427, 6949, 7517, 8123, 8783,
+        9497, 10273, 11113, 12011, 12983, 14033,
+        15173, 16411, 17749, 19183, 20753, 22447,
+        24281, 26267, 28411, 30727, 33223, 35933,
+        38873, 42043, 45481, 49201, 53201, 57557,
+        62233, 67307, 72817, 78779, 85229, 92203,
+        99733, 107897, 116731, 126271, 136607, 147793,
+        159871, 172933, 187091, 202409, 218971, 236897,
+        256279, 277261, 299951, 324503, 351061, 379787,
+        410857, 444487, 480881, 520241, 562841, 608903,
+        658753, 712697, 771049, 834181, 902483, 976369,
+        1056323, 1142821, 1236397, 1337629, 1447153, 1565659,
+        1693859, 1832561, 1982627, 2144977, 2320627, 2510653,
+        2716249, 2938679, 3179303, 3439651, 3721303, 4026031,
+        4355707, 4712381, 5098259, 5515729, 5967347, 6456007,
+        6984629, 7556579, 8175383, 8844859, 9569143, 10352717,
+        11200489, 12117689, 13109983, 14183539, 15345007, 16601593,
+        17961079, 19431899, 21023161, 22744717, 24607243, 26622317,
+        28802401, 31160981, 33712729, 36473443, 39460231, 42691603,
+        46187573, 49969847, 54061849, 58488943, 63278561, 68460391,
+        74066549, 80131819, 86693767, 93793069, 101473717, 109783337,
+        118773397, 128499677, 139022417, 150406843, 162723577, 176048909,
+        190465427, 206062531, 222936881, 241193053, 260944219, 282312799,
+        305431229, 330442829, 357502601, 386778277, 418451333, 452718089,
+        489790921, 529899637, 573292817, 620239453, 671030513, 725980837,
+        785430967, 849749479, 919334987, 994618837, 1076067617, 1164186217,
+        1259520799, 1362662261, 1474249943, 1594975441, 1725587117,
+    };
+    size_t min = primes[0];
+    if(number < min)
+        return min; // anything below the first entry maps to it
+    size_t size = len(primes);
+    for(size_t i = 0; i < size - 1; i++) // walk adjacent pairs (a, b) of the table
+    {
+        size_t a = primes[i + 0];
+        size_t b = primes[i + 1];
+        if(number >= a && number <= b)
+            return number == a ? a : b; // an exact table entry is kept, otherwise round up to b
+    }
+    return primes[size - 1]; // larger than every entry: the last entry is returned
+}
+
+static inline B* // Allocates a node that stores value and has no successor.
+JOIN(B, init)(T value)
+{
+    B* n = (B*) malloc(sizeof(B)); // the result is used without a NULL check
+    n->key = value;
+    n->next = NULL;
+    return n;
+}
+
+static inline void // Links n in as the new head of *bucket and counts it in self->size.
+JOIN(B, push)(A* self, B** bucket, B* n)
+{
+    n->next = *bucket; // the old head becomes the successor of n
+    self->size += 1;
+    *bucket = n;
+}
+
+static inline size_t // Number of nodes chained in the bucket at index.
+JOIN(A, bucket_size)(A* self, size_t index)
+{
+    size_t size = 0;
+    for(B* n = self->bucket[index]; n; n = n->next)
+        size += 1;
+    return size;
+}
+
+static inline float // Ratio of stored elements to buckets, computed in float.
+JOIN(A, load_factor)(A* self)
+{
+    return (float) self->size / (float) self->bucket_count;
+}
+
+static inline A // Returns an empty set that uses _hash and _equal; no bucket array is allocated.
+JOIN(A, init)(size_t _hash(T*), int _equal(T*, T*))
+{
+    static A zero; // zero-initialized template: NULL pointers and zero counts
+    A self = zero;
+    self.hash = _hash;
+    self.equal = _equal;
+#ifdef P
+#undef P
+    self.copy = JOIN(A, implicit_copy); // P defined: plain value copy, free stays NULL
+#else
+    self.free = JOIN(T, free);
+    self.copy = JOIN(T, copy);
+#endif
+    return self;
+}
+
+static inline void
+JOIN(A, rehash)(A* self, size_t desired_count);
+
+static inline void // Sets the bucket count to the table prime chosen for desired_count.
+JOIN(A, reserve)(A* self, size_t desired_count)
+{
+    if(self->size > 0)
+        JOIN(A, rehash)(self, desired_count); // stored nodes have to be redistributed
+    else
+    {
+        size_t bucket_count = JOIN(A, closest_prime)(desired_count);
+        B** temp = (B**) calloc(bucket_count, sizeof(B*));
+        // An empty set has only NULL chain heads, so nothing is carried over;
+        // copying the old array would overrun temp whenever the table shrinks.
+        free(self->bucket);
+        self->bucket = temp;
+        self->bucket_count = bucket_count;
+    }
+}
+
+static inline void // Rebuilds the table with the prime bucket count chosen for desired_count.
+JOIN(A, rehash)(A* self, size_t desired_count)
+{
+    if(desired_count <= self->size)
+        desired_count = self->size + 1; // always ask for more buckets than stored elements
+    size_t expected = JOIN(A, closest_prime)(desired_count);
+    if(expected != self->bucket_count) // nothing to do when the count would not change
+    {
+        A rehashed = JOIN(A, init)(self->hash, self->equal);
+        JOIN(A, reserve)(&rehashed, desired_count);
+        foreach(A, self, it)
+        {
+            B** bucket = JOIN(A, bucket)(&rehashed, it.node->key);
+            JOIN(B, push)(&rehashed, bucket, it.node); // existing nodes are relinked, not reallocated
+        }
+        free(self->bucket); // only the old bucket array is released
+        *self = rehashed;
+    }
+}
+
+static inline void // Destroys the key of n (when a free callback is set), frees n and decrements size.
+JOIN(A, free_node)(A* self, B* n)
+{
+    if(self->free)
+        self->free(&n->key);
+    free(n);
+    self->size -= 1; // the caller is responsible for unlinking n from its chain
+}
+
+static inline void // Frees every node; the bucket array is kept with all heads reset to NULL.
+JOIN(A, clear)(A* self)
+{
+    foreach(A, self, it)
+        JOIN(A, free_node)(self, it.node); // the iterator already holds the successor in it.next
+    for(size_t i = 0; i < self->bucket_count; i++)
+        self->bucket[i] = NULL;
+}
+
+static inline void // Frees every node and then the bucket array.
+JOIN(A, free)(A* self)
+{
+    JOIN(A, clear)(self);
+    free(self->bucket); // NOTE(review): bucket and bucket_count are not reset, so self must not be used or freed again
+}
+
+static inline void // Inserts value unless an equal key is stored; a duplicate value is released instead.
+JOIN(A, insert)(A* self, T value)
+{
+    if(self->bucket_count == 0) // build the initial table only when none exists, so an earlier reserve is kept
+        JOIN(A, rehash)(self, 12);
+    if(JOIN(A, find)(self, value))
+    {
+        if(self->free)
+            self->free(&value); // already present: the set takes ownership and drops the duplicate
+    }
+    else
+    {
+        B** bucket = JOIN(A, bucket)(self, value);
+        JOIN(B, push)(self, bucket, JOIN(B, init)(value));
+        if(self->size > self->bucket_count) // more elements than buckets: grow the table
+            JOIN(A, rehash)(self, 2 * self->bucket_count);
+    }
+}
+
+static inline size_t // Returns 1 when value is in the set, 0 otherwise.
+JOIN(A, count)(A* self, T value)
+{
+    return JOIN(A, find)(self, value) ? 1 : 0;
+}
+
+static inline void // Frees n and closes the gap it leaves in the chain of bucket.
+JOIN(A, linked_erase)(A* self, B** bucket, B* n, B* prev, B* next)
+{
+    JOIN(A, free_node)(self, n);
+    if(prev)
+        prev->next = next;
+    else
+        *bucket = next; // n was the chain head
+}
+
+static inline void // Removes the element equal to value, if the set contains one.
+JOIN(A, erase)(A* self, T value)
+{
+    if(!JOIN(A, empty)(self))
+    {
+        B** bucket = JOIN(A, bucket)(self, value);
+        B* prev = NULL; // node in front of n in the chain, NULL while n is the head
+        B* n = *bucket;
+        while(n)
+        {
+            B* next = n->next; // read before n may be freed
+            if(self->equal(&n->key, &value))
+            {
+                JOIN(A, linked_erase)(self, bucket, n, prev, next);
+                break; // stop after the first match
+            }
+            else
+                prev = n;
+            n = next;
+        }
+    }
+}
+
+static inline size_t // Removes every element for which _match is non-zero; returns how many were removed.
+JOIN(A, remove_if)(A* self, int _match(T*))
+{
+    size_t erases = 0;
+    for(size_t i = 0; i < self->bucket_count; i++)
+    {
+        B** bucket = &self->bucket[i];
+        B* prev = NULL; // last node kept in this chain, NULL while none was kept
+        B* n = *bucket;
+        while(n)
+        {
+            B* next = n->next; // read before n may be freed
+            if(_match(&n->key))
+            {
+                JOIN(A, linked_erase)(self, bucket, n, prev, next);
+                erases += 1;
+            }
+            else
+                prev = n;
+            n = next;
+        }
+    }
+    return erases;
+}
+
+static inline A // Returns a new set holding a copy of every element of self.
+JOIN(A, copy)(A* self)
+{
+    A other = JOIN(A, init)(self->hash, self->equal);
+    foreach(A, self, it)
+        JOIN(A, insert)(&other, self->copy(it.ref)); // each element is duplicated with self->copy
+    return other;
+}
+
+#undef T
+#undef A
+#undef B
+#undef Z
diff --git a/vendor/ctl/vec.h b/vendor/ctl/vec.h
new file mode 100644
index 0000000..c7c7391
--- /dev/null
+++ b/vendor/ctl/vec.h
@@ -0,0 +1,407 @@
+//
+// Vector
+//
+
+#ifndef T
+#error "Template type T undefined for <vec.h>"
+#endif
+
+#include <ctl.h>
+
+#define A JOIN(vec, T)
+#define Z JOIN(A, it)
+
+#define MUST_ALIGN_16(T) (sizeof(T) == sizeof(char))
+
+typedef struct A // Growable array of T.
+{
+    T* value; // element storage
+    void (*free)(T*); // element destructor; left NULL by init when P is defined
+#ifdef COMPARE
+    int (*compare)(T*, T*);
+#endif
+    T (*copy)(T*); // element copy function
+    size_t size; // number of elements in use
+    size_t capacity; // number of elements the storage was sized for by fit
+}
+A;
+
+typedef struct Z // Iterator state for walking a range of elements.
+{
+    void (*step)(struct Z*); // advances to the next element or sets done
+    T* ref; // current element
+    T* begin;
+    T* end; // iteration stops once next reaches this address
+    T* next; // element that step moves to
+    int done; // non-zero once iteration has finished
+}
+Z;
+
+static inline T* // Address of the element at index; no bounds check is made.
+JOIN(A, at)(A* self, size_t index)
+{
+    return &self->value[index];
+}
+
+static inline T* // Address of element 0.
+JOIN(A, front)(A* self)
+{
+    return JOIN(A, at)(self, 0);
+}
+
+static inline T* // Address of element size - 1.
+JOIN(A, back)(A* self)
+{
+    return JOIN(A, at)(self, self->size - 1); // size - 1 wraps when the vector is empty
+}
+
+static inline T* // Start of the element range: the same address as front.
+JOIN(A, begin)(A* self)
+{
+    return JOIN(A, front)(self);
+}
+
+static inline T* // One past the last element; equals value when the vector is empty.
+JOIN(A, end)(A* self)
+{
+    return self->value + self->size; // same address as back + 1, without indexing element size - 1 of an empty vector
+}
+
+static inline void // Advances to the next element, or marks the iterator done at the end of the range.
+JOIN(Z, step)(Z* self)
+{
+    if(self->next >= self->end)
+        self->done = 1;
+    else
+    {
+        self->ref = self->next;
+        self->next += 1;
+    }
+}
+
+static inline Z // Builds an iterator over [begin, end); a NULL bound yields a finished iterator.
+JOIN(Z, range)(A* container, T* begin, T* end)
+{
+    (void) container; // the parameter is unused
+    static Z zero; // zero-initialized template
+    Z self = zero;
+    if(begin && end)
+    {
+        self.step = JOIN(Z, step);
+        self.begin = begin;
+        self.end = end;
+        self.next = begin + 1;
+        self.ref = begin; // iteration starts on begin itself
+    }
+    else
+        self.done = 1;
+    return self;
+}
+
+static inline int // Returns 1 when the vector holds no elements, 0 otherwise.
+JOIN(A, empty)(A* self)
+{
+    return self->size == 0;
+}
+
+static inline Z // Iterator over the whole vector; already finished when the vector is empty.
+JOIN(Z, each)(A* a)
+{
+    return JOIN(A, empty)(a)
+         ? JOIN(Z, range)(a, NULL, NULL)
+         : JOIN(Z, range)(a, JOIN(A, begin)(a), JOIN(A, end)(a));
+}
+
+static inline T // Copy callback installed by init when P is defined: returns *self by value.
+JOIN(A, implicit_copy)(T* self)
+{
+    return *self;
+}
+
+static inline int // Returns 1 when both vectors have the same size and _equal holds for every pair in order.
+JOIN(A, equal)(A* self, A* other, int _equal(T*, T*))
+{
+    if(self->size != other->size)
+        return 0;
+    Z a = JOIN(Z, each)(self);
+    Z b = JOIN(Z, each)(other);
+    while(!a.done && !b.done) // walk both vectors in lockstep
+    {
+        if(!_equal(a.ref, b.ref))
+            return 0;
+        a.step(&a);
+        b.step(&b);
+    }
+    return 1;
+}
+
+static inline void // Exchanges the complete contents of two vectors by struct assignment.
+JOIN(A, swap)(A* self, A* other)
+{
+    A temp = *self;
+    *self = *other;
+    *other = temp;
+}
+
+static inline A // Returns an empty vector with no storage allocated.
+JOIN(A, init)(void)
+{
+    static A zero; // zero-initialized template: NULL pointers and zero counts
+    A self = zero;
+#ifdef P
+#undef P
+    self.copy = JOIN(A, implicit_copy); // P defined: plain value copy, free stays NULL
+#else
+    self.free = JOIN(T, free);
+    self.copy = JOIN(T, copy);
+#endif
+    return self;
+}
+
+static inline void // Overwrites the element at index with value, releasing the old element first.
+JOIN(A, set)(A* self, size_t index, T value)
+{
+    T* ref = JOIN(A, at)(self, index);
+    if(self->free)
+        self->free(ref); // destroy the element being replaced
+    *ref = value;
+}
+
+static inline void // Removes the last element; the size is decremented without an emptiness check.
+JOIN(A, pop_back)(A* self)
+{
+    static T zero;
+    self->size -= 1;
+    JOIN(A, set)(self, self->size, zero); // release the removed element and leave a zero value in its slot
+}
+
+static inline void // Removes the last n elements, one pop_back at a time.
+JOIN(A, wipe)(A* self, size_t n)
+{
+    while(n != 0)
+    {
+        JOIN(A, pop_back)(self);
+        n -= 1;
+    }
+}
+
+static inline void // Removes every element; the allocated storage is kept.
+JOIN(A, clear)(A* self)
+{
+    if(self->size > 0)
+        JOIN(A, wipe)(self, self->size);
+}
+
+static inline void // Releases all elements and the storage, then resets self to a fresh empty vector.
+JOIN(A, free)(A* self)
+{
+    JOIN(A, clear)(self);
+    free(self->value);
+    *self = JOIN(A, init)(); // value becomes NULL and the counts zero
+}
+
+static inline void // Reallocates the storage so that it holds capacity elements.
+JOIN(A, fit)(A* self, size_t capacity)
+{
+    static T zero;
+    size_t overall = capacity;
+    if(MUST_ALIGN_16(T)) // true when sizeof(T) == sizeof(char)
+        overall += 1; // such types get one extra slot past capacity
+    self->value = (T*) realloc(self->value, overall * sizeof(T)); // the result is not checked for NULL
+    if(MUST_ALIGN_16(T))
+        for(size_t i = self->capacity; i < overall; i++)
+            self->value[i] = zero; // zero every slot from the old capacity on, including the extra one
+    self->capacity = capacity;
+}
+
+static inline void // Adjusts the storage toward capacity; the rules differ for char-sized element types.
+JOIN(A, reserve)(A* self, const size_t capacity)
+{
+    if(capacity != self->capacity)
+    {
+        size_t actual = 0; // 0 means "leave the storage alone"
+        if(MUST_ALIGN_16(T))
+        {
+            if(capacity <= self->size)
+                actual = self->size; // never drop below the elements in use
+            else
+            if(capacity > self->size && capacity < self->capacity)
+                actual = capacity; // shrink to the requested capacity
+            else
+            {
+                actual = 2 * self->capacity; // grow to at least double the current capacity
+                if(capacity > actual)
+                    actual = capacity;
+            }
+        }
+        else
+        if(capacity > self->capacity) // other element types only ever grow here
+            actual = capacity;
+        if(actual > 0)
+            JOIN(A, fit)(self, actual);
+    }
+}
+
+static inline void // Appends value, growing the storage first when it is full.
+JOIN(A, push_back)(A* self, T value)
+{
+    if(self->size == self->capacity)
+        JOIN(A, reserve)(self, self->capacity == 0 ? 1 : 2 * self->capacity); // start at 1, then double
+    *JOIN(A, at)(self, self->size) = value; // stored directly; no copy callback is involved
+    self->size += 1;
+}
+
+// Resizes self to hold exactly size elements.
+// Shrinking removes elements from the back; growing appends copies of value.
+// value itself is released with self->free, when set, before returning.
+static inline void
+JOIN(A, resize)(A* self, size_t size, T value)
+{
+    if(size >= self->size)
+    {
+        if(size > self->capacity)
+        {
+            // Reserve at least twice the current size in one step.
+            size_t doubled = 2 * self->size;
+            JOIN(A, reserve)(self, size > doubled ? size : doubled);
+        }
+        while(self->size < size)
+            JOIN(A, push_back)(self, self->copy(&value));
+    }
+    else
+    {
+        JOIN(A, wipe)(self, self->size - size);
+    }
+    if(self->free)
+        self->free(&value);
+}
+
+static inline void // Resizes self to size elements and sets each of them to a copy of value; consumes value.
+JOIN(A, assign)(A* self, size_t size, T value)
+{
+    JOIN(A, resize)(self, size, self->copy(&value)); // resize releases the copy it is handed
+    for(size_t i = 0; i < size; i++)
+        JOIN(A, set)(self, i, self->copy(&value));
+    if(self->free)
+        self->free(&value); // the original value is released here
+}
+
+static inline void // Reallocates the storage so that capacity equals the current size.
+JOIN(A, shrink_to_fit)(A* self)
+{
+    JOIN(A, fit)(self, self->size);
+}
+
+static inline T* // Pointer to the element storage: the same address as front.
+JOIN(A, data)(A* self)
+{
+    return JOIN(A, front)(self);
+}
+
+static inline void // Inserts value in front of position index, shifting later elements up by one.
+JOIN(A, insert)(A* self, size_t index, T value)
+{
+    if(self->size > 0)
+    {
+        JOIN(A, push_back)(self, *JOIN(A, back)(self)); // duplicate the last element to grow by one
+        for(size_t i = self->size - 2; i > index; i--)
+            self->value[i] = self->value[i - 1]; // shift elements up, working back to front
+        self->value[index] = value;
+    }
+    else
+        JOIN(A, push_back)(self, value); // empty vector: index is ignored
+}
+
+static inline void // Removes the element at index, shifting later elements down by one.
+JOIN(A, erase)(A* self, size_t index)
+{
+    static T zero;
+    JOIN(A, set)(self, index, zero); // release the erased element
+    for(size_t i = index; i < self->size - 1; i++)
+    {
+        self->value[i] = self->value[i + 1];
+        self->value[i + 1] = zero; // the vacated slot is left holding a zero value
+    }
+    self->size -= 1;
+}
+
+static inline void // Recursive partition sort of the inclusive index range [a, b].
+JOIN(A, ranged_sort)(A* self, int64_t a, int64_t b, int _compare(T*, T*))
+{
+    if(a >= b) // ranges of fewer than two elements need no work
+        return;
+    int64_t mid = (a + b) / 2;
+    SWAP(T, &self->value[a], &self->value[mid]); // the middle element becomes the pivot, parked at a
+    int64_t z = a; // last index of the group for which _compare(pivot, element) was non-zero
+    for(int64_t i = a + 1; i <= b; i++)
+        if(_compare(&self->value[a], &self->value[i]))
+        {
+            z += 1;
+            SWAP(T, &self->value[z], &self->value[i]);
+        }
+    SWAP(T, &self->value[a], &self->value[z]); // move the pivot to index z, between the two groups
+    JOIN(A, ranged_sort)(self, a, z - 1, _compare);
+    JOIN(A, ranged_sort)(self, z + 1, b, _compare);
+}
+
+static inline void // Sorts the whole vector by running ranged_sort over indices 0 .. size - 1.
+JOIN(A, sort)(A* self, int _compare(T*, T*))
+{
+    JOIN(A, ranged_sort)(self, 0, (int64_t) self->size - 1, _compare); // signed arithmetic: an empty vector gives b = -1 and ranged_sort returns at once
+}
+
+static inline A // Returns a new vector holding a copy of every element of self, in order.
+JOIN(A, copy)(A* self)
+{
+    A other = JOIN(A, init)();
+#ifdef COMPARE
+    other.compare = self->compare;
+#endif
+    JOIN(A, reserve)(&other, self->size); // size the storage once up front
+    while(other.size < self->size)
+        JOIN(A, push_back)(&other, other.copy(&self->value[other.size])); // other.size doubles as the source index
+    return other;
+}
+
+static inline size_t // Erases every element for which _match is non-zero; returns how many were erased.
+JOIN(A, remove_if)(A* self, int _match(T*))
+{
+    size_t erases = 0;
+    foreach(A, self, it)
+    {
+        if(_match(it.ref))
+        {
+            size_t index = it.ref - JOIN(A, begin)(self);
+            JOIN(A, erase)(self, index);
+            it.end = JOIN(A, end)(self); // the vector is one element shorter now
+            it.next = it.ref; // revisit this slot: it now holds the element that followed
+            erases += 1;
+        }
+    }
+    return erases;
+}
+
+static inline T* // Address of the first element for which _equal(element, &key) is non-zero, or NULL.
+JOIN(A, find)(A* self, T key, int _equal(T*, T*))
+{
+    foreach(A, self, it)
+        if(_equal(it.ref, &key))
+            return it.ref;
+    return NULL;
+}
+
+#ifdef COMPARE
+#undef COMPARE
+#endif
+
+#undef A
+#undef Z
+#undef MUST_ALIGN_16
+
+// Hold preserves `T` if other containers
+// (eg. `pqu.h`) wish to extend `vec.h`.
+#ifdef HOLD
+#undef HOLD
+#else
+#undef T
+#endif
diff --git a/vendor/sort/sort.h b/vendor/sort/sort.h
new file mode 100644
index 0000000..2a8148b
--- /dev/null
+++ b/vendor/sort/sort.h
@@ -0,0 +1,3097 @@
+/* Copyright (c) 2010-2019 Christopher Swenson. */
+/* Copyright (c) 2012 Vojtech Fried. */
+/* Copyright (c) 2012 Google Inc. All Rights Reserved. */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#ifndef SORT_NAME
+#error "Must declare SORT_NAME"
+#endif
+
+#ifndef SORT_TYPE
+#error "Must declare SORT_TYPE"
+#endif
+
+#ifndef SORT_CMP
+#define SORT_CMP(x, y)  ((x) < (y) ? -1 : ((y) < (x) ? 1 : 0)) /* default three-way comparison built from <; each argument may be evaluated twice */
+#endif
+
+#ifndef SORT_DEF
+#define SORT_DEF
+#else
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+#endif
+
+#ifdef __cplusplus
+#ifndef SORT_SAFE_CPY
+#define SORT_SAFE_CPY 0
+#endif
+#else
+#undef SORT_SAFE_CPY
+#define SORT_SAFE_CPY 0
+#endif
+
+#ifndef TIM_SORT_STACK_SIZE
+#define TIM_SORT_STACK_SIZE 128
+#endif
+
+#ifndef SORT_SWAP
+#define SORT_SWAP(x,y) {SORT_TYPE _sort_swap_temp = (x); (x) = (y); (y) = _sort_swap_temp;} /* default swap through a SORT_TYPE temporary; x and y must be lvalues */
+#endif
+
+/* Common, type-agnostic functions and constants that we don't want to declare twice. */
+#ifndef SORT_COMMON_H
+#define SORT_COMMON_H
+
+#ifndef MAX
+#define MAX(x,y) (((x) > (y) ? (x) : (y)))
+#endif
+
+#ifndef MIN
+#define MIN(x,y) (((x) < (y) ? (x) : (y)))
+#endif
+
+static int compute_minrun(const uint64_t);
+
+/* From http://oeis.org/classic/A102549 */
+static const uint64_t shell_gaps[48] = {1, 4, 10, 23, 57, 132, 301, 701, 1750, 4376, 10941, 27353, 68383, 170958, 427396, 1068491, 2671228, 6678071, 16695178, 41737946, 104344866, 260862166, 652155416, 1630388541, 4075971353LL, 10189928383LL, 25474820958LL, 63687052396LL, 159217630991LL, 398044077478LL, 995110193696LL, 2487775484241LL, 6219438710603LL, 15548596776508LL, 38871491941271LL, 97178729853178LL, 242946824632946LL, 607367061582366LL, 1518417653955916LL, 3796044134889791LL, 9490110337224478LL, 23725275843061196LL, 59313189607652991LL, 148282974019132478LL, 370707435047831196LL, 926768587619577991LL, 2316921469048944978LL, 5792303672622362446LL};
+
+#ifndef CLZ
+/* clang-only */
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+#if __has_builtin(__builtin_clzll) || (defined(__GNUC__) && ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ > 3)))
+#define CLZ __builtin_clzll
+#else
+
+static int clzll(uint64_t);
+
+/* adapted from Hacker's Delight */
+static int clzll(uint64_t x) {
+  int zeros = 0; /* leading zero bits counted so far */
+
+  if (!x) {
+    return 64; /* no bit is set */
+  }
+
+  /* If the top w bits are all clear, count them and shift them out; halve w and repeat. */
+
+  if ((x >> 32) == 0) {
+    zeros += 32;
+    x <<= 32;
+  }
+
+  if ((x >> 48) == 0) {
+    zeros += 16;
+    x <<= 16;
+  }
+
+  if ((x >> 56) == 0) {
+    zeros += 8;
+    x <<= 8;
+  }
+
+  if ((x >> 60) == 0) {
+    zeros += 4;
+    x <<= 4;
+  }
+
+  if ((x >> 62) == 0) {
+    zeros += 2;
+    x <<= 2;
+  }
+
+  if ((x >> 63) == 0) {
+    zeros += 1;
+  }
+
+  return zeros;
+}
+
+#define CLZ clzll
+#endif
+#endif
+
+static __inline int compute_minrun(const uint64_t size) {
+  /* Keep at most the six most significant bits of size; add one if any dropped bit was set. */
+  const int bit_len = 64 - CLZ(size);
+  const int drop = (bit_len > 6) ? (bit_len - 6) : 0;
+  const uint64_t dropped_bits = size & ((1ULL << drop) - 1);
+  int run = (int)(size >> drop);
+
+  if (dropped_bits != 0) {
+    run += 1;
+  }
+  return run;
+}
+
+static __inline size_t rbnd(size_t len) {
+  /* Lengths below 16 map to 2; otherwise the result is 1 << (2k / 3),
+     where k = 62 - CLZ(len). */
+  if (len >= 16) {
+    const int k = 62 - CLZ(len);
+
+    return 1ULL << ((2 * k) / 3);
+  }
+  return 2;
+}
+
+#endif /* SORT_COMMON_H */
+
+#define SORT_CONCAT(x, y) x ## _ ## y /* pastes x and y together with an underscore in between */
+#define SORT_MAKE_STR1(x, y) SORT_CONCAT(x,y) /* extra level so macro arguments are expanded before pasting */
+#define SORT_MAKE_STR(x) SORT_MAKE_STR1(SORT_NAME,x) /* yields <SORT_NAME>_<x> */
+
+#ifndef SMALL_SORT_BND
+#define SMALL_SORT_BND 16
+#endif
+#ifndef SMALL_SORT
+#define SMALL_SORT BITONIC_SORT
+/*#define SMALL_SORT BINARY_INSERTION_SORT*/
+#endif
+#ifndef SMALL_STABLE_SORT
+#define SMALL_STABLE_SORT BINARY_INSERTION_SORT
+#endif
+
+#define SORT_TYPE_CPY                  SORT_MAKE_STR(sort_type_cpy)
+#define SORT_TYPE_MOVE                 SORT_MAKE_STR(sort_type_move)
+#define SORT_NEW_BUFFER                SORT_MAKE_STR(sort_new_buffer)
+#define SORT_DELETE_BUFFER             SORT_MAKE_STR(sort_delete_buffer)
+#define BITONIC_SORT                   SORT_MAKE_STR(bitonic_sort)
+#define BINARY_INSERTION_FIND          SORT_MAKE_STR(binary_insertion_find)
+#define BINARY_INSERTION_SORT_START    SORT_MAKE_STR(binary_insertion_sort_start)
+#define BINARY_INSERTION_SORT          SORT_MAKE_STR(binary_insertion_sort)
+#define REVERSE_ELEMENTS               SORT_MAKE_STR(reverse_elements)
+#define COUNT_RUN                      SORT_MAKE_STR(count_run)
+#define CHECK_INVARIANT                SORT_MAKE_STR(check_invariant)
+#define TIM_SORT                       SORT_MAKE_STR(tim_sort)
+#define TIM_SORT_RESIZE                SORT_MAKE_STR(tim_sort_resize)
+#define TIM_SORT_MERGE                 SORT_MAKE_STR(tim_sort_merge)
+#define TIM_SORT_COLLAPSE              SORT_MAKE_STR(tim_sort_collapse)
+#define HEAP_SORT                      SORT_MAKE_STR(heap_sort)
+#define MEDIAN                         SORT_MAKE_STR(median)
+#define QUICK_SORT                     SORT_MAKE_STR(quick_sort)
+#define MERGE_SORT                     SORT_MAKE_STR(merge_sort)
+#define MERGE_SORT_RECURSIVE           SORT_MAKE_STR(merge_sort_recursive)
+#define MERGE_SORT_IN_PLACE            SORT_MAKE_STR(merge_sort_in_place)
+#define MERGE_SORT_IN_PLACE_RMERGE     SORT_MAKE_STR(merge_sort_in_place_rmerge)
+#define MERGE_SORT_IN_PLACE_BACKMERGE  SORT_MAKE_STR(merge_sort_in_place_backmerge)
+#define MERGE_SORT_IN_PLACE_FRONTMERGE SORT_MAKE_STR(merge_sort_in_place_frontmerge)
+#define MERGE_SORT_IN_PLACE_ASWAP      SORT_MAKE_STR(merge_sort_in_place_aswap)
+#define SELECTION_SORT                 SORT_MAKE_STR(selection_sort)
+#define SHELL_SORT                     SORT_MAKE_STR(shell_sort)
+#define QUICK_SORT_PARTITION           SORT_MAKE_STR(quick_sort_partition)
+#define QUICK_SORT_RECURSIVE           SORT_MAKE_STR(quick_sort_recursive)
+#define HEAP_SIFT_DOWN                 SORT_MAKE_STR(heap_sift_down)
+#define HEAPIFY                        SORT_MAKE_STR(heapify)
+#define TIM_SORT_RUN_T                 SORT_MAKE_STR(tim_sort_run_t)
+#define TEMP_STORAGE_T                 SORT_MAKE_STR(temp_storage_t)
+#define PUSH_NEXT                      SORT_MAKE_STR(push_next)
+#define GRAIL_SWAP1                    SORT_MAKE_STR(grail_swap1)
+#define REC_STABLE_SORT                SORT_MAKE_STR(rec_stable_sort)
+#define GRAIL_REC_MERGE                SORT_MAKE_STR(grail_rec_merge)
+#define GRAIL_SORT_DYN_BUFFER          SORT_MAKE_STR(grail_sort_dyn_buffer)
+#define GRAIL_SORT_FIXED_BUFFER        SORT_MAKE_STR(grail_sort_fixed_buffer)
+#define GRAIL_COMMON_SORT              SORT_MAKE_STR(grail_common_sort)
+#define GRAIL_SORT                     SORT_MAKE_STR(grail_sort)
+#define GRAIL_COMBINE_BLOCKS           SORT_MAKE_STR(grail_combine_blocks)
+#define GRAIL_LAZY_STABLE_SORT         SORT_MAKE_STR(grail_lazy_stable_sort)
+#define GRAIL_MERGE_WITHOUT_BUFFER     SORT_MAKE_STR(grail_merge_without_buffer)
+#define GRAIL_ROTATE                   SORT_MAKE_STR(grail_rotate)
+#define GRAIL_BIN_SEARCH_LEFT          SORT_MAKE_STR(grail_bin_search_left)
+#define GRAIL_BUILD_BLOCKS             SORT_MAKE_STR(grail_build_blocks)
+#define GRAIL_FIND_KEYS                SORT_MAKE_STR(grail_find_keys)
+#define GRAIL_MERGE_BUFFERS_LEFT_WITH_X_BUF SORT_MAKE_STR(grail_merge_buffers_left_with_x_buf)
+#define GRAIL_BIN_SEARCH_RIGHT         SORT_MAKE_STR(grail_bin_search_right)
+#define GRAIL_MERGE_BUFFERS_LEFT       SORT_MAKE_STR(grail_merge_buffers_left)
+#define GRAIL_SMART_MERGE_WITH_X_BUF   SORT_MAKE_STR(grail_smart_merge_with_x_buf)
+#define GRAIL_MERGE_LEFT_WITH_X_BUF    SORT_MAKE_STR(grail_merge_left_with_x_buf)
+#define GRAIL_SMART_MERGE_WITHOUT_BUFFER SORT_MAKE_STR(grail_smart_merge_without_buffer)
+#define GRAIL_SMART_MERGE_WITH_BUFFER  SORT_MAKE_STR(grail_smart_merge_with_buffer)
+#define GRAIL_MERGE_RIGHT              SORT_MAKE_STR(grail_merge_right)
+#define GRAIL_MERGE_LEFT               SORT_MAKE_STR(grail_merge_left)
+#define GRAIL_SWAP_N                   SORT_MAKE_STR(grail_swap_n)
+#define SQRT_SORT                      SORT_MAKE_STR(sqrt_sort)
+#define SQRT_SORT_BUILD_BLOCKS         SORT_MAKE_STR(sqrt_sort_build_blocks)
+#define SQRT_SORT_MERGE_BUFFERS_LEFT_WITH_X_BUF SORT_MAKE_STR(sqrt_sort_merge_buffers_left_with_x_buf)
+#define SQRT_SORT_MERGE_DOWN           SORT_MAKE_STR(sqrt_sort_merge_down)
+#define SQRT_SORT_MERGE_LEFT_WITH_X_BUF SORT_MAKE_STR(sqrt_sort_merge_left_with_x_buf)
+#define SQRT_SORT_MERGE_RIGHT          SORT_MAKE_STR(sqrt_sort_merge_right)
+#define SQRT_SORT_SWAP_N               SORT_MAKE_STR(sqrt_sort_swap_n)
+#define SQRT_SORT_SWAP_1               SORT_MAKE_STR(sqrt_sort_swap_1)
+#define SQRT_SORT_SMART_MERGE_WITH_X_BUF SORT_MAKE_STR(sqrt_sort_smart_merge_with_x_buf)
+#define SQRT_SORT_SORT_INS             SORT_MAKE_STR(sqrt_sort_sort_ins)
+#define SQRT_SORT_COMBINE_BLOCKS       SORT_MAKE_STR(sqrt_sort_combine_blocks)
+#define SQRT_SORT_COMMON_SORT          SORT_MAKE_STR(sqrt_sort_common_sort)
+#define BUBBLE_SORT                    SORT_MAKE_STR(bubble_sort)
+
+#ifndef MAX
+#define MAX(x,y) (((x) > (y) ? (x) : (y)))
+#endif
+#ifndef MIN
+#define MIN(x,y) (((x) < (y) ? (x) : (y)))
+#endif
+#ifndef SORT_CSWAP
+#define SORT_CSWAP(x, y) { if(SORT_CMP((x),(y)) > 0) {SORT_SWAP((x),(y));}} /* compare-exchange: swap x and y only when x compares greater than y */
+#endif
+
+typedef struct { /* run record; the concrete type name is derived from SORT_NAME */
+  size_t start;  /* presumably the index of the run's first element -- confirm against the tim sort code */
+  size_t length; /* presumably the number of elements in the run -- confirm */
+} TIM_SORT_RUN_T;
+
+
+SORT_DEF void SHELL_SORT(SORT_TYPE *dst, const size_t size);
+SORT_DEF void BINARY_INSERTION_SORT(SORT_TYPE *dst, const size_t size);
+SORT_DEF void HEAP_SORT(SORT_TYPE *dst, const size_t size);
+SORT_DEF void QUICK_SORT(SORT_TYPE *dst, const size_t size);
+SORT_DEF void MERGE_SORT(SORT_TYPE *dst, const size_t size);
+SORT_DEF void MERGE_SORT_IN_PLACE(SORT_TYPE *dst, const size_t size);
+SORT_DEF void SELECTION_SORT(SORT_TYPE *dst, const size_t size);
+SORT_DEF void TIM_SORT(SORT_TYPE *dst, const size_t size);
+SORT_DEF void BUBBLE_SORT(SORT_TYPE *dst, const size_t size);
+SORT_DEF void BITONIC_SORT(SORT_TYPE *dst, const size_t size);
+SORT_DEF void REC_STABLE_SORT(SORT_TYPE *dst, const size_t size);
+SORT_DEF void GRAIL_SORT_DYN_BUFFER(SORT_TYPE *dst, const size_t size);
+SORT_DEF void GRAIL_SORT_FIXED_BUFFER(SORT_TYPE *dst, const size_t size);
+SORT_DEF void GRAIL_SORT(SORT_TYPE *dst, const size_t size);
+SORT_DEF void SQRT_SORT(SORT_TYPE *dst, const size_t size);
+
+/* This is not a full bitonic sort implementation. Sorting networks are only
+   wanted for short lists, so optimal sorting networks are provided for
+   lists of length <= 16, and BITONIC_SORT calls out to BINARY_INSERTION_SORT
+   for anything larger than 16.
+   The optimal sorting networks for small lists are
+   taken from https://pages.ripco.net/~jgamble/nw.html */
+#define BITONIC_SORT_2          SORT_MAKE_STR(bitonic_sort_2)
+static __inline void BITONIC_SORT_2(SORT_TYPE *dst) { /* 2-element network: 1 compare-exchange on dst[0..1] */
+  SORT_CSWAP(dst[0], dst[1]);
+}
+
+
+#define BITONIC_SORT_3          SORT_MAKE_STR(bitonic_sort_3)
+static __inline void BITONIC_SORT_3(SORT_TYPE *dst) { /* 3-element network: 3 compare-exchanges on dst[0..2], applied in this fixed order */
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[0], dst[1]);
+}
+
+
+#define BITONIC_SORT_4          SORT_MAKE_STR(bitonic_sort_4)
+static __inline void BITONIC_SORT_4(SORT_TYPE *dst) { /* 4-element network: 5 compare-exchanges on dst[0..3], applied in this fixed order */
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[1], dst[2]);
+}
+
+
+#define BITONIC_SORT_5          SORT_MAKE_STR(bitonic_sort_5)
+static __inline void BITONIC_SORT_5(SORT_TYPE *dst) { /* 5-element network: 9 compare-exchanges on dst[0..4], applied in this fixed order */
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[0], dst[3]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[1], dst[2]);
+}
+
+
+#define BITONIC_SORT_6          SORT_MAKE_STR(bitonic_sort_6)
+static __inline void BITONIC_SORT_6(SORT_TYPE *dst) { /* 6-element network: 12 compare-exchanges on dst[0..5], applied in this fixed order */
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[2], dst[5]);
+  SORT_CSWAP(dst[0], dst[3]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[2], dst[3]);
+}
+
+
+#define BITONIC_SORT_7          SORT_MAKE_STR(bitonic_sort_7)
+static __inline void BITONIC_SORT_7(SORT_TYPE *dst) { /* 7-element network: 16 compare-exchanges on dst[0..6], applied in this fixed order */
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[0], dst[4]);
+  SORT_CSWAP(dst[1], dst[5]);
+  SORT_CSWAP(dst[0], dst[3]);
+  SORT_CSWAP(dst[2], dst[5]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[2], dst[3]);
+}
+
+
+#define BITONIC_SORT_8          SORT_MAKE_STR(bitonic_sort_8)
+static __inline void BITONIC_SORT_8(SORT_TYPE *dst) { /* 8-element network: 19 compare-exchanges on dst[0..7], applied in this fixed order */
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[5], dst[7]);
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[0], dst[4]);
+  SORT_CSWAP(dst[3], dst[7]);
+  SORT_CSWAP(dst[1], dst[5]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[3], dst[6]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[3], dst[4]);
+}
+
+
+#define BITONIC_SORT_9          SORT_MAKE_STR(bitonic_sort_9)
+static __inline void BITONIC_SORT_9(SORT_TYPE *dst) { /* 9-element network: 25 compare-exchanges on dst[0..8], applied in this fixed order */
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[7], dst[8]);
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[2], dst[5]);
+  SORT_CSWAP(dst[0], dst[3]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[5], dst[8]);
+  SORT_CSWAP(dst[3], dst[6]);
+  SORT_CSWAP(dst[4], dst[7]);
+  SORT_CSWAP(dst[2], dst[5]);
+  SORT_CSWAP(dst[0], dst[3]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[5], dst[7]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[2], dst[3]);
+}
+
+
+#define BITONIC_SORT_10          SORT_MAKE_STR(bitonic_sort_10)
+static __inline void BITONIC_SORT_10(SORT_TYPE *dst) { /* 10-element network: 29 compare-exchanges on dst[0..9], applied in this fixed order */
+  SORT_CSWAP(dst[4], dst[9]);
+  SORT_CSWAP(dst[3], dst[8]);
+  SORT_CSWAP(dst[2], dst[7]);
+  SORT_CSWAP(dst[1], dst[6]);
+  SORT_CSWAP(dst[0], dst[5]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[6], dst[9]);
+  SORT_CSWAP(dst[0], dst[3]);
+  SORT_CSWAP(dst[5], dst[8]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[3], dst[6]);
+  SORT_CSWAP(dst[7], dst[9]);
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[5], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[7], dst[8]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[2], dst[5]);
+  SORT_CSWAP(dst[6], dst[8]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[4], dst[7]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[4], dst[5]);
+}
+
+
+#define BITONIC_SORT_11          SORT_MAKE_STR(bitonic_sort_11)
+static __inline void BITONIC_SORT_11(SORT_TYPE *dst) { /* 11-element network: 35 compare-exchanges on dst[0..10], applied in this fixed order */
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[5], dst[7]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[8], dst[10]);
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[9], dst[10]);
+  SORT_CSWAP(dst[0], dst[4]);
+  SORT_CSWAP(dst[3], dst[7]);
+  SORT_CSWAP(dst[1], dst[5]);
+  SORT_CSWAP(dst[6], dst[10]);
+  SORT_CSWAP(dst[4], dst[8]);
+  SORT_CSWAP(dst[5], dst[9]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[0], dst[4]);
+  SORT_CSWAP(dst[3], dst[8]);
+  SORT_CSWAP(dst[1], dst[5]);
+  SORT_CSWAP(dst[6], dst[10]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[7], dst[10]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[6], dst[8]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[7], dst[9]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[7], dst[8]);
+}
+
+
+#define BITONIC_SORT_12          SORT_MAKE_STR(bitonic_sort_12)
+static __inline void BITONIC_SORT_12(SORT_TYPE *dst) { /* 12-element network: 39 compare-exchanges on dst[0..11], applied in this fixed order */
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[10], dst[11]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[5], dst[7]);
+  SORT_CSWAP(dst[9], dst[11]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[8], dst[10]);
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[9], dst[10]);
+  SORT_CSWAP(dst[0], dst[4]);
+  SORT_CSWAP(dst[7], dst[11]);
+  SORT_CSWAP(dst[1], dst[5]);
+  SORT_CSWAP(dst[6], dst[10]);
+  SORT_CSWAP(dst[3], dst[7]);
+  SORT_CSWAP(dst[4], dst[8]);
+  SORT_CSWAP(dst[5], dst[9]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[0], dst[4]);
+  SORT_CSWAP(dst[7], dst[11]);
+  SORT_CSWAP(dst[3], dst[8]);
+  SORT_CSWAP(dst[1], dst[5]);
+  SORT_CSWAP(dst[6], dst[10]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[7], dst[10]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[6], dst[8]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[7], dst[9]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[7], dst[8]);
+}
+
+
+#define BITONIC_SORT_13          SORT_MAKE_STR(bitonic_sort_13)
+static __inline void BITONIC_SORT_13(SORT_TYPE *dst) { /* 13-element network: 45 compare-exchanges on dst[0..12], applied in this fixed order */
+  SORT_CSWAP(dst[1], dst[7]);
+  SORT_CSWAP(dst[9], dst[11]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[5], dst[8]);
+  SORT_CSWAP(dst[0], dst[12]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[8], dst[11]);
+  SORT_CSWAP(dst[7], dst[12]);
+  SORT_CSWAP(dst[5], dst[9]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[3], dst[7]);
+  SORT_CSWAP(dst[10], dst[11]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[6], dst[12]);
+  SORT_CSWAP(dst[7], dst[8]);
+  SORT_CSWAP(dst[11], dst[12]);
+  SORT_CSWAP(dst[4], dst[9]);
+  SORT_CSWAP(dst[6], dst[10]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[10], dst[11]);
+  SORT_CSWAP(dst[1], dst[7]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[9], dst[11]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[4], dst[7]);
+  SORT_CSWAP(dst[8], dst[10]);
+  SORT_CSWAP(dst[0], dst[5]);
+  SORT_CSWAP(dst[2], dst[5]);
+  SORT_CSWAP(dst[6], dst[8]);
+  SORT_CSWAP(dst[9], dst[10]);
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[7], dst[8]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[5], dst[6]);
+}
+
+
+#define BITONIC_SORT_14          SORT_MAKE_STR(bitonic_sort_14)
+static __inline void BITONIC_SORT_14(SORT_TYPE *dst) { /* 14-element network: 51 compare-exchanges on dst[0..13], applied in this fixed order */
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[10], dst[11]);
+  SORT_CSWAP(dst[12], dst[13]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[8], dst[10]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[5], dst[7]);
+  SORT_CSWAP(dst[9], dst[11]);
+  SORT_CSWAP(dst[0], dst[4]);
+  SORT_CSWAP(dst[8], dst[12]);
+  SORT_CSWAP(dst[1], dst[5]);
+  SORT_CSWAP(dst[9], dst[13]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[3], dst[7]);
+  SORT_CSWAP(dst[0], dst[8]);
+  SORT_CSWAP(dst[1], dst[9]);
+  SORT_CSWAP(dst[2], dst[10]);
+  SORT_CSWAP(dst[3], dst[11]);
+  SORT_CSWAP(dst[4], dst[12]);
+  SORT_CSWAP(dst[5], dst[13]);
+  SORT_CSWAP(dst[5], dst[10]);
+  SORT_CSWAP(dst[6], dst[9]);
+  SORT_CSWAP(dst[3], dst[12]);
+  SORT_CSWAP(dst[7], dst[11]);
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[4], dst[8]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[7], dst[13]);
+  SORT_CSWAP(dst[2], dst[8]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[9], dst[10]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[11], dst[13]);
+  SORT_CSWAP(dst[3], dst[8]);
+  SORT_CSWAP(dst[7], dst[12]);
+  SORT_CSWAP(dst[6], dst[8]);
+  SORT_CSWAP(dst[10], dst[12]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[7], dst[9]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[7], dst[8]);
+  SORT_CSWAP(dst[9], dst[10]);
+  SORT_CSWAP(dst[11], dst[12]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+}
+
+
+#define BITONIC_SORT_15          SORT_MAKE_STR(bitonic_sort_15)
+static __inline void BITONIC_SORT_15(SORT_TYPE *dst) { /* 15-element network: 56 compare-exchanges on dst[0..14], applied in this fixed order */
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[10], dst[11]);
+  SORT_CSWAP(dst[12], dst[13]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[8], dst[10]);
+  SORT_CSWAP(dst[12], dst[14]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[5], dst[7]);
+  SORT_CSWAP(dst[9], dst[11]);
+  SORT_CSWAP(dst[0], dst[4]);
+  SORT_CSWAP(dst[8], dst[12]);
+  SORT_CSWAP(dst[1], dst[5]);
+  SORT_CSWAP(dst[9], dst[13]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[10], dst[14]);
+  SORT_CSWAP(dst[3], dst[7]);
+  SORT_CSWAP(dst[0], dst[8]);
+  SORT_CSWAP(dst[1], dst[9]);
+  SORT_CSWAP(dst[2], dst[10]);
+  SORT_CSWAP(dst[3], dst[11]);
+  SORT_CSWAP(dst[4], dst[12]);
+  SORT_CSWAP(dst[5], dst[13]);
+  SORT_CSWAP(dst[6], dst[14]);
+  SORT_CSWAP(dst[5], dst[10]);
+  SORT_CSWAP(dst[6], dst[9]);
+  SORT_CSWAP(dst[3], dst[12]);
+  SORT_CSWAP(dst[13], dst[14]);
+  SORT_CSWAP(dst[7], dst[11]);
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[4], dst[8]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[7], dst[13]);
+  SORT_CSWAP(dst[2], dst[8]);
+  SORT_CSWAP(dst[11], dst[14]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[9], dst[10]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[11], dst[13]);
+  SORT_CSWAP(dst[3], dst[8]);
+  SORT_CSWAP(dst[7], dst[12]);
+  SORT_CSWAP(dst[6], dst[8]);
+  SORT_CSWAP(dst[10], dst[12]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[7], dst[9]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[7], dst[8]);
+  SORT_CSWAP(dst[9], dst[10]);
+  SORT_CSWAP(dst[11], dst[12]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+}
+
+
+#define BITONIC_SORT_16          SORT_MAKE_STR(bitonic_sort_16)
+static __inline void BITONIC_SORT_16(SORT_TYPE *dst) { /* 16-element network: 60 compare-exchanges on dst[0..15], applied in this fixed order */
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[10], dst[11]);
+  SORT_CSWAP(dst[12], dst[13]);
+  SORT_CSWAP(dst[14], dst[15]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[8], dst[10]);
+  SORT_CSWAP(dst[12], dst[14]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[5], dst[7]);
+  SORT_CSWAP(dst[9], dst[11]);
+  SORT_CSWAP(dst[13], dst[15]);
+  SORT_CSWAP(dst[0], dst[4]);
+  SORT_CSWAP(dst[8], dst[12]);
+  SORT_CSWAP(dst[1], dst[5]);
+  SORT_CSWAP(dst[9], dst[13]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[10], dst[14]);
+  SORT_CSWAP(dst[3], dst[7]);
+  SORT_CSWAP(dst[11], dst[15]);
+  SORT_CSWAP(dst[0], dst[8]);
+  SORT_CSWAP(dst[1], dst[9]);
+  SORT_CSWAP(dst[2], dst[10]);
+  SORT_CSWAP(dst[3], dst[11]);
+  SORT_CSWAP(dst[4], dst[12]);
+  SORT_CSWAP(dst[5], dst[13]);
+  SORT_CSWAP(dst[6], dst[14]);
+  SORT_CSWAP(dst[7], dst[15]);
+  SORT_CSWAP(dst[5], dst[10]);
+  SORT_CSWAP(dst[6], dst[9]);
+  SORT_CSWAP(dst[3], dst[12]);
+  SORT_CSWAP(dst[13], dst[14]);
+  SORT_CSWAP(dst[7], dst[11]);
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[4], dst[8]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[7], dst[13]);
+  SORT_CSWAP(dst[2], dst[8]);
+  SORT_CSWAP(dst[11], dst[14]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[9], dst[10]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[11], dst[13]);
+  SORT_CSWAP(dst[3], dst[8]);
+  SORT_CSWAP(dst[7], dst[12]);
+  SORT_CSWAP(dst[6], dst[8]);
+  SORT_CSWAP(dst[10], dst[12]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[7], dst[9]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[7], dst[8]);
+  SORT_CSWAP(dst[9], dst[10]);
+  SORT_CSWAP(dst[11], dst[12]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+}
+
+SORT_DEF void BITONIC_SORT(SORT_TYPE *dst, const size_t size) {
+  /* Sizes 2..16 run their fixed network, 0 and 1 need no work, larger inputs are insertion-sorted. */
+  if (size > 16) {
+    BINARY_INSERTION_SORT(dst, size);
+    return;
+  }
+  switch (size) {
+  case 2:
+    BITONIC_SORT_2(dst);
+    return;
+
+  case 3:
+    BITONIC_SORT_3(dst);
+    return;
+
+  case 4:
+    BITONIC_SORT_4(dst);
+    return;
+
+  case 5:
+    BITONIC_SORT_5(dst);
+    return;
+
+  case 6:
+    BITONIC_SORT_6(dst);
+    return;
+
+  case 7:
+    BITONIC_SORT_7(dst);
+    return;
+
+  case 8:
+    BITONIC_SORT_8(dst);
+    return;
+
+  case 9:
+    BITONIC_SORT_9(dst);
+    return;
+
+  case 10:
+    BITONIC_SORT_10(dst);
+    return;
+
+  case 11:
+    BITONIC_SORT_11(dst);
+    return;
+
+  case 12:
+    BITONIC_SORT_12(dst);
+    return;
+
+  case 13:
+    BITONIC_SORT_13(dst);
+    return;
+
+  case 14:
+    BITONIC_SORT_14(dst);
+    return;
+
+  case 15:
+    BITONIC_SORT_15(dst);
+    return;
+
+  case 16:
+    BITONIC_SORT_16(dst);
+    return;
+  default: /* 0 or 1 element */
+    return;
+  }
+}
+
+#if SORT_SAFE_CPY
+
+SORT_DEF void SORT_TYPE_CPY(SORT_TYPE *dst, SORT_TYPE *src, const size_t size) {
+  /* Assign src[0..size-1] to dst[0..size-1] one element at a time, lowest index first. */
+  SORT_TYPE *const stop = src + size;
+  while (src != stop) {
+    *dst++ = *src++;
+  }
+}
+
+SORT_DEF void SORT_TYPE_MOVE(SORT_TYPE *dst, SORT_TYPE *src, const size_t size) {
+  /* Element-wise move: ascending copy when dst precedes src, descending copy when dst follows src. */
+  size_t remaining = size;
+
+  if (dst < src) {
+    SORT_TYPE_CPY(dst, src, size);
+  } else if (dst != src) {
+    while (remaining > 0) {
+      --remaining;
+      dst[remaining] = src[remaining];
+    }
+  }
+}
+
+#else
+
+#undef SORT_TYPE_CPY
+#define SORT_TYPE_CPY(dst, src, size) memcpy((dst), (src), (size) * sizeof(SORT_TYPE))
+#undef SORT_TYPE_MOVE
+#define SORT_TYPE_MOVE(dst, src, size) memmove((dst), (src), (size) * sizeof(SORT_TYPE))
+
+#endif
+
+SORT_DEF SORT_TYPE* SORT_NEW_BUFFER(size_t size) { /* allocates room for `size` elements of SORT_TYPE; release with SORT_DELETE_BUFFER */
+#if SORT_SAFE_CPY
+  return new SORT_TYPE[size];
+#else
+  return (SORT_TYPE*)malloc(size * sizeof(SORT_TYPE)); /* the malloc result is returned unchecked, so it may be NULL */
+#endif
+}
+
+SORT_DEF void SORT_DELETE_BUFFER(SORT_TYPE* pointer) { /* releases a buffer using the deallocator that matches SORT_NEW_BUFFER's allocator */
+#if SORT_SAFE_CPY
+  delete[] pointer;
+#else
+  free(pointer);
+#endif
+}
+
+
+/* Shell sort implementation based on Wikipedia article
+   http://en.wikipedia.org/wiki/Shell_sort
+*/
+SORT_DEF void SHELL_SORT(SORT_TYPE *dst, const size_t size) {
+  /* Gapped insertion sort: one pass per shell_gaps entry, starting with the first
+     entry (scanning downward) that is not greater than size / 2 and ending with gap 1. */
+  /* TODO: binary search to find first gap? */
+  int gap_idx = 47;
+  size_t gap = shell_gaps[gap_idx];
+  size_t pos;
+
+  if (size < 2) { /* nothing to sort */
+    return;
+  }
+
+  while (gap > size / 2) {
+    gap_idx -= 1;
+    gap = shell_gaps[gap_idx];
+  }
+
+  for (;; gap = shell_gaps[--gap_idx]) {
+    for (pos = gap; pos < size; pos++) {
+      SORT_TYPE held = dst[pos];
+      size_t hole = pos;
+
+      /* slide larger elements up by one gap until `held` fits */
+      for (; (hole >= gap) && (SORT_CMP(dst[hole - gap], held) > 0); hole -= gap) {
+        dst[hole] = dst[hole - gap];
+      }
+
+      dst[hole] = held;
+    }
+
+    if (gap == 1) {
+      break;
+    }
+  }
+}
+
+/* Binary search used by binary insertion sort: returns the index within dst[0..size-1] at which x is to be inserted. */
+static __inline size_t BINARY_INSERTION_FIND(SORT_TYPE *dst, const SORT_TYPE x,
+    const size_t size) {
+  size_t l, c, r; /* left bound, current probe, right bound */
+  SORT_TYPE cx; /* element at the current probe */
+  l = 0;
+  r = size - 1; /* wraps if size == 0, and dst[0] is read below, so callers must pass size >= 1 */
+  c = r >> 1;
+
+  /* check for out of bounds at the beginning. */
+  if (SORT_CMP(x, dst[0]) < 0) {
+    return 0;
+  } else if (SORT_CMP(x, dst[r]) > 0) { /* NOTE(review): returns r, not size; the visible caller only searches when dst[size - 1] > x, so this branch looks unreachable from it -- confirm */
+    return r;
+  }
+
+  cx = dst[c];
+
+  while (1) {
+    const int val = SORT_CMP(x, cx);
+
+    if (val < 0) { /* x sorts before the probe: continue in the left part */
+      if (c - l <= 1) {
+        return c;
+      }
+
+      r = c;
+    } else { /* allow = for stability. The binary search favors the right. */
+      if (r - c <= 1) {
+        return c + 1;
+      }
+
+      l = c;
+    }
+
+    c = l + ((r - l) >> 1); /* midpoint of the narrowed range */
+    cx = dst[c];
+  }
+}
+
+/* Binary insertion sort, but knowing that the first "start" entries are sorted.  Used in timsort. */
+static void BINARY_INSERTION_SORT_START(SORT_TYPE *dst, const size_t start, const size_t size) {
+  /* dst[0..start-1] is taken to be sorted already; each later element is
+     inserted into the sorted prefix that precedes it. */
+
+  size_t next;
+
+  for (next = start; next < size; next++) {
+    size_t slot;
+    size_t target;
+    SORT_TYPE pending;
+
+    /* Only an element smaller than its left neighbour has to move */
+    if (SORT_CMP(dst[next - 1], dst[next]) > 0) {
+      /* Find the insertion point, shift dst[target..next-1] one slot to
+         the right, and put the saved element into the gap */
+      pending = dst[next];
+      target = BINARY_INSERTION_FIND(dst, pending, next);
+
+      /* the loop stops above target, so the unsigned index
+         never has to step below zero */
+      for (slot = next; slot > target; slot--) {
+        dst[slot] = dst[slot - 1];
+      }
+
+      dst[target] = pending;
+    }
+  }
+}
+
+/* Binary insertion sort */
+SORT_DEF void BINARY_INSERTION_SORT(SORT_TYPE *dst, const size_t size) {
+  /* An array of size <= 1 is left untouched. Otherwise dst[0] alone is
+     treated as the sorted prefix, so insertion begins at index 1. */
+
+  if (size > 1) {
+    BINARY_INSERTION_SORT_START(dst, 1, size);
+  }
+}
+
+/* Selection sort */
+SORT_DEF void SELECTION_SORT(SORT_TYPE *dst, const size_t size) {
+  size_t slot, probe;
+
+  /* fewer than two elements: nothing to reorder */
+  if (size < 2) {
+    return;
+  }
+
+  for (slot = 0; slot != size; ++slot) {
+    for (probe = slot + 1; probe < size; ++probe) {
+      if (0 > SORT_CMP(dst[probe], dst[slot])) { /* smaller element found: exchange immediately */
+        SORT_SWAP(dst[slot], dst[probe]);
+      }
+    }
+  }
+}
+
+/* In-place mergesort */
+SORT_DEF void MERGE_SORT_IN_PLACE_ASWAP(SORT_TYPE * dst1, SORT_TYPE * dst2, size_t len) {
+  /* Exchange dst1[k] with dst2[k] for k = 0 .. len-1. The body runs before the count is tested, so len must be at least 1. */
+  size_t k = 0;
+  do {
+    SORT_SWAP(dst1[k], dst2[k]);
+  } while (++k != len);
+}
+
+SORT_DEF void MERGE_SORT_IN_PLACE_FRONTMERGE(SORT_TYPE *dst1, size_t l1, SORT_TYPE *dst2,
+    size_t l2) { /* forward merge of run dst1[0..l1-1] with run dst2[0..l2-1]; elements are swapped into the area starting at dst2 - l1 */
+  SORT_TYPE *dst0 = dst2 - l1; /* output cursor: starts l1 elements before run 2 */
+
+  if (SORT_CMP(dst1[l1 - 1], dst2[0]) <= 0) { /* last element of run 1 <= first element of run 2 */
+    MERGE_SORT_IN_PLACE_ASWAP(dst1, dst0, l1); /* swap run 1 straight into the output area */
+    return;
+  }
+
+  do {
+    while (SORT_CMP(*dst2, *dst1) > 0) { /* head of run 1 is strictly smaller: emit it */
+      SORT_SWAP(*dst1, *dst0);
+      dst1++;
+      dst0++;
+
+      if (--l1 == 0) { /* run 1 used up */
+        return;
+      }
+    }
+
+    SORT_SWAP(*dst2, *dst0); /* otherwise emit the head of run 2 */
+    dst2++;
+    dst0++;
+  } while (--l2);
+
+  do { /* run 2 used up: emit what is left of run 1 */
+    SORT_SWAP(*dst1, *dst0);
+    dst1++;
+    dst0++;
+  } while (--l1);
+}
+
+SORT_DEF size_t MERGE_SORT_IN_PLACE_BACKMERGE(SORT_TYPE * dst1, size_t l1, SORT_TYPE * dst2,
+    size_t l2) { /* backward merge: dst1 and dst2 point at the LAST elements of runs of length l1 and l2; elements are swapped into the area ending at dst2 + l1 */
+  size_t res; /* number of run-1 elements still unmerged when run 2 ran out */
+  SORT_TYPE *dst0 = dst2 + l1; /* output cursor, moves downwards */
+  /* Run 1 starts at dst1 - (l1 - 1). Spelled this way, neither a wrapped unsigned index (1 - l1) nor an intermediate pointer before the run is formed. */
+  if (SORT_CMP(*(dst1 - (l1 - 1)), dst2[0]) >= 0) { /* first element of run 1 >= last element of run 2 */
+    MERGE_SORT_IN_PLACE_ASWAP(dst1 - (l1 - 1), dst0 - (l1 - 1), l1); /* swap run 1 straight into the top of the output area */
+    return l1;
+  }
+
+  do {
+    while (SORT_CMP(*dst2, *dst1) < 0) { /* tail of run 1 is strictly larger: emit it */
+      SORT_SWAP(*dst1, *dst0);
+      dst1--;
+      dst0--;
+
+      if (--l1 == 0) { /* run 1 used up */
+        return 0;
+      }
+    }
+
+    SORT_SWAP(*dst2, *dst0); /* otherwise emit the tail of run 2 */
+    dst2--;
+    dst0--;
+  } while (--l2);
+
+  res = l1;
+
+  do { /* run 2 used up: emit what is left of run 1 */
+    SORT_SWAP(*dst1, *dst0);
+    dst1--;
+    dst0--;
+  } while (--l1);
+
+  return res;
+}
+
+/* Block-wise merge over dst[0..len) in blocks of r elements, using dst[len..len+r) as the swap buffer. NOTE(review): the earlier wording was "merge dst[p0..p1) by buffer dst[p1..p1+r)"; p0/p1 are not names used in this function -- confirm the intended ranges. */
+SORT_DEF void MERGE_SORT_IN_PLACE_RMERGE(SORT_TYPE *dst, size_t len, size_t lp, size_t r) {
+  size_t i, lq;
+  int cv;
+
+  if (SORT_CMP(dst[lp], dst[lp - 1]) >= 0) { /* dst[lp] is not smaller than its left neighbour: nothing to do */
+    return;
+  }
+
+  lq = lp;
+
+  for (i = 0; i < len; i += r) {
+    /* select, among the block at i and the blocks at lp, lp+r, ..., lq, the one with the smallest first element */
+    size_t q = i, j;
+
+    for (j = lp; j <= lq; j += r) {
+      cv = SORT_CMP(dst[j], dst[q]);
+
+      if (cv == 0) { /* equal first elements: decide by the blocks' last elements */
+        cv = SORT_CMP(dst[j + r - 1], dst[q + r - 1]);
+      }
+
+      if (cv < 0) {
+        q = j;
+      }
+    }
+
+    if (q != i) {
+      MERGE_SORT_IN_PLACE_ASWAP(dst + i, dst + q, r); /* swap it with current position */
+
+      if (q == lq && q < (len - r)) { /* the last candidate block was taken: extend the candidate range by one block */
+        lq += r;
+      }
+    }
+
+    if (i != 0 && SORT_CMP(dst[i], dst[i - 1]) < 0) {
+      MERGE_SORT_IN_PLACE_ASWAP(dst + len, dst + i, r); /* swap current position with buffer */
+      MERGE_SORT_IN_PLACE_BACKMERGE(dst + (len + r - 1), r, dst + (i - 1),
+                                    r);  /* buffer :merge: dst[i-r..i) -> dst[i-r..i+r) */
+    }
+
+    if (lp == i) {
+      lp += r;
+    }
+  }
+}
+
+/* In-place Merge Sort implementation. (c)2012, Andrey Astrelin, astrelin@tochka.ru */
+SORT_DEF void MERGE_SORT_IN_PLACE(SORT_TYPE *dst, const size_t len) {
+  /* r: block length from rbnd(); lr: length of the leading part, always a multiple of r */
+  size_t r = rbnd(len);
+  size_t lr = (len / r - 1) * r;
+  SORT_TYPE *dst1 = dst - 1; /* advanced by 2 before each use in the loop below */
+  size_t p, m, q, q1, p0;
+  /* don't bother sorting an array of size <= 1 */
+  if (len <= 1) {
+    return;
+  }
+
+  if (len <= SMALL_SORT_BND) {
+    SMALL_SORT(dst, len);
+    return;
+  }
+  /* walk dst[0..lr) two elements at a time; NOTE(review): looks like a bottom-up run build */
+  for (p = 2; p <= lr; p += 2) {
+    dst1 += 2;
+    /* put the pair dst1[-1], dst1[0] in order */
+    if (SORT_CMP(dst1[0], dst1[-1]) < 0) {
+      SORT_SWAP(dst1[0], dst1[-1]);
+    }
+
+    if (p & 2) {
+      continue;
+    }
+
+    m = len - p; /* number of elements from index p to the end */
+    q = 2;
+    /* double q while the q elements before p already continue the q elements before them */
+    while ((p & q) == 0) {
+      if (SORT_CMP(dst1[1 - q], dst1[-(int) q]) < 0) {
+        break;
+      }
+
+      q *= 2;
+    }
+
+    if (p & q) {
+      continue;
+    }
+    /* q < m: the last q elements of the array are available as merge space */
+    if (q < m) {
+      p0 = len - q;
+      MERGE_SORT_IN_PLACE_ASWAP(dst + p - q, dst + p0, q);
+
+      for (;;) {
+        q1 = 2 * q;
+
+        if ((q1 > m) || (p & q1)) {
+          break;
+        }
+
+        p0 = len - q1;
+        MERGE_SORT_IN_PLACE_FRONTMERGE(dst + (p - q1), q, dst + p0 + q, q);
+        q = q1;
+      }
+
+      MERGE_SORT_IN_PLACE_BACKMERGE(dst + (len - 1), q, dst1 - q, q);
+      q *= 2;
+    }
+
+    q1 = q;
+    /* q1: q halved until it no longer exceeds m; passed to RMERGE as its block length */
+    while (q1 > m) {
+      q1 /= 2;
+    }
+
+    while ((q & p) == 0) {
+      q *= 2;
+      MERGE_SORT_IN_PLACE_RMERGE(dst + (p - q), q, q / 2, q1);
+    }
+  }
+
+  q1 = 0;
+  /* for each set bit q of lr (from r upwards), block-merge the accumulated q1 elements */
+  for (q = r; q < lr; q *= 2) {
+    if ((lr & q) != 0) {
+      q1 += q;
+
+      if (q1 != q) {
+        MERGE_SORT_IN_PLACE_RMERGE(dst + (lr - q1), q1, q, r);
+      }
+    }
+  }
+  /* tail handling: sort dst[lr..len), swap it to the front, back-merge, sort the front part */
+  m = len - lr;
+  MERGE_SORT_IN_PLACE(dst + lr, m);
+  MERGE_SORT_IN_PLACE_ASWAP(dst, dst + lr, m);
+  m += MERGE_SORT_IN_PLACE_BACKMERGE(dst + (m - 1), m, dst + (lr - 1), lr - m);
+  MERGE_SORT_IN_PLACE(dst, m);
+}
+
+/* Standard merge sort: sorts dst[0..size), using newdst[0..size) as the merge scratch area */
+SORT_DEF void MERGE_SORT_RECURSIVE(SORT_TYPE *newdst, SORT_TYPE *dst, const size_t size) {
+  const size_t half = size / 2;
+  size_t lo = 0;    /* read cursor in the left half */
+  size_t hi = half; /* read cursor in the right half */
+  size_t w;         /* write cursor in newdst */
+
+  /* nothing to do for 0 or 1 elements */
+  if (size < 2) {
+    return;
+  }
+
+  if (size <= SMALL_SORT_BND) {
+    SMALL_STABLE_SORT(dst, size);
+    return;
+  }
+
+  MERGE_SORT_RECURSIVE(newdst, dst, half);
+  MERGE_SORT_RECURSIVE(newdst, dst + half, size - half);
+
+  for (w = 0; w != size; w++) {
+    int take_left;
+    if (lo >= half) {
+      take_left = 0;
+    } else if (hi >= size) {
+      take_left = 1;
+    } else {
+      /* on a tie the left element is taken first */
+      take_left = SORT_CMP(dst[lo], dst[hi]) <= 0;
+    }
+
+    if (take_left) {
+      newdst[w] = dst[lo++];
+    } else {
+      newdst[w] = dst[hi++];
+    }
+  }
+
+  SORT_TYPE_CPY(dst, newdst, size);
+}
+
+/* Standard merge sort entry point: allocates a size-element scratch buffer for the recursion */
+SORT_DEF void MERGE_SORT(SORT_TYPE *dst, const size_t size) {
+  SORT_TYPE *scratch;
+
+  /* nothing to do for 0 or 1 elements */
+  if (size < 2) {
+    return;
+  }
+
+  if (size > SMALL_SORT_BND) {
+    scratch = SORT_NEW_BUFFER(size);
+    MERGE_SORT_RECURSIVE(scratch, dst, size);
+    SORT_DELETE_BUFFER(scratch);
+    return;
+  }
+
+  SMALL_STABLE_SORT(dst, size);
+}
+
+/* Partition dst[left..right] around the value at dst[pivot]; returns the pivot's final index */
+static __inline size_t QUICK_SORT_PARTITION(SORT_TYPE *dst, const size_t left,
+    const size_t right, const size_t pivot) {
+  SORT_TYPE pivot_value = dst[pivot];
+  size_t store = left; /* next slot for an element smaller than the pivot */
+  size_t scan;
+  int any_differs = 0; /* OR of all comparison results: stays 0 only if every one was 0 */
+
+  /* park the pivot at the right end while scanning */
+  SORT_SWAP(dst[pivot], dst[right]);
+
+  for (scan = left; scan < right; scan++) {
+    const int order = SORT_CMP(dst[scan], pivot_value);
+    any_differs |= order;
+
+    if (order < 0) {
+      SORT_SWAP(dst[scan], dst[store]);
+      store++;
+    }
+  }
+
+  /* drop the pivot between the smaller elements and the rest */
+  SORT_SWAP(dst[right], dst[store]);
+
+  /* everything compared equal to the pivot: report SIZE_MAX instead of an index */
+  if (any_differs != 0) {
+    return store;
+  }
+  return SIZE_MAX;
+}
+
+/* Based on Knuth vol. 3
+static __inline size_t QUICK_SORT_HOARE_PARTITION(SORT_TYPE *dst, const size_t l,
+    const size_t r, const size_t pivot) {
+  SORT_TYPE value;
+  size_t i = l + 1;
+  size_t j = r;
+
+  if (pivot != l) {
+    SORT_SWAP(dst[pivot], dst[l]);
+  }
+  value = dst[l];
+
+  while (1) {
+    while (SORT_CMP(dst[i], value) < 0) {
+      i++;
+    }
+    while (SORT_CMP(value, dst[j]) < 0) {
+      j--;
+    }
+    if (j <= i) {
+      SORT_SWAP(dst[l], dst[j]);
+      return j;
+    }
+    SORT_SWAP(dst[i], dst[j]);
+    i++;
+    j--;
+  }
+  return 0;
+}
+*/
+
+
+/* Return whichever of the indices a, b, c refers to the median of dst[a], dst[b], dst[c]. */
+static __inline size_t MEDIAN(const SORT_TYPE *dst, const size_t a, const size_t b,
+                              const size_t c) {
+  const int AB = SORT_CMP(dst[a], dst[b]) < 0;
+  /* every test below is strict: "x < y" in the notes means SORT_CMP(x, y) < 0 */
+  if (AB) {
+    /* a < b */
+    const int BC = SORT_CMP(dst[b], dst[c]) < 0;
+
+    if (BC) {
+      /* a < b < c */
+      return b;
+    } else {
+      /* a < b, c <= b */
+      const int AC = SORT_CMP(dst[a], dst[c]) < 0;
+
+      if (AC) {
+        /* a < c <= b */
+        return c;
+      } else {
+        /* c <= a < b */
+        return a;
+      }
+    }
+  } else {
+    /* b <= a: a is the median exactly when it is also below c, so compare a with c */
+    const int AC = SORT_CMP(dst[a], dst[c]) < 0;
+
+    if (AC) {
+      /* b <= a < c */
+      return a;
+    } else {
+      /* b <= a, c <= a */
+      const int BC = SORT_CMP(dst[b], dst[c]) < 0;
+
+      if (BC) {
+        /* b < c <= a */
+        return c;
+      } else {
+        /* c <= b <= a */
+        return b;
+      }
+    }
+  }
+}
+/* Quick sort of dst[original_left..original_right] (inclusive bounds), with a heap sort fallback */
+static void QUICK_SORT_RECURSIVE(SORT_TYPE *dst, const size_t original_left,
+                                 const size_t original_right) {
+  size_t left;
+  size_t right;
+  size_t pivot;
+  size_t new_pivot;
+  size_t middle;
+  int loop_count = 0;
+  const int max_loops = 64 - CLZ(original_right - original_left); /* ~lg N */
+  left = original_left;
+  right = original_right;
+
+  while (1) {
+    if (right <= left) {
+      return;
+    }
+    /* small ranges are handed to SMALL_SORT */
+    if ((right - left + 1U) <= SMALL_SORT_BND) {
+      SMALL_SORT(&dst[left], right - left + 1U);
+      return;
+    }
+
+    if (++loop_count >= max_loops) {
+      /* we have recursed / looped too many times; switch to heap sort */
+      HEAP_SORT(&dst[left], right - left + 1U);
+      return;
+    }
+
+    /* pivot: median of (left, middle, right), then median of that and the two quarter points */
+    middle = left + ((right - left) >> 1);
+    pivot = MEDIAN((const SORT_TYPE *) dst, left, middle, right);
+    pivot = MEDIAN((const SORT_TYPE *) dst, left + ((middle - left) >> 1), pivot,
+                   middle + ((right - middle) >> 1));
+    new_pivot = QUICK_SORT_PARTITION(dst, left, right, pivot);
+
+    /* check for partition all equal */
+    if (new_pivot == SIZE_MAX) {
+      return;
+    }
+
+    /* recurse only on the small part to avoid degenerate stack sizes */
+    /* and manually do tail call on the large part (unsigned sizes wrap if new_pivot is an end) */
+    if (new_pivot - 1U - left > right - new_pivot - 1U) {
+      /* left is bigger than right */
+      QUICK_SORT_RECURSIVE(dst, new_pivot + 1U, right);
+      /* tail call for left */
+      right = new_pivot - 1U;
+    } else {
+      /* right is bigger than left */
+      QUICK_SORT_RECURSIVE(dst, left, new_pivot - 1U);
+      /* tail call for right */
+      left = new_pivot + 1U;
+    }
+  }
+}
+
+void QUICK_SORT(SORT_TYPE *dst, const size_t size) {
+  /* Quick sort entry point; the recursive worker takes inclusive index bounds. */
+  /* Arrays with fewer than two elements are already sorted. */
+  if (size >= 2) {
+    const size_t last = size - 1U;
+
+    QUICK_SORT_RECURSIVE(dst, 0U, last);
+  }
+}
+
+
+/* timsort implementation, based on timsort.txt */
+
+static __inline void REVERSE_ELEMENTS(SORT_TYPE *dst, size_t start, size_t end) {
+  /* Reverse dst[start..end] (both bounds inclusive) by swapping inward from the two ends. */
+  size_t lo = start;
+  size_t hi = end;
+  /* stop as soon as the two cursors meet or cross */
+  while (lo < hi) {
+    SORT_SWAP(dst[lo], dst[hi]);
+    lo++;
+    hi--;
+  }
+}
+
+static size_t COUNT_RUN(SORT_TYPE *dst, const size_t start, const size_t size) {
+  /*
+   * Measure the run that begins at dst[start] and return its length.
+   *
+   * dst   - array being sorted
+   * start - index of the first element of the run
+   * size  - total number of elements in dst
+   *
+   * A strictly decreasing run is reversed in place before returning, so the
+   * reported elements end up in non-decreasing order. The scan never advances
+   * past index size - 1.
+   */
+  size_t end;
+  int ascending;
+
+  /* a single trailing element is a run of one */
+  if (size - start == 1) {
+    return 1;
+  }
+
+  /* at the tail: order the last two elements and report a run of two */
+  if (start >= size - 2) {
+    if (SORT_CMP(dst[size - 2], dst[size - 1]) > 0) {
+      SORT_SWAP(dst[size - 2], dst[size - 1]);
+    }
+
+    return 2;
+  }
+
+  /* the first pair decides the direction of the run */
+  ascending = SORT_CMP(dst[start], dst[start + 1]) <= 0;
+  end = start + 2;
+
+  if (ascending) {
+    /* increasing run: grow while neighbours stay in non-decreasing order */
+    while (end != size - 1 && SORT_CMP(dst[end - 1], dst[end]) <= 0) {
+      end++;
+    }
+  } else {
+    /* decreasing run: grow while each element is strictly below its predecessor */
+    while (end != size - 1 && SORT_CMP(dst[end - 1], dst[end]) > 0) {
+      end++;
+    }
+
+    /* reverse in-place */
+    REVERSE_ELEMENTS(dst, start, end - 1);
+  }
+
+  /* the run is dst[start..end-1] */
+  return end - start;
+}
+
+static int CHECK_INVARIANT(TIM_SORT_RUN_T *stack, const int stack_curr) {
+  /*
+   * Return 1 when the run lengths on top of the stack satisfy the merge invariants and
+   * 0 when a collapse is needed. With X, Y, Z the third-from-top, second-from-top and
+   * top run lengths, the rules are Y > Z and (when three runs exist) X > Y + Z.
+   */
+  const TIM_SORT_RUN_T *top;
+  size_t below_two, below_one, newest;
+
+  /* zero or one run: nothing to compare */
+  if (stack_curr < 2) {
+    return 1;
+  }
+
+  top = stack + stack_curr; /* one past the newest run */
+  below_one = top[-2].length;
+  newest = top[-1].length;
+
+  /* exactly two runs: only the pairwise rule applies */
+  if (stack_curr == 2) {
+    return below_one > newest;
+  }
+
+  below_two = top[-3].length;
+
+  /* three or more runs: both rules must hold */
+  return (below_two > below_one + newest) && (below_one > newest);
+}
+
+typedef struct { /* scratch space shared by the tim sort merges */
+  size_t alloc; /* capacity of storage, in SORT_TYPE elements */
+  SORT_TYPE *storage; /* buffer obtained from realloc; NULL while nothing is allocated */
+} TEMP_STORAGE_T;
+/* Ensure the scratch buffer holds new_size elements; never shrinks, exits on allocation failure */
+static void TIM_SORT_RESIZE(TEMP_STORAGE_T *store, const size_t new_size) {
+  SORT_TYPE *grown;
+  if ((store->storage != NULL) && (store->alloc >= new_size)) {
+    return; /* current buffer is already large enough */
+  }
+  grown = (SORT_TYPE *)realloc(store->storage, new_size * sizeof(SORT_TYPE));
+  if (grown == NULL) {
+    fprintf(stderr, "Error allocating temporary storage for tim sort: need %lu bytes",
+            (unsigned long)(sizeof(SORT_TYPE) * new_size));
+    exit(1);
+  }
+  store->storage = grown;
+  store->alloc = new_size;
+}
+/* Merge the two runs at stack[stack_curr - 2] and stack[stack_curr - 1] in place inside dst */
+static void TIM_SORT_MERGE(SORT_TYPE *dst, const TIM_SORT_RUN_T *stack, const int stack_curr,
+                           TEMP_STORAGE_T *store) {
+  const size_t A = stack[stack_curr - 2].length;
+  const size_t B = stack[stack_curr - 1].length;
+  const size_t curr = stack[stack_curr - 2].start;
+  SORT_TYPE *storage;
+  size_t i, j, k;
+  TIM_SORT_RESIZE(store, MIN(A, B)); /* only the shorter run is copied out */
+  storage = store->storage;
+
+  /* left merge */
+  if (A < B) {
+    SORT_TYPE_CPY(storage, &dst[curr], A);
+    i = 0;
+    j = curr + A;
+    /* fill dst forwards: i reads the copied left run, j reads the right run still in dst */
+    for (k = curr; k < curr + A + B; k++) {
+      if ((i < A) && (j < curr + A + B)) {
+        if (SORT_CMP(storage[i], dst[j]) <= 0) {
+          dst[k] = storage[i++];
+        } else {
+          dst[k] = dst[j++];
+        }
+      } else if (i < A) {
+        dst[k] = storage[i++];
+      } else {
+        break;
+      }
+    }
+  } else {
+    /* right merge */
+    SORT_TYPE_CPY(storage, &dst[curr + A], B);
+    i = B;
+    j = curr + A;
+    k = curr + A + B;
+    /* fill dst backwards: i reads the copied right run, j reads the left run still in dst */
+    while (k-- > curr) {
+      if ((i > 0) && (j > curr)) {
+        if (SORT_CMP(dst[j - 1], storage[i - 1]) > 0) {
+          dst[k] = dst[--j];
+        } else {
+          dst[k] = storage[--i];
+        }
+      } else if (i > 0) {
+        dst[k] = storage[--i];
+      } else {
+        break;
+      }
+    }
+  }
+}
+/* Merge runs on top of the stack until its length invariants hold; returns the new stack depth */
+static int TIM_SORT_COLLAPSE(SORT_TYPE *dst, TIM_SORT_RUN_T *stack, int stack_curr,
+                             TEMP_STORAGE_T *store, const size_t size) {
+  while (1) {
+    size_t A, B, C, D; /* run lengths, D being the newest run */
+    int ABC, BCD, CD;
+
+    /* if the stack only has one thing on it, we are done with the collapse */
+    if (stack_curr <= 1) {
+      break;
+    }
+
+    /* if this is the last merge, just do it */
+    if ((stack_curr == 2) && (stack[0].length + stack[1].length == size)) {
+      TIM_SORT_MERGE(dst, stack, stack_curr, store);
+      stack[0].length += stack[1].length;
+      stack_curr--;
+      break;
+    }
+    /* check if the invariant is off for a stack of 2 elements */
+    else if ((stack_curr == 2) && (stack[0].length <= stack[1].length)) {
+      TIM_SORT_MERGE(dst, stack, stack_curr, store);
+      stack[0].length += stack[1].length;
+      stack_curr--;
+      break;
+    } else if (stack_curr == 2) {
+      break;
+    }
+
+    B = stack[stack_curr - 3].length;
+    C = stack[stack_curr - 2].length;
+    D = stack[stack_curr - 1].length;
+    /* the fourth run from the top only takes part when it exists */
+    if (stack_curr >= 4) {
+      A = stack[stack_curr - 4].length;
+      ABC = (A <= B + C);
+    } else {
+      ABC = 0;
+    }
+
+    BCD = (B <= C + D) || ABC;
+    CD = (C <= D);
+
+    /* Both invariants are good */
+    if (!BCD && !CD) {
+      break;
+    }
+
+    /* left merge */
+    if (BCD && !CD) {
+      TIM_SORT_MERGE(dst, stack, stack_curr - 1, store);
+      stack[stack_curr - 3].length += stack[stack_curr - 2].length;
+      stack[stack_curr - 2] = stack[stack_curr - 1];
+      stack_curr--;
+    } else {
+      /* right merge */
+      TIM_SORT_MERGE(dst, stack, stack_curr, store);
+      stack[stack_curr - 2].length += stack[stack_curr - 1].length;
+      stack_curr--;
+    }
+  }
+
+  return stack_curr;
+}
+/* Push the next run onto run_stack; returns 0 once dst is fully consumed and merged, else 1 */
+static __inline int PUSH_NEXT(SORT_TYPE *dst,
+                              const size_t size,
+                              TEMP_STORAGE_T *store,
+                              const size_t minrun,
+                              TIM_SORT_RUN_T *run_stack,
+                              size_t *stack_curr,
+                              size_t *curr) {
+  size_t len = COUNT_RUN(dst, *curr, size);
+  size_t run = minrun;
+  /* never ask for more elements than remain */
+  if (run > size - *curr) {
+    run = size - *curr;
+  }
+  /* natural run shorter than the target: extend it with a binary insertion sort */
+  if (run > len) {
+    BINARY_INSERTION_SORT_START(&dst[*curr], len, run);
+    len = run;
+  }
+
+  run_stack[*stack_curr].start = *curr;
+  run_stack[*stack_curr].length = len;
+  (*stack_curr)++;
+  *curr += len;
+
+  if (*curr == size) {
+    /* finish up */
+    while (*stack_curr > 1) {
+      TIM_SORT_MERGE(dst, run_stack, (int)*stack_curr, store);
+      run_stack[*stack_curr - 2].length += run_stack[*stack_curr - 1].length;
+      (*stack_curr)--;
+    }
+    /* the scratch buffer is released here, on the final call */
+    if (store->storage != NULL) {
+      free(store->storage);
+      store->storage = NULL;
+    }
+
+    return 0;
+  }
+
+  return 1;
+}
+
+SORT_DEF void TIM_SORT(SORT_TYPE *dst, const size_t size) {
+  /*
+   * Tim sort driver: seed the run stack with three runs, then alternate between
+   * collapsing the stack while its invariant is violated and pushing the next run.
+   * PUSH_NEXT returns 0 once the input is exhausted and the final merges are done.
+   */
+  TIM_SORT_RUN_T pending[TIM_SORT_STACK_SIZE];
+  TEMP_STORAGE_T scratch;
+  size_t min_run_len;
+  size_t depth = 0; /* number of runs currently on the stack */
+  size_t next = 0;  /* index of the first element not yet placed in a run */
+  int seeded;
+
+  /* don't bother sorting an array of size 1 */
+  if (size <= 1) {
+    return;
+  }
+
+  /* short inputs are handed to the small stable sort */
+  if (size < 64) {
+    SMALL_STABLE_SORT(dst, size);
+    return;
+  }
+
+  /* compute the minimum run length */
+  min_run_len = compute_minrun(size);
+
+  /* temporary storage for merges starts out empty */
+  scratch.alloc = 0;
+  scratch.storage = NULL;
+
+  for (seeded = 0; seeded < 3; seeded++) {
+    if (!PUSH_NEXT(dst, size, &scratch, min_run_len, pending, &depth, &next)) {
+      return;
+    }
+  }
+
+  for (;;) {
+    if (CHECK_INVARIANT(pending, (int)depth)) {
+      if (!PUSH_NEXT(dst, size, &scratch, min_run_len, pending, &depth, &next)) {
+        return;
+      }
+    } else {
+      depth = TIM_SORT_COLLAPSE(dst, pending, (int)depth, &scratch, size);
+    }
+  }
+}
+
+/* heap sort: based on wikipedia */
+/* Sift dst[start] down within dst[0..end]; node k has children 2k and 2k+1 (node 0: 0 and 1) */
+static __inline void HEAP_SIFT_DOWN(SORT_TYPE *dst, const size_t start, const size_t end) {
+  size_t root = start;
+
+  while ((root << 1) <= end) {
+    size_t child = root << 1;
+    /* pick the larger of the two children when the second one exists */
+    if ((child < end) && (SORT_CMP(dst[child], dst[child + 1]) < 0)) {
+      child++;
+    }
+    /* a larger child moves up; otherwise nothing is out of order here and we stop */
+    if (SORT_CMP(dst[root], dst[child]) < 0) {
+      SORT_SWAP(dst[root], dst[child]);
+      root = child;
+    } else {
+      return;
+    }
+  }
+}
+
+static __inline void HEAPIFY(SORT_TYPE *dst, const size_t size) {
+  /*
+   * Build the heap by sifting down every index from size / 2 back to 0, inclusive.
+   * The post-decrement test lets index 0 be processed before the loop ends.
+   * HEAP_SORT calls this only after rejecting size <= 1.
+   */
+  const size_t last = size - 1;
+  size_t node = size >> 1;
+
+  do {
+    HEAP_SIFT_DOWN(dst, node, last);
+  } while (node-- != 0);
+}
+
+SORT_DEF void HEAP_SORT(SORT_TYPE *dst, const size_t size) {
+  /* Heap sort: heapify, then repeatedly move the heap's top element behind the shrinking heap. */
+  size_t tail;
+
+  /* don't bother sorting an array of size <= 1 */
+  if (size < 2) {
+    return;
+  }
+
+  HEAPIFY(dst, size);
+  for (tail = size - 1; tail != 0; tail--) {
+    /* park the top of the heap at tail, then repair the heap in dst[0..tail-1] */
+    SORT_SWAP(dst[tail], dst[0]);
+    HEAP_SIFT_DOWN(dst, 0, tail - 1);
+  }
+}
+
+/********* Sqrt sorting *********************************/
+/*                                                       */
+/* (c) 2014 by Andrey Astrelin                           */
+/*                                                       */
+/*                                                       */
+/* Stable sorting that works in O(N*log(N)) worst time   */
+/* and uses O(sqrt(N)) extra memory                      */
+/*                                                       */
+/* Define SORT_TYPE and SORT_CMP                         */
+/* and then call SqrtSort() function                     */
+/*                                                       */
+/*********************************************************/
+
+#define SORT_CMP_A(a,b) SORT_CMP(*(a),*(b)) /* SORT_CMP applied through two element pointers */
+
+static __inline void SQRT_SORT_SWAP_1(SORT_TYPE *a, SORT_TYPE *b) {
+  SORT_TYPE held = *b; /* exchange the two pointed-to elements */
+  *b = *a;
+  *a = held;
+}
+
+static __inline void SQRT_SORT_SWAP_N(SORT_TYPE *a, SORT_TYPE *b, int n) {
+  for (; n != 0; n--, a++, b++) { /* swap n elements pairwise, front to back */
+    SQRT_SORT_SWAP_1(a, b);
+  }
+}
+
+/* Backward merge: arr[0,L1-1]++arr[L1,L1+L2-1] -> arr[M,M+L1+L2-1], filled from the top down */
+static void SQRT_SORT_MERGE_RIGHT(SORT_TYPE *arr, int L1, int L2, int M) {
+  int p0 = L1 + L2 + M - 1, p2 = L1 + L2 - 1, p1 = L1 - 1;
+  /* p0: write cursor; p1 / p2: read cursors at the ends of the first / second run */
+  while (p1 >= 0) {
+    if (p2 < L1 || SORT_CMP_A(arr + p1, arr + p2) > 0) {
+      arr[p0--] = arr[p1--];
+    } else {
+      arr[p0--] = arr[p2--];
+    }
+  }
+  /* first run placed: move the rest of the second run unless it already sits at its target */
+  if (p2 != p0) while (p2 >= L1) {
+      arr[p0--] = arr[p2--];
+    }
+}
+
+/* arr[M..-1] - free, arr[0,L1-1]++arr[L1,L1+L2-1] -> arr[M,M+L1+L2-1] */
+static void SQRT_SORT_MERGE_LEFT_WITH_X_BUF(SORT_TYPE *arr, int L1, int L2, int M) {
+  int p0 = 0, p1 = L1; /* read cursors of the first and second run; M is the write cursor */
+  L2 += L1; /* from here on L2 is the end index of the second run */
+  /* on equal elements the one from the first run is written first */
+  while (p1 < L2) {
+    if (p0 == L1 || SORT_CMP_A(arr + p0, arr + p1) > 0) {
+      arr[M++] = arr[p1++];
+    } else {
+      arr[M++] = arr[p0++];
+    }
+  }
+  /* second run placed: move the rest of the first run unless it already sits at its target */
+  if (M != p0) while (p0 < L1) {
+      arr[M++] = arr[p0++];
+    }
+}
+
+/* arr[0,L1-1] ++ arr2[0,L2-1] -> arr[-L2,L1-1],  arr2 is "before" arr1 */
+static void SQRT_SORT_MERGE_DOWN(SORT_TYPE *arr, SORT_TYPE *arr2, int L1, int L2) {
+  int p0 = 0, p1 = 0, M = -L2; /* read cursors in arr and arr2; M is the write cursor in arr */
+  /* on equal elements the one from arr2 is written first */
+  while (p1 < L2) {
+    if (p0 == L1 || SORT_CMP_A(arr + p0, arr2 + p1) >= 0) {
+      arr[M++] = arr2[p1++];
+    } else {
+      arr[M++] = arr[p0++];
+    }
+  }
+  /* arr2 placed: move the rest of arr unless it already sits at its target */
+  if (M != p0) while (p0 < L1) {
+      arr[M++] = arr[p0++];
+    }
+}
+/* Merge arr[0,*alen1-1] with the next len2 elements into arr[-lkeys..]; updates *alen1, *atype */
+static void SQRT_SORT_SMART_MERGE_WITH_X_BUF(SORT_TYPE *arr, int *alen1, int *atype, int len2,
+    int lkeys) {
+  int p0 = -lkeys, p1 = 0, p2 = *alen1, q1 = p2, q2 = p2 + len2;
+  int ftype = 1 - *atype; /* 1 if inverted */
+  /* ftype == 1 lets the first run win ties; ftype == 0 lets the second run win them */
+  while (p1 < q1 && p2 < q2) {
+    if (SORT_CMP_A(arr + p1, arr + p2) - ftype < 0) {
+      arr[p0++] = arr[p1++];
+    } else {
+      arr[p0++] = arr[p2++];
+    }
+  }
+  /* leftovers of the first run: report their count and move them to the end of the region */
+  if (p1 < q1) {
+    *alen1 = q1 - p1;
+
+    while (p1 < q1) {
+      arr[--q2] = arr[--q1];
+    }
+  } else {
+    *alen1 = q2 - p2; /* leftovers belong to the second run, so the type flips */
+    *atype = ftype;
+  }
+}
+
+
+/*
+  arr - starting array. arr[-lblock..-1] - buffer (if havebuf).
+  lblock - length of regular blocks. First nblocks are stable sorted by 1st elements and key-coded
+  keys - arrays of keys, in same order as blocks. key<midkey means stream A
+  nblock2 are regular blocks from stream A. llast is length of last (irregular) block from stream B, that should go before nblock2 blocks.
+  llast=0 requires nblock2=0 (no irregular blocks). llast>0, nblock2=0 is possible.
+*/
+static void SQRT_SORT_MERGE_BUFFERS_LEFT_WITH_X_BUF(int *keys, int midkey, SORT_TYPE *arr,
+    int nblock, int lblock, int nblock2, int llast) {
+  int l, prest, lrest, frest, pidx, cidx, fnext;
+  /* no key-coded blocks: one plain merge of the nblock2 blocks with the irregular block */
+  if (nblock == 0) {
+    l = nblock2 * lblock;
+    SQRT_SORT_MERGE_LEFT_WITH_X_BUF(arr, l, llast, -lblock);
+    return;
+  }
+
+  lrest = lblock; /* length of the pending (not yet written) remainder */
+  frest = keys[0] < midkey ? 0 : 1; /* stream of the pending remainder: 0 = A, 1 = B */
+  pidx = lblock;
+
+  for (cidx = 1; cidx < nblock; cidx++, pidx += lblock) {
+    prest = pidx - lrest;
+    fnext = keys[cidx] < midkey ? 0 : 1;
+    /* same stream as the remainder: just move the remainder lblock positions to the left */
+    if (fnext == frest) {
+      SORT_TYPE_CPY(arr + prest - lblock, arr + prest, lrest);
+      prest = pidx;
+      lrest = lblock;
+    } else {
+      SQRT_SORT_SMART_MERGE_WITH_X_BUF(arr + prest, &lrest, &frest, lblock, lblock);
+    }
+  }
+
+  prest = pidx - lrest;
+
+  if (llast) {
+    if (frest) {
+      SORT_TYPE_CPY(arr + prest - lblock, arr + prest, lrest);
+      prest = pidx;
+      lrest = lblock * nblock2;
+      frest = 0;
+    } else {
+      lrest += lblock * nblock2;
+    }
+
+    SQRT_SORT_MERGE_LEFT_WITH_X_BUF(arr + prest, lrest, llast, -lblock);
+  } else {
+    SORT_TYPE_CPY(arr + prest - lblock, arr + prest, lrest);
+  }
+}
+
+/*
+  build blocks of length K
+  input: [-K,-1] elements are buffer
+  output: first K elements are buffer, blocks 2*K and last subblock sorted
+*/
+static void SQRT_SORT_BUILD_BLOCKS(SORT_TYPE *arr, int L, int K) {
+  int m, u, h, p0, p1, rest, restk, p;
+  /* first pass: write each pair two positions to the left, smaller element first */
+  for (m = 1; m < L; m += 2) {
+    u = 0;
+
+    if (SORT_CMP_A(arr + (m - 1), arr + m) > 0) {
+      u = 1;
+    }
+
+    arr[m - 3] = arr[m - 1 + u];
+    arr[m - 2] = arr[m - u];
+  }
+  /* odd L: the unpaired last element moves two positions left as well */
+  if (L % 2) {
+    arr[L - 3] = arr[L - 1];
+  }
+
+  arr -= 2;
+  /* merge runs of length h pairwise, each pass writing its output h positions further left */
+  for (h = 2; h < K; h *= 2) {
+    p0 = 0;
+    p1 = L - 2 * h;
+
+    while (p0 <= p1) {
+      SQRT_SORT_MERGE_LEFT_WITH_X_BUF(arr + p0, h, h, -h);
+      p0 += 2 * h;
+    }
+
+    rest = L - p0;
+    /* tail shorter than 2h: merge it if it spans more than one run, otherwise just move it */
+    if (rest > h) {
+      SQRT_SORT_MERGE_LEFT_WITH_X_BUF(arr + p0, h, rest - h, -h);
+    } else {
+      for (; p0 < L; p0++) {
+        arr[p0 - h] = arr[p0];
+      }
+    }
+
+    arr -= h;
+  }
+  /* last pass runs right to left with MERGE_RIGHT, writing K positions to the right */
+  restk = L % (2 * K);
+  p = L - restk;
+
+  if (restk <= K) {
+    SORT_TYPE_CPY(arr + p + K, arr + p, restk);
+  } else {
+    SQRT_SORT_MERGE_RIGHT(arr + p, K, restk - K, K);
+  }
+
+  while (p > 0) {
+    p -= 2 * K;
+    SQRT_SORT_MERGE_RIGHT(arr + p, K, K, K);
+  }
+}
+
+
+static void SQRT_SORT_SORT_INS(SORT_TYPE *arr, int len) {
+  int sorted_len, pos; /* insertion sort by adjacent swaps */
+  for (sorted_len = 1; sorted_len < len; sorted_len++) {
+    /* move arr[sorted_len] left while it compares smaller than its predecessor */
+    for (pos = sorted_len; pos > 0 && SORT_CMP_A(arr + pos, arr + (pos - 1)) < 0; pos--) {
+      SQRT_SORT_SWAP_1(arr + (pos - 1), arr + pos);
+    }
+  }
+}
+
+/*
+  keys are on the left of arr. Blocks of length LL combined. We'll combine them in pairs
+  LL and nkeys are powers of 2. (2*LL/lblock) keys are guarantied
+*/
+static void SQRT_SORT_COMBINE_BLOCKS(SORT_TYPE *arr, int len, int LL, int lblock, int *tags) {
+  int M, b, NBlk, midkey, lrest, u, i, p, v, kc, nbl2, llast;
+  SORT_TYPE *arr1;
+  M = len / (2 * LL); /* number of full pairs of LL-blocks */
+  lrest = len % (2 * LL); /* length of the incomplete trailing pair */
+  /* a remainder of at most one LL-block has no partner to merge with: leave it out */
+  if (lrest <= LL) {
+    len -= lrest;
+    lrest = 0;
+  }
+
+  for (b = 0; b <= M; b++) {
+    if (b == M && lrest == 0) {
+      break;
+    }
+
+    arr1 = arr + b * 2 * LL;
+    NBlk = (b == M ? lrest : 2 * LL) / lblock;
+    u = NBlk + (b == M ? 1 : 0);
+    /* tags start out as the original block order */
+    for (i = 0; i <= u; i++) {
+      tags[i] = i;
+    }
+
+    midkey = LL / lblock;
+    /* selection sort of the blocks by first element, ties broken by tag; tags move along */
+    for (u = 1; u < NBlk; u++) {
+      p = u - 1;
+
+      for (v = u; v < NBlk; v++) {
+        kc = SORT_CMP_A(arr1 + p * lblock, arr1 + v * lblock);
+
+        if (kc > 0 || (kc == 0 && tags[p] > tags[v])) {
+          p = v;
+        }
+      }
+
+      if (p != u - 1) {
+        SQRT_SORT_SWAP_N(arr1 + (u - 1)*lblock, arr1 + p * lblock, lblock);
+        i = tags[u - 1];
+        tags[u - 1] = tags[p];
+        tags[p] = i;
+      }
+    }
+
+    nbl2 = llast = 0;
+    /* only the trailing pair can end in an irregular (shorter than lblock) block */
+    if (b == M) {
+      llast = lrest % lblock;
+    }
+    /* count the trailing regular blocks whose first element is above the irregular block's */
+    if (llast != 0) {
+      while (nbl2 < NBlk && SORT_CMP_A(arr1 + NBlk * lblock, arr1 + (NBlk - nbl2 - 1)*lblock) < 0) {
+        nbl2++;
+      }
+    }
+
+    SQRT_SORT_MERGE_BUFFERS_LEFT_WITH_X_BUF(tags, midkey, arr1, NBlk - nbl2, lblock, nbl2, llast);
+  }
+  /* shift the len processed elements lblock positions to the right, back to front */
+  for (p = len; --p >= 0;) {
+    arr[p] = arr[p - lblock];
+  }
+}
+
+/* Sort arr[0..Len): extbuf must hold at least lblock elements; Tags is handed to the combiner */
+static void SQRT_SORT_COMMON_SORT(SORT_TYPE *arr, int Len, SORT_TYPE *extbuf, int *Tags) {
+  int lblock, cbuf;
+  /* tiny inputs: plain insertion sort */
+  if (Len < 16) {
+    SQRT_SORT_SORT_INS(arr, Len);
+    return;
+  }
+
+  lblock = 1;
+  /* lblock: smallest power of two whose square is at least Len */
+  while (lblock * lblock < Len) {
+    lblock *= 2;
+  }
+  /* move the first lblock elements out and sort them there, with arr as their scratch space */
+  SORT_TYPE_CPY(extbuf, arr, lblock);
+  SQRT_SORT_COMMON_SORT(extbuf, lblock, arr, Tags);
+  SQRT_SORT_BUILD_BLOCKS(arr + lblock, Len - lblock, lblock);
+  cbuf = lblock;
+  /* keep doubling the combined block length until it covers the whole input */
+  while (Len > (cbuf *= 2)) {
+    SQRT_SORT_COMBINE_BLOCKS(arr + lblock, Len - lblock, cbuf, lblock, Tags);
+  }
+  /* finally merge the separately sorted extbuf elements back in front of the rest */
+  SQRT_SORT_MERGE_DOWN(arr + lblock, extbuf, Len - lblock, lblock);
+}
+
+void SQRT_SORT(SORT_TYPE *arr, size_t Len) {
+  int L = 1;         /* block length: smallest power of two with L * L >= Len */
+  SORT_TYPE *ExtBuf; /* external buffer of L elements */
+  int *Tags;         /* NK block tags, passed down to the block combiner */
+  int NK;
+  /* Sqrt sort entry point. If either allocation fails it returns with arr left unsorted. */
+  while (L * L < Len) {
+    L *= 2;
+  }
+
+  NK = (int)((Len - 1) / L + 2);
+  ExtBuf = SORT_NEW_BUFFER(L);
+
+  if (ExtBuf == NULL) {
+    return;  /* fail */
+  }
+
+  Tags = (int*)malloc(NK * sizeof(int));
+  if (Tags == NULL) {
+    SORT_DELETE_BUFFER(ExtBuf);  /* do not leak the first buffer */
+    return;  /* fail */
+  }
+
+  SQRT_SORT_COMMON_SORT(arr, (int)Len, ExtBuf, Tags);
+  free(Tags);
+  SORT_DELETE_BUFFER(ExtBuf);
+}
+
+/********* Grail sorting *********************************/
+/*                                                       */
+/* (c) 2013 by Andrey Astrelin                           */
+/*                                                       */
+/*                                                       */
+/* Stable sorting that works in O(N*log(N)) worst time   */
+/* and uses O(1) extra memory                            */
+/*                                                       */
+/* Define SORT_TYPE and SORT_CMP                         */
+/* and then call GrailSort() function                    */
+/*                                                       */
+/* For sorting with fixed external buffer (512 items)    */
+/* use GrailSortWithBuffer()                             */
+/*                                                       */
+/* For sorting with dynamic external buffer (O(sqrt(N)) items) */
+/* use GrailSortWithDynBuffer()                          */
+/*                                                       */
+/* Also classic in-place merge sort is implemented       */
+/* under the name of RecStableSort()                     */
+/*                                                       */
+/*********************************************************/
+
+#define GRAIL_EXT_BUFFER_LENGTH 512 /* item count of the fixed external buffer mentioned above */
+
+static __inline void GRAIL_SWAP1(SORT_TYPE *a, SORT_TYPE *b) {
+  SORT_TYPE held = *b; /* exchange the two pointed-to elements */
+  *b = *a;
+  *a = held;
+}
+
+static __inline void GRAIL_SWAP_N(SORT_TYPE *a, SORT_TYPE *b, int n) {
+  for (; n != 0; n--, a++, b++) { /* swap n elements pairwise, front to back */
+    GRAIL_SWAP1(a, b);
+  }
+}
+/* Exchange the adjacent blocks a[0..l1) and a[l1..l1+l2) in place, using block swaps only */
+static void GRAIL_ROTATE(SORT_TYPE *a, int l1, int l2) {
+  while (l1 && l2) {
+    if (l1 <= l2) {
+      GRAIL_SWAP_N(a, a + l1, l1); /* first block swapped forward; it still leads the rest */
+      a += l1;
+      l2 -= l1;
+    } else {
+      GRAIL_SWAP_N(a + (l1 - l2), a + l1, l2); /* second block swapped with the first's tail */
+      l1 -= l2;
+    }
+  }
+}
+
+static int GRAIL_BIN_SEARCH_LEFT(SORT_TYPE *arr, int len, SORT_TYPE *key) {
+  /* Binary search: first index in arr[0..len) whose element is not less than *key, else len. */
+  int below = -1;  /* highest index known to hold an element < *key */
+  int found = len; /* lowest index known to hold an element >= *key */
+
+  while (found - below > 1) {
+    const int probe = below + ((found - below) >> 1);
+    if (SORT_CMP_A(arr + probe, key) < 0) {
+      below = probe;
+    } else {
+      found = probe;
+    }
+  }
+  return found;
+}
+static int GRAIL_BIN_SEARCH_RIGHT(SORT_TYPE *arr, int len, SORT_TYPE *key) {
+  /* Binary search: first index in arr[0..len) whose element is greater than *key, else len. */
+  int at_most = -1; /* highest index known to hold an element <= *key */
+  int found = len;  /* lowest index known to hold an element > *key */
+
+  while (found - at_most > 1) {
+    const int probe = at_most + ((found - at_most) >> 1);
+    if (SORT_CMP_A(arr + probe, key) <= 0) {
+      at_most = probe;
+    } else {
+      found = probe;
+    }
+  }
+  return found;
+}
+
+/* collect up to nkeys distinct elements at the front of arr; returns how many. cost: 2*len+nk^2/2 */
+static int GRAIL_FIND_KEYS(SORT_TYPE *arr, int len, int nkeys) {
+  int h = 1, h0 = 0; /* first key is always here */
+  int u = 1, r; /* h keys found so far, stored sorted at arr[h0..h0+h); u scans the input */
+
+  while (u < len && h < nkeys) {
+    r = GRAIL_BIN_SEARCH_LEFT(arr + h0, h, arr + u);
+    /* arr[u] matches no collected key: slide the key block up to u and insert it at slot r */
+    if (r == h || SORT_CMP_A(arr + u, arr + (h0 + r)) != 0) {
+      GRAIL_ROTATE(arr + h0, h, u - (h0 + h));
+      h0 = u - h;
+      GRAIL_ROTATE(arr + (h0 + r), h - r, 1);
+      h++;
+    }
+
+    u++;
+  }
+  /* bring the key block from position h0 to the start of the array */
+  GRAIL_ROTATE(arr, h0, h);
+  return h;
+}
+
+/* merge arr[0..len1) with arr[len1..len1+len2) by rotations; cost: min(L1,L2)^2+max(L1,L2) */
+static void GRAIL_MERGE_WITHOUT_BUFFER(SORT_TYPE *arr, int len1, int len2) {
+  int h;
+  /* the shorter side drives the loop: its elements are placed one group at a time */
+  if (len1 < len2) {
+    while (len1) {
+      h = GRAIL_BIN_SEARCH_LEFT(arr + len1, len2, arr);
+      /* h elements of the right part are smaller than arr[0]: rotate them in front */
+      if (h != 0) {
+        GRAIL_ROTATE(arr, len1, h);
+        arr += h;
+        len2 -= h;
+      }
+
+      if (len2 == 0) {
+        break;
+      }
+      /* skip left elements that are already <= the first remaining right element */
+      do {
+        arr++;
+        len1--;
+      } while (len1 && SORT_CMP_A(arr, arr + len1) <= 0);
+    }
+  } else {
+    while (len2) {
+      h = GRAIL_BIN_SEARCH_RIGHT(arr, len1, arr + (len1 + len2 - 1));
+      /* left elements from index h on are greater than the last right element: rotate them back */
+      if (h != len1) {
+        GRAIL_ROTATE(arr + h, len1 - h, len2);
+        len1 = h;
+      }
+
+      if (len1 == 0) {
+        break;
+      }
+      /* drop right elements that are already >= the last remaining left element */
+      do {
+        len2--;
+      } while (len2 && SORT_CMP_A(arr + len1 - 1, arr + len1 + len2 - 1) <= 0);
+    }
+  }
+}
+
+/* arr[M..-1] - buffer, arr[0,L1-1]++arr[L1,L1+L2-1] -> arr[M,M+L1+L2-1] */
+static void GRAIL_MERGE_LEFT(SORT_TYPE *arr, int L1, int L2, int M) {
+  int p0 = 0, p1 = L1; /* read cursors of the first and second run; M is the write cursor */
+  L2 += L1; /* from here on L2 is the end index of the second run */
+  /* elements are swapped, not copied, so the buffer contents move into the vacated slots */
+  while (p1 < L2) {
+    if (p0 == L1 || SORT_CMP_A(arr + p0, arr + p1) > 0) {
+      GRAIL_SWAP1(arr + (M++), arr + (p1++));
+    } else {
+      GRAIL_SWAP1(arr + (M++), arr + (p0++));
+    }
+  }
+  /* second run placed: swap the rest of the first run over unless it is already in position */
+  if (M != p0) {
+    GRAIL_SWAP_N(arr + M, arr + p0, L1 - p0);
+  }
+}
+static void GRAIL_MERGE_RIGHT(SORT_TYPE *arr, int L1, int L2, int M) {
+  int p0 = L1 + L2 + M - 1, p2 = L1 + L2 - 1, p1 = L1 - 1;
+  /* backward swap-merge of arr[0,L1-1] and arr[L1,L1+L2-1]; p0 writes down from L1+L2+M-1 */
+  while (p1 >= 0) {
+    if (p2 < L1 || SORT_CMP_A(arr + p1, arr + p2) > 0) {
+      GRAIL_SWAP1(arr + (p0--), arr + (p1--));
+    } else {
+      GRAIL_SWAP1(arr + (p0--), arr + (p2--));
+    }
+  }
+  /* first run placed: swap the rest of the second run over unless it is already in position */
+  if (p2 != p0) while (p2 >= L1) {
+      GRAIL_SWAP1(arr + (p0--), arr + (p2--));
+    }
+}
+/* Swap-merge arr[0,*alen1-1] with the next len2 elements into arr[-lkeys..]; updates *alen1, *atype */
+static void GRAIL_SMART_MERGE_WITH_BUFFER(SORT_TYPE *arr, int *alen1, int *atype, int len2,
+    int lkeys) {
+  int p0 = -lkeys, p1 = 0, p2 = *alen1, q1 = p2, q2 = p2 + len2;
+  int ftype = 1 - *atype; /* 1 if inverted */
+  /* ftype == 1 lets the first run win ties; ftype == 0 lets the second run win them */
+  while (p1 < q1 && p2 < q2) {
+    if (SORT_CMP_A(arr + p1, arr + p2) - ftype < 0) {
+      GRAIL_SWAP1(arr + (p0++), arr + (p1++));
+    } else {
+      GRAIL_SWAP1(arr + (p0++), arr + (p2++));
+    }
+  }
+  /* leftovers of the first run: report their count and swap them to the end of the region */
+  if (p1 < q1) {
+    *alen1 = q1 - p1;
+
+    while (p1 < q1) {
+      GRAIL_SWAP1(arr + (--q1), arr + (--q2));
+    }
+  } else {
+    *alen1 = q2 - p2; /* leftovers belong to the second run, so the type flips */
+    *atype = ftype;
+  }
+}
+static void GRAIL_SMART_MERGE_WITHOUT_BUFFER(SORT_TYPE *arr, int *alen1, int *atype, int _len2) {
+  int len1, len2, ftype, h;
+  /* rotation-based variant of the smart merge; an empty second run leaves everything as is */
+  if (!_len2) {
+    return;
+  }
+
+  len1 = *alen1;
+  len2 = _len2;
+  ftype = 1 - *atype; /* selects which side wins ties, via the "- ftype" in the comparisons */
+  /* only merge when the last left element is out of order with the first right element */
+  if (len1 && SORT_CMP_A(arr + (len1 - 1), arr + len1) - ftype >= 0) {
+    while (len1) {
+      h = ftype ? GRAIL_BIN_SEARCH_LEFT(arr + len1, len2, arr) : GRAIL_BIN_SEARCH_RIGHT(arr + len1, len2,
+          arr);
+      /* h right elements belong before arr[0]: rotate them in front of the left run */
+      if (h != 0) {
+        GRAIL_ROTATE(arr, len1, h);
+        arr += h;
+        len2 -= h;
+      }
+      /* right run used up: the left leftovers keep their type */
+      if (len2 == 0) {
+        *alen1 = len1;
+        return;
+      }
+      /* skip left elements that are already in place */
+      do {
+        arr++;
+        len1--;
+      } while (len1 && SORT_CMP_A(arr, arr + len1) - ftype < 0);
+    }
+  }
+  /* what remains belongs to the second run, so the type flips */
+  *alen1 = len2;
+  *atype = ftype;
+}
+
+/***** Sort With Extra Buffer *****/
+
+/* arr[M..-1] is free space (callers pass M < 0); merge by plain copies: arr[0,L1-1]++arr[L1,L1+L2-1] -> arr[M,M+L1+L2-1] */
+static void GRAIL_MERGE_LEFT_WITH_X_BUF(SORT_TYPE *arr, int L1, int L2, int M) {
+  int p0 = 0, p1 = L1;
+  L2 += L1;
+  /* from here on L2 is the end index (exclusive) of the right run; ties take the left-run element first */
+  while (p1 < L2) {
+    if (p0 == L1 || SORT_CMP_A(arr + p0, arr + p1) > 0) {
+      arr[M++] = arr[p1++];
+    } else {
+      arr[M++] = arr[p0++];
+    }
+  }
+  /* right run is used up: copy the rest of the left run, unless it already sits at the destination (M == p0) */
+  if (M != p0) while (p0 < L1) {
+      arr[M++] = arr[p0++];
+    }
+}
+/* Merge arr[0,*alen1-1] with the next len2 elements leftwards by plain copies into the space that starts at arr[-lkeys]; stops as soon as one run is used up. */
+static void GRAIL_SMART_MERGE_WITH_X_BUF(SORT_TYPE *arr, int *alen1, int *atype, int len2,
+    int lkeys) {
+  int p0 = -lkeys, p1 = 0, p2 = *alen1, q1 = p2, q2 = p2 + len2;
+  int ftype = 1 - *atype; /* 1 if inverted */
+  /* ftype == 0: the left element wins only when strictly smaller; ftype == 1: it also wins ties */
+  while (p1 < q1 && p2 < q2) {
+    if (SORT_CMP_A(arr + p1, arr + p2) - ftype < 0) {
+      arr[p0++] = arr[p1++];
+    } else {
+      arr[p0++] = arr[p2++];
+    }
+  }
+  /* left leftovers are copied to the end of the range (*atype kept); otherwise the right leftovers stay put and *atype flips */
+  if (p1 < q1) {
+    *alen1 = q1 - p1;
+
+    while (p1 < q1) {
+      arr[--q2] = arr[--q1];
+    }
+  } else {
+    *alen1 = q2 - p2;
+    *atype = ftype;
+  }
+}
+
+/*
+  Merge a sequence of key-tagged blocks leftwards into the free space arr[-lblock..-1], copying elements instead of swapping them.
+  arr - start of the blocks. lblock - length of regular blocks. The first nblock blocks are stable sorted by 1st elements and key-coded.
+  keys - array of keys, in the same order as the blocks. key<midkey means stream A.
+  nblock2 - number of regular blocks from stream A. llast - length of the last (irregular) block from stream B, which should go before the nblock2 blocks.
+  llast=0 requires nblock2=0 (no irregular blocks). llast>0, nblock2=0 is possible.
+*/
+static void GRAIL_MERGE_BUFFERS_LEFT_WITH_X_BUF(SORT_TYPE *keys, SORT_TYPE *midkey, SORT_TYPE *arr,
+    int nblock, int lblock, int nblock2, int llast) {
+  int l, prest, lrest, frest, pidx, cidx, fnext;
+  /* no key-ordered blocks to walk: simply merge the nblock2 blocks with the irregular tail */
+  if (nblock == 0) {
+    l = nblock2 * lblock;
+    GRAIL_MERGE_LEFT_WITH_X_BUF(arr, l, llast, -lblock);
+    return;
+  }
+  /* lrest / frest: length and stream (0: key < midkey, 1: otherwise) of the pending, not yet emitted fragment */
+  lrest = lblock;
+  frest = SORT_CMP_A(keys, midkey) < 0 ? 0 : 1;
+  pidx = lblock;
+
+  for (cidx = 1; cidx < nblock; cidx++, pidx += lblock) {
+    prest = pidx - lrest;
+    fnext = SORT_CMP_A(keys + cidx, midkey) < 0 ? 0 : 1;
+    /* same stream as the pending fragment: copy the fragment out as is; different stream: merge it with the next block */
+    if (fnext == frest) {
+      SORT_TYPE_CPY(arr + prest - lblock, arr + prest, lrest);
+      prest = pidx;
+      lrest = lblock;
+    } else {
+      GRAIL_SMART_MERGE_WITH_X_BUF(arr + prest, &lrest, &frest, lblock, lblock);
+    }
+  }
+
+  prest = pidx - lrest;
+  /* end of the key-ordered blocks: merge in the irregular tail if there is one, else just copy out the pending fragment */
+  if (llast) {
+    if (frest) {
+      SORT_TYPE_CPY(arr + prest - lblock, arr + prest, lrest);
+      prest = pidx;
+      lrest = lblock * nblock2;
+      frest = 0;
+    } else {
+      lrest += lblock * nblock2;
+    }
+
+    GRAIL_MERGE_LEFT_WITH_X_BUF(arr + prest, lrest, llast, -lblock);
+  } else {
+    SORT_TYPE_CPY(arr + prest - lblock, arr + prest, lrest);
+  }
+}
+
+/***** End Sort With Extra Buffer *****/
+
+/*
+  Build sorted blocks of length 2*K out of the L elements at arr (extbuf/LExtBuf: optional scratch space, NULL/0 when absent).
+  input: elements [-K,-1] are the buffer
+  output: first K elements are buffer, blocks of 2*K and the last subblock are sorted
+*/
+static void GRAIL_BUILD_BLOCKS(SORT_TYPE *arr, int L, int K, SORT_TYPE *extbuf, int LExtBuf) {
+  int m, u, h, p0, p1, rest, restk, p, kbuf;
+  kbuf = K < LExtBuf ? K : LExtBuf;
+  /* round kbuf down to a power of two by clearing its low set bits; kbuf == 0 selects the swap-only path below */
+  while (kbuf & (kbuf - 1)) {
+    kbuf &= kbuf - 1;  /* max power of 2 - just in case */
+  }
+
+  if (kbuf) {
+    SORT_TYPE_CPY(extbuf, arr - kbuf, kbuf);
+    /* order each pair while moving it two slots to the left (plain copies; the slots in front were presumably saved by the copy above) */
+    for (m = 1; m < L; m += 2) {
+      u = 0;
+
+      if (SORT_CMP_A(arr + (m - 1), arr + m) > 0) {
+        u = 1;
+      }
+
+      arr[m - 3] = arr[m - 1 + u];
+      arr[m - 2] = arr[m - u];
+    }
+
+    if (L % 2) {
+      arr[L - 3] = arr[L - 1];
+    }
+
+    arr -= 2;
+    /* merge runs of length h into the h free slots on their left; the data drifts left by h on every pass */
+    for (h = 2; h < kbuf; h *= 2) {
+      p0 = 0;
+      p1 = L - 2 * h;
+
+      while (p0 <= p1) {
+        GRAIL_MERGE_LEFT_WITH_X_BUF(arr + p0, h, h, -h);
+        p0 += 2 * h;
+      }
+
+      rest = L - p0;
+
+      if (rest > h) {
+        GRAIL_MERGE_LEFT_WITH_X_BUF(arr + p0, h, rest - h, -h);
+      } else {
+        for (; p0 < L; p0++) {
+          arr[p0 - h] = arr[p0];
+        }
+      }
+
+      arr -= h;
+    }
+    /* write the kbuf elements held in extbuf back, right after the data */
+    SORT_TYPE_CPY(arr + L, extbuf, kbuf);
+  } else {
+    for (m = 1; m < L; m += 2) {
+      u = 0;
+
+      if (SORT_CMP_A(arr + (m - 1), arr + m) > 0) {
+        u = 1;
+      }
+
+      GRAIL_SWAP1(arr + (m - 3), arr + (m - 1 + u));
+      GRAIL_SWAP1(arr + (m - 2), arr + (m - u));
+    }
+
+    if (L % 2) {
+      GRAIL_SWAP1(arr + (L - 1), arr + (L - 3));
+    }
+
+    arr -= 2;
+    h = 2;
+  }
+  /* keep doubling the run length up to K, now merging with GRAIL_MERGE_LEFT into the h slots on the left */
+  for (; h < K; h *= 2) {
+    p0 = 0;
+    p1 = L - 2 * h;
+
+    while (p0 <= p1) {
+      GRAIL_MERGE_LEFT(arr + p0, h, h, -h);
+      p0 += 2 * h;
+    }
+
+    rest = L - p0;
+
+    if (rest > h) {
+      GRAIL_MERGE_LEFT(arr + p0, h, rest - h, -h);
+    } else {
+      GRAIL_ROTATE(arr + p0 - h, h, rest);
+    }
+
+    arr -= h;
+  }
+  /* last pass goes right: rotate or merge the short tail first, then merge the full pairs of K-runs from the end */
+  restk = L % (2 * K);
+  p = L - restk;
+
+  if (restk <= K) {
+    GRAIL_ROTATE(arr + p, restk, K);
+  } else {
+    GRAIL_MERGE_RIGHT(arr + p, K, restk - K, K);
+  }
+
+  while (p > 0) {
+    p -= 2 * K;
+    GRAIL_MERGE_RIGHT(arr + p, K, K, K);
+  }
+}
+
+/*
+  Merge a sequence of key-tagged blocks: by swaps through the buffer arr[-lblock..-1] if havebuf, fully in place otherwise.
+  arr - start of the blocks. lblock - length of regular blocks. The first nblock blocks are stable sorted by 1st elements and key-coded.
+  keys - array of keys, in the same order as the blocks. key<midkey means stream A.
+  nblock2 - number of regular blocks from stream A. llast - length of the last (irregular) block from stream B, which should go before the nblock2 blocks.
+  llast=0 requires nblock2=0 (no irregular blocks). llast>0, nblock2=0 is possible.
+*/
+static void GRAIL_MERGE_BUFFERS_LEFT(SORT_TYPE *keys, SORT_TYPE *midkey, SORT_TYPE *arr, int nblock,
+                                     int lblock, int havebuf, int nblock2, int llast) {
+  int l, prest, lrest, frest, pidx, cidx, fnext;
+  /* no key-ordered blocks to walk: simply merge the nblock2 blocks with the irregular tail */
+  if (nblock == 0) {
+    l = nblock2 * lblock;
+
+    if (havebuf) {
+      GRAIL_MERGE_LEFT(arr, l, llast, -lblock);
+    } else {
+      GRAIL_MERGE_WITHOUT_BUFFER(arr, l, llast);
+    }
+
+    return;
+  }
+  /* lrest / frest: length and stream (0: key < midkey, 1: otherwise) of the pending, not yet emitted fragment */
+  lrest = lblock;
+  frest = SORT_CMP_A(keys, midkey) < 0 ? 0 : 1;
+  pidx = lblock;
+
+  for (cidx = 1; cidx < nblock; cidx++, pidx += lblock) {
+    prest = pidx - lrest;
+    fnext = SORT_CMP_A(keys + cidx, midkey) < 0 ? 0 : 1;
+    /* same stream: emit the pending fragment (swapped left only when havebuf); different stream: merge it with the next block */
+    if (fnext == frest) {
+      if (havebuf) {
+        GRAIL_SWAP_N(arr + prest - lblock, arr + prest, lrest);
+      }
+
+      prest = pidx;
+      lrest = lblock;
+    } else {
+      if (havebuf) {
+        GRAIL_SMART_MERGE_WITH_BUFFER(arr + prest, &lrest, &frest, lblock, lblock);
+      } else {
+        GRAIL_SMART_MERGE_WITHOUT_BUFFER(arr + prest, &lrest, &frest, lblock);
+      }
+    }
+  }
+
+  prest = pidx - lrest;
+  /* end of the key-ordered blocks: merge in the irregular tail if there is one, else just emit the pending fragment */
+  if (llast) {
+    if (frest) {
+      if (havebuf) {
+        GRAIL_SWAP_N(arr + prest - lblock, arr + prest, lrest);
+      }
+
+      prest = pidx;
+      lrest = lblock * nblock2;
+      frest = 0;
+    } else {
+      lrest += lblock * nblock2;
+    }
+
+    if (havebuf) {
+      GRAIL_MERGE_LEFT(arr + prest, lrest, llast, -lblock);
+    } else {
+      GRAIL_MERGE_WITHOUT_BUFFER(arr + prest, lrest, llast);
+    }
+  } else {
+    if (havebuf) {
+      GRAIL_SWAP_N(arr + prest, arr + (prest - lblock), lrest);
+    }
+  }
+}
+
+static void GRAIL_LAZY_STABLE_SORT(SORT_TYPE *arr, int L) {
+  /* Bottom-up merge sort of arr[0,L-1]; every merge is done in place by GRAIL_MERGE_WITHOUT_BUFFER. */
+  int i, run, pos, last_pair, tail;
+
+  /* first pass: put each adjacent pair in order (equal elements are left untouched) */
+  for (i = 1; i < L; i += 2) {
+    if (SORT_CMP_A(arr + i - 1, arr + i) > 0) {
+      GRAIL_SWAP1(arr + (i - 1), arr + i);
+    }
+  }
+
+  /* then merge neighbouring runs of length `run`, doubling the run length on each pass */
+  for (run = 2; run < L; run *= 2) {
+    last_pair = L - 2 * run;
+
+    for (pos = 0; pos <= last_pair; pos += 2 * run) {
+      GRAIL_MERGE_WITHOUT_BUFFER(arr + pos, run, run);
+    }
+
+    tail = L - pos;
+    if (tail > run) { /* a full run followed by a shorter one */
+      GRAIL_MERGE_WITHOUT_BUFFER(arr + pos, run, tail - run);
+    }
+  }
+}
+
+/*
+  keys are on the left of arr. Sorted blocks of length LL are already built; this pass combines them in pairs.
+  LL and nkeys are powers of 2. (2*LL/lblock) keys are guaranteed. A non-NULL xbuf selects the copy-based (external buffer) path.
+*/
+static void GRAIL_COMBINE_BLOCKS(SORT_TYPE *keys, SORT_TYPE *arr, int len, int LL, int lblock,
+                                 int havebuf, SORT_TYPE *xbuf) {
+  int M, b, NBlk, midkey, lrest, u, p, v, kc, nbl2, llast;
+  SORT_TYPE *arr1;
+  M = len / (2 * LL);
+  lrest = len % (2 * LL);
+  /* a trailing piece of at most LL elements has no partner to merge with: leave it out of this pass */
+  if (lrest <= LL) {
+    len -= lrest;
+    lrest = 0;
+  }
+  /* external-buffer mode: keep a copy of the lblock elements in front of arr, since the copy-based merges overwrite them */
+  if (xbuf) {
+    SORT_TYPE_CPY(xbuf, arr - lblock, lblock);
+  }
+
+  for (b = 0; b <= M; b++) {
+    if (b == M && lrest == 0) {
+      break;
+    }
+    /* arr1 / NBlk: start and number of whole lblock-sized blocks of the current group (group M is the shorter remainder) */
+    arr1 = arr + b * 2 * LL;
+    NBlk = (b == M ? lrest : 2 * LL) / lblock;
+    SMALL_STABLE_SORT(keys, NBlk + (b == M ? 1 : 0));
+    midkey = LL / lblock;
+    /* selection sort of the blocks by first element (ties broken by key); keys are swapped along with their blocks */
+    for (u = 1; u < NBlk; u++) {
+      p = u - 1;
+
+      for (v = u; v < NBlk; v++) {
+        kc = SORT_CMP_A(arr1 + p * lblock, arr1 + v * lblock);
+
+        if (kc > 0 || (kc == 0 && SORT_CMP_A(keys + p, keys + v) > 0)) {
+          p = v;
+        }
+      }
+
+      if (p != u - 1) {
+        GRAIL_SWAP_N(arr1 + (u - 1)*lblock, arr1 + p * lblock, lblock);
+        GRAIL_SWAP1(keys + (u - 1), keys + p);
+        /* if the middle key was one of the two swapped keys, its index becomes the other one */
+        if (midkey == u - 1 || midkey == p) {
+          midkey ^= (u - 1)^p;
+        }
+      }
+    }
+
+    nbl2 = llast = 0;
+    /* llast: irregular tail of the last group; nbl2: trailing blocks whose first element is greater than the tail's first element */
+    if (b == M) {
+      llast = lrest % lblock;
+    }
+
+    if (llast != 0) {
+      while (nbl2 < NBlk && SORT_CMP_A(arr1 + NBlk * lblock, arr1 + (NBlk - nbl2 - 1)*lblock) < 0) {
+        nbl2++;
+      }
+    }
+
+    if (xbuf) {
+      GRAIL_MERGE_BUFFERS_LEFT_WITH_X_BUF(keys, keys + midkey, arr1, NBlk - nbl2, lblock, nbl2, llast);
+    } else {
+      GRAIL_MERGE_BUFFERS_LEFT(keys, keys + midkey, arr1, NBlk - nbl2, lblock, havebuf, nbl2, llast);
+    }
+  }
+  /* the buffered merges left the data lblock slots to the left: move it back and put the buffer in front again */
+  if (xbuf) {
+    for (p = len; --p >= 0;) {
+      arr[p] = arr[p - lblock];
+    }
+
+    SORT_TYPE_CPY(arr - lblock, xbuf, lblock);
+  } else if (havebuf) {
+    while (--len >= 0) {
+      GRAIL_SWAP1(arr + len, arr + len - lblock);
+    }
+  }
+}
+
+/* GrailSort driver: sorts arr[0,Len-1]; extbuf (LExtBuf elements, NULL/0 for none) is optional scratch space for the merges. */
+static void GRAIL_COMMON_SORT(SORT_TYPE *arr, int Len, SORT_TYPE *extbuf, int LExtBuf) {
+  int lblock, nkeys, findkeys, ptr, cbuf, lb, nk;
+  int havebuf, chavebuf;
+  long long s;
+  /* short inputs go straight to the small stable sort */
+  if (Len <= SMALL_SORT_BND) {
+    SMALL_STABLE_SORT(arr, Len);
+    return;
+  }
+
+  lblock = 1;
+  /* lblock = smallest power of two with lblock^2 >= Len; the product is formed in long long because 65536 * 65536 overflows int */
+  while ((long long)lblock * lblock < Len) {
+    lblock *= 2;
+  }
+  /* ask for nkeys + lblock keys (nkeys keys plus an lblock-long buffer); findkeys is the count GRAIL_FIND_KEYS reports */
+  nkeys = (Len - 1) / lblock + 1;
+  findkeys = GRAIL_FIND_KEYS(arr, Len, nkeys + lblock);
+  havebuf = 1;
+
+  if (findkeys < nkeys + lblock) {
+    if (findkeys < 4) {
+      GRAIL_LAZY_STABLE_SORT(arr, Len);
+      return;
+    }
+    /* not enough keys for a buffer: keep the largest power-of-two key count that was found and run without internal buffer */
+    nkeys = lblock;
+
+    while (nkeys > findkeys) {
+      nkeys /= 2;
+    }
+
+    havebuf = 0;
+    lblock = 0;
+  }
+  /* ptr: elements set aside at the front (buffer + keys); cbuf: size handed to GRAIL_BUILD_BLOCKS */
+  ptr = lblock + nkeys;
+  cbuf = havebuf ? lblock : nkeys;
+
+  if (havebuf) {
+    GRAIL_BUILD_BLOCKS(arr + ptr, Len - ptr, cbuf, extbuf, LExtBuf);
+  } else {
+    GRAIL_BUILD_BLOCKS(arr + ptr, Len - ptr, cbuf, NULL, 0);
+  }
+
+  /* 2*cbuf are built */
+  while (Len - ptr > (cbuf *= 2)) {
+    lb = lblock;
+    chavebuf = havebuf;
+    /* without a permanent buffer, decide per pass: use half of the keys as buffer, or derive a block length lb from cbuf and findkeys */
+    if (!havebuf) {
+      if (nkeys > 4 && nkeys / 8 * nkeys >= cbuf) {
+        lb = nkeys / 2;
+        chavebuf = 1;
+      } else {
+        nk = 1;
+        s = (long long)cbuf * findkeys / 2;
+
+        while (nk < nkeys && s != 0) {
+          nk *= 2;
+          s /= 8;
+        }
+
+        lb = (2 * cbuf) / nk;
+      }
+    }
+
+    GRAIL_COMBINE_BLOCKS(arr, arr + ptr, Len - ptr, cbuf, lb, chavebuf, chavebuf
+                         && lb <= LExtBuf ? extbuf : NULL);
+  }
+  /* finally sort the ptr set-aside elements and merge them with the rest in place */
+  SMALL_STABLE_SORT(arr, ptr);
+  GRAIL_MERGE_WITHOUT_BUFFER(arr, ptr, Len - ptr);
+}
+/* GrailSort of arr[0,Len-1] without any external buffer. Note that Len is narrowed to int before sorting. */
+SORT_DEF void GRAIL_SORT(SORT_TYPE *arr, size_t Len) {
+  GRAIL_COMMON_SORT(arr, (int)Len, NULL, 0);
+}
+/* GrailSort of arr[0,Len-1] with a local array of GRAIL_EXT_BUFFER_LENGTH elements as external buffer. Len is narrowed to int. */
+SORT_DEF void GRAIL_SORT_FIXED_BUFFER(SORT_TYPE *arr, size_t Len) {
+  SORT_TYPE ExtBuf[GRAIL_EXT_BUFFER_LENGTH];
+  GRAIL_COMMON_SORT(arr, (int)Len, ExtBuf, GRAIL_EXT_BUFFER_LENGTH);
+}
+/* GrailSort of arr[0,Len-1] with an external buffer of L elements obtained from SORT_NEW_BUFFER; falls back to the fixed buffer if that returns NULL. */
+SORT_DEF void GRAIL_SORT_DYN_BUFFER(SORT_TYPE *arr, size_t Len) {
+  int L = 1;
+  SORT_TYPE *ExtBuf;
+  /* L = smallest power of two with L^2 >= Len; the product is formed in unsigned long long because 65536 * 65536 overflows int */
+  while ((unsigned long long)L * (unsigned long long)L < Len) {
+    L *= 2;
+  }
+
+  ExtBuf = SORT_NEW_BUFFER(L);
+
+  if (ExtBuf == NULL) {
+    GRAIL_SORT_FIXED_BUFFER(arr, Len);
+  } else {
+    GRAIL_COMMON_SORT(arr, (int)Len, ExtBuf, L);
+    SORT_DELETE_BUFFER(ExtBuf);
+  }
+}
+
+/****** classic MergeInPlace *************/
+/* Recursive in-place merge of A[0,L1-1] with A[L1,L1+L2-1]: split both runs around a pivot element, rotate, then recurse on the two sides. */
+static void GRAIL_REC_MERGE(SORT_TYPE *A, int L1, int L2) {
+  int K, k1, k2, m1, m2;
+
+  if (L1 < 3 || L2 < 3) {
+    GRAIL_MERGE_WITHOUT_BUFFER(A, L1, L2);
+    return;
+  }
+  /* pivot index K: the middle of the longer run (the left run when both are equally long) */
+  if (L1 < L2) {
+    K = L1 + L2 / 2;
+  } else {
+    K = L1 / 2;
+  }
+  /* k1..k2 and m1..m2: presumably the ranges equal to the pivot in the left and right run (offsets within each run) - TODO confirm against the binary searches */
+  k1 = k2 = GRAIL_BIN_SEARCH_LEFT(A, L1, A + K);
+
+  if (k2 < L1 && SORT_CMP_A(A + k2, A + K) == 0) {
+    k2 = GRAIL_BIN_SEARCH_RIGHT(A + k1, L1 - k1, A + K) + k1;
+  }
+
+  m1 = GRAIL_BIN_SEARCH_LEFT(A + L1, L2, A + K);
+  m2 = m1;
+
+  if (m2 < L2 && SORT_CMP_A(A + L1 + m2, A + K) == 0) {
+    m2 = GRAIL_BIN_SEARCH_RIGHT(A + L1 + m1, L2 - m1, A + K) + m1;
+  }
+  /* rotate the tail of the left run past the head of the right run (two rotations when the left run holds pivot-equal elements) */
+  if (k1 == k2) {
+    GRAIL_ROTATE(A + k2, L1 - k2, m2);
+  } else {
+    GRAIL_ROTATE(A + k1, L1 - k1, m1);
+
+    if (m2 != m1) {
+      GRAIL_ROTATE(A + (k2 + m1), L1 - k2, m2 - m1);
+    }
+  }
+  /* recurse on the part after the first k2 + m2 elements, then on the first k1 + m1 elements */
+  GRAIL_REC_MERGE(A + (k2 + m2), L1 - k2, L2 - m2);
+  GRAIL_REC_MERGE(A, k1, m1);
+}
+
+SORT_DEF void REC_STABLE_SORT(SORT_TYPE *arr, size_t L) {
+  /* Bottom-up merge sort of arr[0,L-1]; runs are merged in place by GRAIL_REC_MERGE. */
+  int i, run, pos, last_pair, tail;
+
+  /* first pass: put each adjacent pair in order (equal elements are left untouched) */
+  for (i = 1; i < L; i += 2) {
+    if (SORT_CMP_A(arr + i - 1, arr + i) > 0) {
+      GRAIL_SWAP1(arr + (i - 1), arr + i);
+    }
+  }
+
+  /* then merge neighbouring runs of length `run`, doubling the run length on each pass */
+  for (run = 2; run < L; run *= 2) {
+    last_pair = (int)(L - 2 * run);
+
+    for (pos = 0; pos <= last_pair; pos += 2 * run) {
+      GRAIL_REC_MERGE(arr + pos, run, run);
+    }
+
+    tail = (int)(L - pos);
+    if (tail > run) { /* a full run followed by a shorter one */
+      GRAIL_REC_MERGE(arr + pos, run, tail - run);
+    }
+  }
+}
+
+/* Bubble sort that shrinks the scanned range to the position of the last swap;
+   based on the Wikipedia article https://en.wikipedia.org/wiki/Bubble_sort
+*/
+SORT_DEF void BUBBLE_SORT(SORT_TYPE *dst, const size_t size) {
+  size_t unsorted = size; /* only dst[0, unsorted-1] still needs scanning */
+
+  while (unsorted != 0U) {
+    size_t k;
+    size_t last_swap = 0U; /* index of the last swap in this sweep; stays 0 when none happened */
+    for (k = 1U; k < unsorted; ++k) {
+      if (SORT_CMP(dst[k - 1U], dst[k]) > 0) {
+        SORT_SWAP(dst[k - 1U], dst[k]);
+        last_swap = k;
+      }
+    }
+
+    unsorted = last_swap;
+  }
+}
+/* Remove the helper and name macros again; a few names appear twice in this list, which is harmless for #undef. */
+#undef SORT_SAFE_CPY
+#undef SORT_TYPE_CPY
+#undef SORT_TYPE_MOVE
+#undef SORT_NEW_BUFFER
+#undef SORT_DELETE_BUFFER
+#undef QUICK_SORT
+#undef MEDIAN
+#undef SORT_CONCAT
+#undef SORT_MAKE_STR1
+#undef SORT_MAKE_STR
+#undef SORT_NAME
+#undef SORT_TYPE
+#undef SORT_CMP
+#undef TEMP_STORAGE_T
+#undef TIM_SORT_RUN_T
+#undef PUSH_NEXT
+#undef SORT_SWAP
+#undef SORT_CONCAT
+#undef SORT_MAKE_STR1
+#undef SORT_MAKE_STR
+#undef BINARY_INSERTION_FIND
+#undef BINARY_INSERTION_SORT_START
+#undef BINARY_INSERTION_SORT
+#undef REVERSE_ELEMENTS
+#undef COUNT_RUN
+#undef TIM_SORT
+#undef TIM_SORT_RESIZE
+#undef TIM_SORT_COLLAPSE
+#undef TIM_SORT_RUN_T
+#undef TEMP_STORAGE_T
+#undef MERGE_SORT
+#undef MERGE_SORT_RECURSIVE
+#undef MERGE_SORT_IN_PLACE
+#undef MERGE_SORT_IN_PLACE_RMERGE
+#undef MERGE_SORT_IN_PLACE_BACKMERGE
+#undef MERGE_SORT_IN_PLACE_FRONTMERGE
+#undef MERGE_SORT_IN_PLACE_ASWAP
+#undef GRAIL_SWAP1
+#undef REC_STABLE_SORT
+#undef GRAIL_REC_MERGE
+#undef GRAIL_SORT_DYN_BUFFER
+#undef GRAIL_SORT_FIXED_BUFFER
+#undef GRAIL_COMMON_SORT
+#undef GRAIL_SORT
+#undef GRAIL_COMBINE_BLOCKS
+#undef GRAIL_LAZY_STABLE_SORT
+#undef GRAIL_MERGE_WITHOUT_BUFFER
+#undef GRAIL_ROTATE
+#undef GRAIL_BIN_SEARCH_LEFT
+#undef GRAIL_BUILD_BLOCKS
+#undef GRAIL_FIND_KEYS
+#undef GRAIL_MERGE_BUFFERS_LEFT_WITH_X_BUF
+#undef GRAIL_BIN_SEARCH_RIGHT
+#undef GRAIL_MERGE_BUFFERS_LEFT
+#undef GRAIL_SMART_MERGE_WITH_X_BUF
+#undef GRAIL_MERGE_LEFT_WITH_X_BUF
+#undef GRAIL_SMART_MERGE_WITHOUT_BUFFER
+#undef GRAIL_SMART_MERGE_WITH_BUFFER
+#undef GRAIL_MERGE_RIGHT
+#undef GRAIL_MERGE_LEFT
+#undef GRAIL_SWAP_N
+#undef SQRT_SORT
+#undef SQRT_SORT_BUILD_BLOCKS
+#undef SQRT_SORT_MERGE_BUFFERS_LEFT_WITH_X_BUF
+#undef SQRT_SORT_MERGE_DOWN
+#undef SQRT_SORT_MERGE_LEFT_WITH_X_BUF
+#undef SQRT_SORT_MERGE_RIGHT
+#undef SQRT_SORT_SWAP_N
+#undef SQRT_SORT_SWAP_1
+#undef SQRT_SORT_SMART_MERGE_WITH_X_BUF
+#undef SQRT_SORT_SORT_INS
+#undef SQRT_SORT_COMBINE_BLOCKS
+#undef SQRT_SORT_COMMON_SORT
+#undef SORT_CMP_A
+#undef BUBBLE_SORT
+/* pop the GCC diagnostic state; presumably pairs with a push made under the same conditions earlier in the file - TODO confirm */
+#ifdef SORT_DEF
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+#endif