From 6287ecfa80b52f89f1217e1edc243cc1e89d34fd Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Fri, 23 Feb 2024 16:48:28 -0800 Subject: [PATCH 01/16] Checkpoint euler patch This just imports Arman's patch and tries to update it for other changes. --- src/cpu_shader/euler.rs | 285 ++++++++++++++++++++++++++++++++++++++ src/cpu_shader/flatten.rs | 267 +++++++++++++++++++++++++++++++++-- src/cpu_shader/mod.rs | 1 + src/cpu_shader/util.rs | 20 +++ 4 files changed, 560 insertions(+), 13 deletions(-) create mode 100644 src/cpu_shader/euler.rs diff --git a/src/cpu_shader/euler.rs b/src/cpu_shader/euler.rs new file mode 100644 index 000000000..3ce1b3218 --- /dev/null +++ b/src/cpu_shader/euler.rs @@ -0,0 +1,285 @@ +// Copyright 2023 The Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +//! Utility functions for Euler Spiral based stroke expansion. + +use super::util::Vec2; +use std::f32::consts::FRAC_PI_4; + +#[derive(Debug)] +pub struct CubicParams { + pub th0: f32, + pub th1: f32, + pub d0: f32, + pub d1: f32, +} + +#[derive(Debug)] +pub struct EulerParams { + pub th0: f32, + pub th1: f32, + pub k0: f32, + pub k1: f32, + pub ch: f32, +} + +#[derive(Debug)] +pub struct EulerSeg { + pub p0: Vec2, + pub p1: Vec2, + pub params: EulerParams, +} + +impl CubicParams { + pub fn from_cubic(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2) -> Self { + let chord = p3 - p0; + // TODO: if chord is 0, we have a problem + //assert!(chord.length() > 1e-9);//ROBUST_EPSILON); + let d01 = p1 - p0; + // TODO: This handles coincident control points. Clean this up + let d01 = if d01.length() < 1e-9 { + let d02 = p2 - p1; + if d02.length() < 1e-9 { + chord + } else { + d02 + } + } else { + d01 + }; + let h0 = Vec2::new( + d01.x * chord.x + d01.y * chord.y, + d01.y * chord.x - d01.x * chord.y, + ); + let th0 = h0.atan2(); + let d0 = h0.length() / chord.length_squared(); + let d23 = p3 - p2; + // TODO: This handles coincident control points. Clean this up + let d23 = if d23.length() < 1e-9 { + let d13 = p3 - p1; + if d13.length() < 1e-9 { + chord + } else { + d13 + } + } else { + d23 + }; + let h1 = Vec2::new( + d23.x * chord.x + d23.y * chord.y, + d23.x * chord.y - d23.y * chord.x, + ); + let th1 = h1.atan2(); + let d1 = h1.length() / chord.length_squared(); + CubicParams { th0, th1, d0, d1 } + } + + // Estimated error of GH to Euler spiral + // + // Return value is normalized to chord - to get actual error, multiply + // by chord. + pub fn est_euler_err(&self) -> f32 { + // Potential optimization: work with unit vector rather than angle + // TODO: preventing division by zero here but may need to work around this in a better way. + let e0 = (2. / 3.) / (1.0 + self.th0.cos()).max(1e-9); + let e1 = (2. / 3.) / (1.0 + self.th1.cos()).max(1e-9); + let s0 = self.th0.sin(); + let s1 = self.th1.sin(); + let s01 = (s0 + s1).sin(); + let amin = 0.15 * (2. * e0 * s0 + 2. * e1 * s1 - e0 * e1 * s01); + let a = 0.15 * (2. * self.d0 * s0 + 2. 
* self.d1 * s1 - self.d0 * self.d1 * s01); + let aerr = (a - amin).abs(); + let symm = (self.th0 + self.th1).abs(); + let asymm = (self.th0 - self.th1).abs(); + let dist = (self.d0 - e0).hypot(self.d1 - e1); + let ctr = 3.7e-6 * symm.powi(5) + 6e-3 * asymm * symm.powi(2); + let halo_symm = 5e-3 * symm * dist; + let halo_asymm = 7e-2 * asymm * dist; + /* + println!(" e0: {e0}"); + println!(" e1: {e1}"); + println!(" s0: {s0}"); + println!(" s1: {s1}"); + println!(" s01: {s01}"); + println!(" amin: {amin}"); + println!(" a: {a}"); + println!(" aerr: {aerr}"); + println!(" symm: {symm}"); + println!(" asymm: {asymm}"); + println!(" dist: {dist}"); + println!(" ctr: {ctr}"); + println!(" halo_symm: {halo_symm}"); + println!(" halo_asymm: {halo_asymm}"); + */ + 1.25 * ctr + 1.55 * aerr + halo_symm + halo_asymm + } +} + +impl EulerParams { + pub fn from_angles(th0: f32, th1: f32) -> EulerParams { + let k0 = th0 + th1; + let dth = th1 - th0; + let d2 = dth * dth; + let k2 = k0 * k0; + let mut a = 6.0; + a -= d2 * (1. / 70.); + a -= (d2 * d2) * (1. / 10780.); + a += (d2 * d2 * d2) * 2.769178184818219e-07; + let b = -0.1 + d2 * (1. / 4200.) + d2 * d2 * 1.6959677820260655e-05; + let c = -1. / 1400. + d2 * 6.84915970574303e-05 - k2 * 7.936475029053326e-06; + a += (b + c * k2) * k2; + let k1 = dth * a; + + // calculation of chord + let mut ch = 1.0; + ch -= d2 * (1. / 40.); + ch += (d2 * d2) * 0.00034226190482569864; + ch -= (d2 * d2 * d2) * 1.9349474568904524e-06; + let b = -1. / 24. + d2 * 0.0024702380951963226 - d2 * d2 * 3.7297408997537985e-05; + let c = 1. / 1920. - d2 * 4.87350869747975e-05 - k2 * 3.1001936068463107e-06; + ch += (b + c * k2) * k2; + EulerParams { + th0, + th1, + k0, + k1, + ch, + } + } + + pub fn eval_th(&self, t: f32) -> f32 { + (self.k0 + 0.5 * self.k1 * (t - 1.0)) * t - self.th0 + } + + /// Evaluate the curve at the given parameter. + /// + /// The parameter is in the range 0..1, and the result goes from (0, 0) to (1, 0). + fn eval(&self, t: f32) -> Vec2 { + let thm = self.eval_th(t * 0.5); + let k0 = self.k0; + let k1 = self.k1; + let (u, v) = integ_euler_10((k0 + k1 * (0.5 * t - 0.5)) * t, k1 * t * t); + let s = t / self.ch * thm.sin(); + let c = t / self.ch * thm.cos(); + let x = u * c - v * s; + let y = -v * c - u * s; + Vec2::new(x, y) + } + + fn eval_with_offset(&self, t: f32, offset: f32) -> Vec2 { + let th = self.eval_th(t); + let v = Vec2::new(offset * th.sin(), offset * th.cos()); + self.eval(t) + v + } +} + +impl EulerSeg { + pub fn from_params(p0: Vec2, p1: Vec2, params: EulerParams) -> Self { + EulerSeg { p0, p1, params } + } + + #[allow(unused)] + pub fn eval(&self, t: f32) -> Vec2 { + let Vec2 { x, y } = self.params.eval(t); + let chord = self.p1 - self.p0; + Vec2::new( + self.p0.x + chord.x * x - chord.y * y, + self.p0.y + chord.x * y + chord.y * x, + ) + } + + pub fn eval_with_offset(&self, t: f32, offset: f32) -> Vec2 { + let chord = self.p1 - self.p0; + let scaled = offset / chord.length(); + let Vec2 { x, y } = self.params.eval_with_offset(t, scaled); + Vec2::new( + self.p0.x + chord.x * x - chord.y * y, + self.p0.y + chord.x * y + chord.y * x, + ) + } +} + +/// Integrate Euler spiral. +/// +/// TODO: investigate needed accuracy. We might be able to get away +/// with 8th order. +fn integ_euler_10(k0: f32, k1: f32) -> (f32, f32) { + let t1_1 = k0; + let t1_2 = 0.5 * k1; + let t2_2 = t1_1 * t1_1; + let t2_3 = 2. 
* (t1_1 * t1_2); + let t2_4 = t1_2 * t1_2; + let t3_4 = t2_2 * t1_2 + t2_3 * t1_1; + let t3_6 = t2_4 * t1_2; + let t4_4 = t2_2 * t2_2; + let t4_5 = 2. * (t2_2 * t2_3); + let t4_6 = 2. * (t2_2 * t2_4) + t2_3 * t2_3; + let t4_7 = 2. * (t2_3 * t2_4); + let t4_8 = t2_4 * t2_4; + let t5_6 = t4_4 * t1_2 + t4_5 * t1_1; + let t5_8 = t4_6 * t1_2 + t4_7 * t1_1; + let t6_6 = t4_4 * t2_2; + let t6_7 = t4_4 * t2_3 + t4_5 * t2_2; + let t6_8 = t4_4 * t2_4 + t4_5 * t2_3 + t4_6 * t2_2; + let t7_8 = t6_6 * t1_2 + t6_7 * t1_1; + let t8_8 = t6_6 * t2_2; + let mut u = 1.; + u -= (1. / 24.) * t2_2 + (1. / 160.) * t2_4; + u += (1. / 1920.) * t4_4 + (1. / 10752.) * t4_6 + (1. / 55296.) * t4_8; + u -= (1. / 322560.) * t6_6 + (1. / 1658880.) * t6_8; + u += (1. / 92897280.) * t8_8; + let mut v = (1. / 12.) * t1_2; + v -= (1. / 480.) * t3_4 + (1. / 2688.) * t3_6; + v += (1. / 53760.) * t5_6 + (1. / 276480.) * t5_8; + v -= (1. / 11612160.) * t7_8; + (u, v) +} + +const BREAK1: f32 = 0.8; +const BREAK2: f32 = 1.25; +const BREAK3: f32 = 2.1; +const SIN_SCALE: f32 = 1.0976991822760038; +const QUAD_A1: f32 = 0.6406; +const QUAD_B1: f32 = -0.81; +const QUAD_C1: f32 = 0.9148117935952064; +const QUAD_A2: f32 = 0.5; +const QUAD_B2: f32 = -0.156; +const QUAD_C2: f32 = 0.16145779359520596; + +pub fn espc_int_approx(x: f32) -> f32 { + let y = x.abs(); + let a = if y < BREAK1 { + (SIN_SCALE * y).sin() * (1.0 / SIN_SCALE) + } else if y < BREAK2 { + (8.0f32.sqrt() / 3.0) * (y - 1.0) * (y - 1.0).abs().sqrt() + FRAC_PI_4 + } else { + let (a, b, c) = if y < BREAK3 { + (QUAD_A1, QUAD_B1, QUAD_C1) + } else { + (QUAD_A2, QUAD_B2, QUAD_C2) + }; + a * y * y + b * y + c + }; + a.copysign(x) +} + +pub fn espc_int_inv_approx(x: f32) -> f32 { + let y = x.abs(); + let a = if y < 0.7010707591262915 { + (x * SIN_SCALE).asin() * (1.0 / SIN_SCALE) + } else if y < 0.903249293595206 { + let b = y - FRAC_PI_4; + let u = b.abs().powf(2. / 3.).copysign(b); + u * (9.0f32 / 8.).cbrt() + 1.0 + } else { + let (u, v, w) = if y < 2.038857793595206 { + const B: f32 = 0.5 * QUAD_B1 / QUAD_A1; + (B * B - QUAD_C1 / QUAD_A1, 1.0 / QUAD_A1, B) + } else { + const B: f32 = 0.5 * QUAD_B2 / QUAD_A2; + (B * B - QUAD_C2 / QUAD_A2, 1.0 / QUAD_A2, B) + }; + (u + v * y).sqrt() - w + }; + a.copysign(x) +} diff --git a/src/cpu_shader/flatten.rs b/src/cpu_shader/flatten.rs index 34a616b6c..89f5e96ec 100644 --- a/src/cpu_shader/flatten.rs +++ b/src/cpu_shader/flatten.rs @@ -1,14 +1,24 @@ -// Copyright 2023 the Vello Authors +// Copyright 2023 The Vello Authors // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense use crate::cpu_dispatch::CpuBinding; -use super::util::{Transform, Vec2, ROBUST_EPSILON}; +use super::{ + euler::{espc_int_approx, espc_int_inv_approx, CubicParams, EulerParams, EulerSeg}, + util::{Transform, Vec2, ROBUST_EPSILON}, +}; use vello_encoding::{ math::f16_to_f32, BumpAllocators, ConfigUniform, LineSoup, Monoid, PathBbox, PathMonoid, PathTag, Style, DRAW_INFO_FLAGS_FILL_RULE_BIT, }; +// TODO: remove this +macro_rules! log { + ($($arg:tt)*) => {{ + //println!($($arg)*); + }}; +} + fn to_minus_one_quarter(x: f32) -> f32 { // could also be written x.powf(-0.25) x.sqrt().sqrt().recip() @@ -118,6 +128,28 @@ fn cubic_end_normal(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2) -> Vec2 { Vec2::new(-tangent.y, tangent.x) } +fn cubic_subsegment(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2, t0: f32, t1: f32) -> CubicPoints { + let scp0 = eval_cubic(p0, p1, p2, p3, t0); + let scp3 = eval_cubic(p0, p1, p2, p3, t1); + // Compute the derivate curve + let qp0 = 3. 
* (p1 - p0); + let qp1 = 3. * (p2 - p1); + let qp2 = 3. * (p3 - p2); + let scale = (t1 - t0) * (1.0 / 3.0); + let scp1 = scp0 + scale * eval_quad(qp0, qp1, qp2, t0); + let scp2 = scp3 - scale * eval_quad(qp0, qp1, qp2, t1); + CubicPoints { + p0: scp0, + p1: scp1, + p2: scp2, + p3: scp3, + } +} + +fn check_colinear(p0: Vec2, p1: Vec2, p2: Vec2) -> bool { + (p1 - p0).cross(p2 - p0).abs() < 1e-9 +} + fn write_line( line_ix: usize, path_ix: u32, @@ -128,7 +160,7 @@ fn write_line( ) { assert!( !p0.is_nan() && !p1.is_nan(), - "wrote line segment with NaN: p0: {:?}, p1: {:?}", + "wrote NaNs: p0: {:?}, p1: {:?}", p0, p1 ); @@ -328,19 +360,22 @@ fn flatten_arc( lines: &mut [LineSoup], bbox: &mut IntBbox, ) { - const MIN_THETA: f32 = 0.0001; - let mut p0 = transform.apply(begin); let mut r = begin - center; - let tol: f32 = 0.1; + let tol: f32 = 0.5; let radius = tol.max((p0 - transform.apply(center)).length()); - let theta = (2. * (1. - tol / radius).acos()).max(MIN_THETA); - - // Always output at least one line so that we always draw the chord. - let n_lines = ((angle / theta).ceil() as u32).max(1); + let x = 1. - tol / radius; + let theta = (2. * x * x - 1.).clamp(-1., 1.).acos(); + const MAX_LINES: u32 = 1000; + let n_lines = if theta <= ROBUST_EPSILON { + MAX_LINES + } else { + MAX_LINES.min((std::f32::consts::TAU / theta).ceil() as u32) + }; - let c = theta.cos(); - let s = theta.sin(); + let th = angle / (n_lines as f32); + let c = th.cos(); + let s = th.sin(); let rot = Transform([c, -s, s, c, 0., 0.]); for _ in 0..(n_lines - 1) { @@ -353,6 +388,181 @@ fn flatten_arc( output_line(path_ix, p0, p1, line_ix, bbox, lines); } +fn flatten_euler( + cubic: &CubicPoints, + path_ix: u32, + local_to_device: &Transform, + offset: f32, + is_line: bool, + line_ix: &mut usize, + lines: &mut [LineSoup], + bbox: &mut IntBbox, +) { + log!("@@@ flatten_euler: {:#?}", cubic); + // Flatten in local coordinates if this is a stroke. Flatten in device space otherwise. + let (p0, p1, p2, p3, scale, transform) = if offset == 0. { + ( + local_to_device.apply(cubic.p0), + local_to_device.apply(cubic.p1), + local_to_device.apply(cubic.p2), + local_to_device.apply(cubic.p3), + 1., + Transform::identity(), + ) + } else { + let t = local_to_device.0; + let scale = 0.5 * Vec2::new(t[0] + t[3], t[1] - t[2]).length() + + Vec2::new(t[0] - t[3], t[1] + t[2]).length(); + ( + cubic.p0, + cubic.p1, + cubic.p2, + cubic.p3, + scale, + local_to_device.clone(), + ) + }; + + // Special-case lines. + // We still have to handle colinear cubic parameters. We are special casing the line-to + // encoding because floating point errors in the degree raise can cause some line-tos to slip + // through the epsilon threshold in check_colinear. + //if check_colinear(p0, p1, p3) && check_colinear(p0, p2, p3) { + if is_line { + let tan = p3 - p0; + if tan.length() > ROBUST_EPSILON { + let tan_norm = tan.normalize(); + let n = Vec2::new(-tan_norm.y, tan_norm.x); + let lp0 = p0 + n * offset; + let lp1 = p3 + n * offset; + let l0 = if offset > 0. { lp0 } else { lp1 }; + let l1 = if offset > 0. 
{ lp1 } else { lp0 }; + log!("@@@ output line: {:?}, {:?}, tan: {:?}", l0, l1, tan); + output_line_with_transform(path_ix, l0, l1, &transform, line_ix, lines, bbox); + } else { + log!("@@@ drop line: {:?}, {:?}, tan: {:?}", p0, p3, tan); + } + return; + } + + // Special-case colinear cubic segments + // TODO: clean this up + if check_colinear(p0, p1, p3) + && check_colinear(p0, p2, p3) + && check_colinear(p0, p1, p2) + && check_colinear(p1, p2, p3) + { + let (start, end) = { + let distances = [ + ((p1 - p0).length(), p1, p0), + ((p2 - p0).length(), p2, p0), + ((p3 - p0).length(), p3, p0), + ((p2 - p1).length(), p2, p1), + ((p3 - p1).length(), p3, p1), + ((p3 - p2).length(), p3, p2), + ]; + let mut longest = distances[0]; + for d in &distances[1..] { + if d.0 > longest.0 { + longest = *d; + } + } + (longest.1, longest.2) + }; + let tan = end - start; + if tan.length() > ROBUST_EPSILON { + let tan_norm = tan.normalize(); + let n = Vec2::new(-tan_norm.y, tan_norm.x); + let lp0 = start + n * offset; + let lp1 = end + n * offset; + let l0 = if offset > 0. { lp0 } else { lp1 }; + let l1 = if offset > 0. { lp1 } else { lp0 }; + log!("@@@ output line: {:?}, {:?}, tan: {:?}", l0, l1, tan); + output_line_with_transform(path_ix, l0, l1, &transform, line_ix, lines, bbox); + } else { + log!("@@@ drop line: {:?}, {:?}, tan: {:?}", start, end, tan); + } + return; + } + + let tol: f32 = 0.01; + let scaled_sqrt_tol = (tol / scale).sqrt(); + let mut t0_u: u32 = 0; + let mut dt: f32 = 1.; + + loop { + if dt < ROBUST_EPSILON { + break; + } + let t0 = (t0_u as f32) * dt; + if t0 == 1. { + break; + } + log!("@@@ loop1: t0: {t0}, dt: {dt}"); + loop { + let t1 = t0 + dt; + // Subdivide into cubics + let subcubic = cubic_subsegment(p0, p1, p2, p3, t0, t1); + let cubic_params = + CubicParams::from_cubic(subcubic.p0, subcubic.p1, subcubic.p2, subcubic.p3); + let est_err = cubic_params.est_euler_err(); + let err = est_err * (subcubic.p0 - subcubic.p3).length(); + log!("@@@ loop2: sub:{:?}, {:?} t0: {t0}, t1: {t1}, dt: {dt}, est_err: {est_err}, err: {err}", subcubic, cubic_params); + if err <= scaled_sqrt_tol { + log!("@@@ error within tolerance"); + t0_u += 1; + let shift = t0_u.trailing_zeros(); + t0_u >>= shift; + dt *= (1 << shift) as f32; + let euler_params = EulerParams::from_angles(cubic_params.th0, cubic_params.th1); + let es = EulerSeg::from_params(subcubic.p0, subcubic.p3, euler_params); + + let es_scale = (es.p0 - es.p1).length(); + let (k0, k1) = (es.params.k0 - 0.5 * es.params.k1, es.params.k1); + + // compute forward integral to determine number of subdivisions + let dist_scaled = offset * scale / es_scale; + let a = -2.0 * dist_scaled * k1; + let b = -1.0 - 2.0 * dist_scaled * k0; + let int0 = espc_int_approx(b); + let int1 = espc_int_approx(a + b); + let integral = int1 - int0; + let k_peak = k0 - k1 * b / a; + let integrand_peak = (k_peak * (k_peak * dist_scaled + 1.0)).abs().sqrt(); + let scaled_int = integral * integrand_peak / a; + let n_frac = 0.5 * (es_scale / scaled_sqrt_tol).sqrt() * scaled_int; + let n = n_frac.ceil(); + + // Flatten line segments + log!("@@@ loop2: lines: {n}"); + // TODO: make all computation above robust and uncomment this assertion + //assert!(!n.is_nan()); + if n.is_nan() { + // Skip the segment if `n` is NaN. 
This is for debugging purposes only + log!("@@@ NaN: parameters:\n es: {:#?}\n k0: {k0}, k1: {k1}\n dist_scaled: {dist_scaled}\n es_scale: {es_scale}\n a: {a}\n b: {b}\n int0: {int0}, int1: {int1}, integral: {integral}\n k_peak: {k_peak}\n integrand_peak: {integrand_peak}\n scaled_int: {scaled_int}\n n_frac: {n_frac}", es); + } else { + let mut lp0 = es.eval_with_offset(0., offset); + for i in 0..n as usize { + let t = (i + 1) as f32 / n; + let inv = espc_int_inv_approx(integral * t + int0); + let s = (inv - b) / a; + let lp1 = es.eval_with_offset(s, offset); + let l0 = if offset > 0. { lp0 } else { lp1 }; + let l1 = if offset > 0. { lp1 } else { lp0 }; + output_line_with_transform( + path_ix, l0, l1, &transform, line_ix, lines, bbox, + ); + lp0 = lp1; + } + } + break; + } + t0_u = t0_u.saturating_mul(2); + dt *= 0.5; + } + } +} + fn draw_cap( path_ix: u32, cap_style: u32, @@ -538,6 +748,7 @@ fn compute_tag_monoid(ix: usize, pathtags: &[u32], tag_monoids: &[PathMonoid]) - } } +#[derive(Debug)] struct CubicPoints { p0: Vec2, p1: Vec2, @@ -691,12 +902,24 @@ fn flatten_main( // Don't draw anything if the path is closed. } } else { + let is_line = seg_type == PATH_TAG_LINETO; // Render offset curves - flatten_cubic( + flatten_euler( &pts, path_ix, &transform, offset, + is_line, + &mut line_ix, + lines, + &mut bbox, + ); + flatten_euler( + &pts, + path_ix, + &transform, + -offset, + is_line, &mut line_ix, lines, &mut bbox, @@ -707,10 +930,28 @@ fn flatten_main( read_neighboring_segment(ix + 1, pathtags, pathdata, tag_monoids); let tan_prev = cubic_end_tangent(pts.p0, pts.p1, pts.p2, pts.p3); let tan_next = neighbor.tangent; + + // TODO: add NaN assertions to CPU shaders PR (when writing lines) + // TODO: not all zero-length segments are getting filtered out + // TODO: this is a hack. How to handle caps on degenerate stroke? + // TODO: debug tricky stroke by isolation + let tan_prev = if tan_prev.length_squared() < ROBUST_EPSILON { + Vec2::new(0.01, 0.) + } else { + tan_prev + }; + let tan_next = if tan_next.length_squared() < ROBUST_EPSILON { + Vec2::new(0.01, 0.) 
+ } else { + tan_next + }; + let offset_tangent = offset * tan_prev.normalize(); let n_prev = Vec2::new(-offset_tangent.y, offset_tangent.x); let tan_next_norm = tan_next.normalize(); let n_next = offset * Vec2::new(-tan_next_norm.y, tan_next_norm.x); + log!("@ tan_prev: {:#?}", tan_prev); + log!("@ tan_next: {:#?}", tan_next); if neighbor.do_join { draw_join( path_ix, diff --git a/src/cpu_shader/mod.rs b/src/cpu_shader/mod.rs index 8b1fb0df6..d46c77d50 100644 --- a/src/cpu_shader/mod.rs +++ b/src/cpu_shader/mod.rs @@ -15,6 +15,7 @@ mod clip_reduce; mod coarse; mod draw_leaf; mod draw_reduce; +mod euler; mod fine; mod flatten; mod path_count; diff --git a/src/cpu_shader/util.rs b/src/cpu_shader/util.rs index 7748ffcd3..1f6354404 100644 --- a/src/cpu_shader/util.rs +++ b/src/cpu_shader/util.rs @@ -81,10 +81,18 @@ impl Vec2 { self.x * other.x + self.y * other.y } + pub fn cross(self, other: Vec2) -> f32 { + (self.x * other.y) - (self.y * other.x) + } + pub fn length(self) -> f32 { self.x.hypot(self.y) } + pub fn length_squared(self) -> f32 { + self.dot(self) + } + pub fn to_array(self) -> [f32; 2] { [self.x, self.y] } @@ -103,9 +111,21 @@ impl Vec2 { self / self.length() } + pub fn atan2(self) -> f32 { + self.y.atan2(self.x) + } + pub fn is_nan(&self) -> bool { self.x.is_nan() || self.y.is_nan() } + + pub fn min(&self, other: Vec2) -> Vec2 { + Vec2::new(self.x.min(other.x), self.y.min(other.y)) + } + + pub fn max(&self, other: Vec2) -> Vec2 { + Vec2::new(self.x.max(other.x), self.y.max(other.y)) + } } #[derive(Clone)] From 6d677f534383034b979a0b3a03d2715b7bdac4c5 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Sat, 24 Feb 2024 09:36:21 -0800 Subject: [PATCH 02/16] Checkpoint numerical robustness work Apply Euler spiral numerical robustness to CPU shader. There are a number of minor tweaks, and some important changes: * Cubic segments are computed with points and derivatives, not subdivision * Those derivatives are adjusted if they are near-zero * The ESPC integral is robust to small k1 and dist Note that dealing with small dist is essential for doing ES flattening for fills as well as strokes. --- src/cpu_shader/euler.rs | 69 ++++++++---------- src/cpu_shader/flatten.rs | 149 +++++++++++++++++++++++++++----------- 2 files changed, 137 insertions(+), 81 deletions(-) diff --git a/src/cpu_shader/euler.rs b/src/cpu_shader/euler.rs index 3ce1b3218..ca3734f15 100644 --- a/src/cpu_shader/euler.rs +++ b/src/cpu_shader/euler.rs @@ -31,46 +31,26 @@ pub struct EulerSeg { } impl CubicParams { - pub fn from_cubic(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2) -> Self { - let chord = p3 - p0; - // TODO: if chord is 0, we have a problem - //assert!(chord.length() > 1e-9);//ROBUST_EPSILON); - let d01 = p1 - p0; - // TODO: This handles coincident control points. Clean this up - let d01 = if d01.length() < 1e-9 { - let d02 = p2 - p1; - if d02.length() < 1e-9 { - chord - } else { - d02 - } - } else { - d01 - }; + /// Compute parameters from endpoints and derivatives. + pub fn from_points_derivs(p0: Vec2, p1: Vec2, q0: Vec2, q1: Vec2, dt: f32) -> Self { + let chord = p1 - p0; + // Robustness note: we must protect this function from being called when the + // chord length is (near-)zero. 
+ let scale = dt / chord.length_squared(); let h0 = Vec2::new( - d01.x * chord.x + d01.y * chord.y, - d01.y * chord.x - d01.x * chord.y, + q0.x * chord.x + q0.y * chord.y, + q0.y * chord.x - q0.x * chord.y, ); let th0 = h0.atan2(); - let d0 = h0.length() / chord.length_squared(); - let d23 = p3 - p2; - // TODO: This handles coincident control points. Clean this up - let d23 = if d23.length() < 1e-9 { - let d13 = p3 - p1; - if d13.length() < 1e-9 { - chord - } else { - d13 - } - } else { - d23 - }; + let d0 = h0.length() * scale; let h1 = Vec2::new( - d23.x * chord.x + d23.y * chord.y, - d23.x * chord.y - d23.y * chord.x, + q1.x * chord.x + q1.y * chord.y, + q1.x * chord.y - q1.y * chord.x, ); let th1 = h1.atan2(); - let d1 = h1.length() / chord.length_squared(); + let d1 = h1.length() * scale; + // Robustness note: we may want to clamp the magnitude of the angles to + // a bit less than pi. Perhaps here, perhaps downstream. CubicParams { th0, th1, d0, d1 } } @@ -80,19 +60,28 @@ impl CubicParams { // by chord. pub fn est_euler_err(&self) -> f32 { // Potential optimization: work with unit vector rather than angle - // TODO: preventing division by zero here but may need to work around this in a better way. - let e0 = (2. / 3.) / (1.0 + self.th0.cos()).max(1e-9); - let e1 = (2. / 3.) / (1.0 + self.th1.cos()).max(1e-9); + let cth0 = self.th0.cos(); + let cth1 = self.th1.cos(); + if cth0 * cth1 < 0.0 { + // Rationale: this happens when fitting a cusp or near-cusp with + // a near 180 degree u-turn. The actual ES is bounded in that case. + // Further subdivision won't reduce the angles if actually a cusp. + return 2.0; + } + let e0 = (2. / 3.) / (1.0 + cth0); + let e1 = (2. / 3.) / (1.0 + cth1); let s0 = self.th0.sin(); let s1 = self.th1.sin(); - let s01 = (s0 + s1).sin(); + // Note: some other versions take sin of s0 + s1 instead. Those are incorrect. + // Strangely, calibration is the same, but more work could be done. + let s01 = cth0 * s1 + cth1 * s0; let amin = 0.15 * (2. * e0 * s0 + 2. * e1 * s1 - e0 * e1 * s01); let a = 0.15 * (2. * self.d0 * s0 + 2. * self.d1 * s1 - self.d0 * self.d1 * s01); let aerr = (a - amin).abs(); let symm = (self.th0 + self.th1).abs(); let asymm = (self.th0 - self.th1).abs(); let dist = (self.d0 - e0).hypot(self.d1 - e1); - let ctr = 3.7e-6 * symm.powi(5) + 6e-3 * asymm * symm.powi(2); + let ctr = 4.625e-6 * symm.powi(5) + 7.5e-3 * asymm * symm.powi(2); let halo_symm = 5e-3 * symm * dist; let halo_asymm = 7e-2 * asymm * dist; /* @@ -111,7 +100,7 @@ impl CubicParams { println!(" halo_symm: {halo_symm}"); println!(" halo_asymm: {halo_asymm}"); */ - 1.25 * ctr + 1.55 * aerr + halo_symm + halo_asymm + ctr + 1.55 * aerr + halo_symm + halo_asymm } } diff --git a/src/cpu_shader/flatten.rs b/src/cpu_shader/flatten.rs index 89f5e96ec..e6215b7ce 100644 --- a/src/cpu_shader/flatten.rs +++ b/src/cpu_shader/flatten.rs @@ -1,6 +1,8 @@ // Copyright 2023 The Vello Authors // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense +use std::f32::consts::FRAC_1_SQRT_2; + use crate::cpu_dispatch::CpuBinding; use super::{ @@ -41,6 +43,11 @@ struct SubdivResult { a2: f32, } +/// Threshold below which a derivative is considered too small. +const DERIV_THRESH: f32 = 1e-6; +/// Amount to nudge t when derivative is near-zero. 
+const DERIV_EPS: f32 = 1e-6; + fn estimate_subdiv(p0: Vec2, p1: Vec2, p2: Vec2, sqrt_tol: f32) -> SubdivResult { let d01 = p1 - p0; let d12 = p2 - p1; @@ -81,6 +88,17 @@ fn eval_cubic(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2, t: f32) -> Vec2 { p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t } +/// Evaluate both the point and derivative of a cubic bezier. +fn eval_cubic_and_deriv(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2, t: f32) -> (Vec2, Vec2) { + let m = 1.0 - t; + let mm = m * m; + let mt = m * t; + let tt = t * t; + let p = p0 * (mm * m) + (p1 * (3.0 * mm) + p2 * (3.0 * mt) + p3 * tt) * t; + let q = (p1 - p0) * mm + (p2 - p1) * (2.0 * mt) + (p3 - p2) * tt; + (p, q) +} + fn eval_quad_tangent(p0: Vec2, p1: Vec2, p2: Vec2, t: f32) -> Vec2 { let dp0 = 2. * (p1 - p0); let dp1 = 2. * (p2 - p1); @@ -128,24 +146,6 @@ fn cubic_end_normal(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2) -> Vec2 { Vec2::new(-tangent.y, tangent.x) } -fn cubic_subsegment(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2, t0: f32, t1: f32) -> CubicPoints { - let scp0 = eval_cubic(p0, p1, p2, p3, t0); - let scp3 = eval_cubic(p0, p1, p2, p3, t1); - // Compute the derivate curve - let qp0 = 3. * (p1 - p0); - let qp1 = 3. * (p2 - p1); - let qp2 = 3. * (p3 - p2); - let scale = (t1 - t0) * (1.0 / 3.0); - let scp1 = scp0 + scale * eval_quad(qp0, qp1, qp2, t0); - let scp2 = scp3 - scale * eval_quad(qp0, qp1, qp2, t1); - CubicPoints { - p0: scp0, - p1: scp1, - p2: scp2, - p3: scp3, - } -} - fn check_colinear(p0: Vec2, p1: Vec2, p2: Vec2) -> bool { (p1 - p0).cross(p2 - p0).abs() < 1e-9 } @@ -388,6 +388,16 @@ fn flatten_arc( output_line(path_ix, p0, p1, line_ix, bbox, lines); } +/// A robustness strategy for the ESPC integral +enum EspcRobust { + /// Both k1 and dist are large enough to divide by robustly. + Normal, + /// k1 is low, so model curve as a circular arc. + LowK1, + /// dist is low, so model curve as just an Euler spiral. + LowDist, +} + fn flatten_euler( cubic: &CubicPoints, path_ix: u32, @@ -485,10 +495,15 @@ fn flatten_euler( return; } - let tol: f32 = 0.01; - let scaled_sqrt_tol = (tol / scale).sqrt(); + let tol: f32 = 0.25; let mut t0_u: u32 = 0; let mut dt: f32 = 1.; + let mut last_p = p0; + let mut last_q = p1 - p0; + if last_q.length_squared() < DERIV_THRESH.powi(2) { + last_q = eval_cubic_and_deriv(p0, p1, p2, p3, DERIV_EPS).1; + } + let mut last_t = 0.; loop { if dt < ROBUST_EPSILON { @@ -500,38 +515,76 @@ fn flatten_euler( } log!("@@@ loop1: t0: {t0}, dt: {dt}"); loop { - let t1 = t0 + dt; - // Subdivide into cubics - let subcubic = cubic_subsegment(p0, p1, p2, p3, t0, t1); + let mut t1 = t0 + dt; + let this_p0 = last_p; + let this_q0 = last_q; + let (mut this_p1, mut this_q1) = eval_cubic_and_deriv(p0, p1, p2, p3, t1); + if this_q1.length_squared() < DERIV_THRESH.powi(2) { + let (new_p1, new_q1) = eval_cubic_and_deriv(p0, p1, p2, p3, t1 - DERIV_EPS); + this_q1 = new_q1; + if t1 < 1. 
{ + this_p1 = new_p1; + t1 = t1 - DERIV_EPS; + } + } + let actual_dt = t1 - last_t; let cubic_params = - CubicParams::from_cubic(subcubic.p0, subcubic.p1, subcubic.p2, subcubic.p3); + CubicParams::from_points_derivs(this_p0, this_p1, this_q0, this_q1, actual_dt); let est_err = cubic_params.est_euler_err(); - let err = est_err * (subcubic.p0 - subcubic.p3).length(); + let chord_len = (this_p1 - this_p0).length(); + let err = est_err * chord_len; log!("@@@ loop2: sub:{:?}, {:?} t0: {t0}, t1: {t1}, dt: {dt}, est_err: {est_err}, err: {err}", subcubic, cubic_params); - if err <= scaled_sqrt_tol { + if err * scale <= tol { log!("@@@ error within tolerance"); t0_u += 1; let shift = t0_u.trailing_zeros(); t0_u >>= shift; dt *= (1 << shift) as f32; let euler_params = EulerParams::from_angles(cubic_params.th0, cubic_params.th1); - let es = EulerSeg::from_params(subcubic.p0, subcubic.p3, euler_params); + let es = EulerSeg::from_params(this_p0, this_p1, euler_params); - let es_scale = (es.p0 - es.p1).length(); let (k0, k1) = (es.params.k0 - 0.5 * es.params.k1, es.params.k1); + // TODO (raph, important): figure out how scale applies. // compute forward integral to determine number of subdivisions - let dist_scaled = offset * scale / es_scale; - let a = -2.0 * dist_scaled * k1; - let b = -1.0 - 2.0 * dist_scaled * k0; - let int0 = espc_int_approx(b); - let int1 = espc_int_approx(a + b); - let integral = int1 - int0; - let k_peak = k0 - k1 * b / a; - let integrand_peak = (k_peak * (k_peak * dist_scaled + 1.0)).abs().sqrt(); - let scaled_int = integral * integrand_peak / a; - let n_frac = 0.5 * (es_scale / scaled_sqrt_tol).sqrt() * scaled_int; - let n = n_frac.ceil(); + let dist_scaled = offset * es.params.ch / chord_len; + // The number of subdivisions for curvature = 1 + let scale_multiplier = + 0.5 * FRAC_1_SQRT_2 * (scale * chord_len / (es.params.ch * tol)).sqrt(); + // TODO: tune these thresholds + const K1_THRESH: f32 = 1e-3; + const DIST_THRESH: f32 = 1e-3; + let mut a = 0.0; + let mut b = 0.0; + let mut integral = 0.0; + let mut int0 = 0.0; + let (n_frac, robust) = if k1.abs() < K1_THRESH { + let k = k0 + 0.5 * k1; + let n_frac = (k * (k * dist_scaled + 1.0)).abs().sqrt(); + (n_frac, EspcRobust::LowK1) + } else if dist_scaled.abs() < DIST_THRESH { + let f = |x: f32| x * x.abs().sqrt(); + a = k1; + b = k0; + int0 = f(b); + let int1 = f(a + b); + integral = int1 - int0; + //println!("int0={int0}, int1={int1} a={a} b={b}"); + let n_frac = (2. / 3.) 
* integral / a; + (n_frac, EspcRobust::LowDist) + } else { + a = -2.0 * dist_scaled * k1; + b = -1.0 - 2.0 * dist_scaled * k0; + int0 = espc_int_approx(b); + let int1 = espc_int_approx(a + b); + integral = int1 - int0; + let k_peak = k0 - k1 * b / a; + let integrand_peak = (k_peak * (k_peak * dist_scaled + 1.0)).abs().sqrt(); + let scaled_int = integral * integrand_peak / a; + let n_frac = scaled_int; + (n_frac, EspcRobust::Normal) + }; + let n = (n_frac * scale_multiplier).ceil().max(1.0); // Flatten line segments log!("@@@ loop2: lines: {n}"); @@ -544,8 +597,19 @@ fn flatten_euler( let mut lp0 = es.eval_with_offset(0., offset); for i in 0..n as usize { let t = (i + 1) as f32 / n; - let inv = espc_int_inv_approx(integral * t + int0); - let s = (inv - b) / a; + let s = match robust { + EspcRobust::LowK1 => t, + // Note opportunities to minimize divergence + EspcRobust::LowDist => { + let c = (integral * t + int0).cbrt(); + let inv = c * c.abs(); + (inv - b) / a + } + EspcRobust::Normal => { + let inv = espc_int_inv_approx(integral * t + int0); + (inv - b) / a + } + }; let lp1 = es.eval_with_offset(s, offset); let l0 = if offset > 0. { lp0 } else { lp1 }; let l1 = if offset > 0. { lp1 } else { lp0 }; @@ -555,6 +619,9 @@ fn flatten_euler( lp0 = lp1; } } + last_p = this_p1; + last_q = this_q1; + last_t = t1; break; } t0_u = t0_u.saturating_mul(2); From 1e18875aefe86d80693214d9eff4c0c0fba6b776 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Sun, 25 Feb 2024 17:17:46 -0800 Subject: [PATCH 03/16] Checkpoint This pipes through the start/end points (always using them as endpoints) and always uses Euler flattening. It seems to basically work for fills, but has lots of artifacts still for strokes. --- src/cpu_shader/flatten.rs | 158 +++++++++++++++++--------------------- 1 file changed, 72 insertions(+), 86 deletions(-) diff --git a/src/cpu_shader/flatten.rs b/src/cpu_shader/flatten.rs index e6215b7ce..61711b4f1 100644 --- a/src/cpu_shader/flatten.rs +++ b/src/cpu_shader/flatten.rs @@ -404,6 +404,8 @@ fn flatten_euler( local_to_device: &Transform, offset: f32, is_line: bool, + start_p: Vec2, + end_p: Vec2, line_ix: &mut usize, lines: &mut [LineSoup], bbox: &mut IntBbox, @@ -432,6 +434,11 @@ fn flatten_euler( local_to_device.clone(), ) }; + let (t_start, t_end) = if offset == 0.0 { + (p0, p3) + } else { + (start_p, end_p) + }; // Special-case lines. // We still have to handle colinear cubic parameters. We are special casing the line-to @@ -441,12 +448,10 @@ fn flatten_euler( if is_line { let tan = p3 - p0; if tan.length() > ROBUST_EPSILON { - let tan_norm = tan.normalize(); - let n = Vec2::new(-tan_norm.y, tan_norm.x); - let lp0 = p0 + n * offset; - let lp1 = p3 + n * offset; - let l0 = if offset > 0. { lp0 } else { lp1 }; - let l1 = if offset > 0. { lp1 } else { lp0 }; + let lp0 = t_start; + let lp1 = t_end; + let l0 = if offset >= 0. { lp0 } else { lp1 }; + let l1 = if offset >= 0. 
{ lp1 } else { lp0 }; log!("@@@ output line: {:?}, {:?}, tan: {:?}", l0, l1, tan); output_line_with_transform(path_ix, l0, l1, &transform, line_ix, lines, bbox); } else { @@ -455,46 +460,6 @@ fn flatten_euler( return; } - // Special-case colinear cubic segments - // TODO: clean this up - if check_colinear(p0, p1, p3) - && check_colinear(p0, p2, p3) - && check_colinear(p0, p1, p2) - && check_colinear(p1, p2, p3) - { - let (start, end) = { - let distances = [ - ((p1 - p0).length(), p1, p0), - ((p2 - p0).length(), p2, p0), - ((p3 - p0).length(), p3, p0), - ((p2 - p1).length(), p2, p1), - ((p3 - p1).length(), p3, p1), - ((p3 - p2).length(), p3, p2), - ]; - let mut longest = distances[0]; - for d in &distances[1..] { - if d.0 > longest.0 { - longest = *d; - } - } - (longest.1, longest.2) - }; - let tan = end - start; - if tan.length() > ROBUST_EPSILON { - let tan_norm = tan.normalize(); - let n = Vec2::new(-tan_norm.y, tan_norm.x); - let lp0 = start + n * offset; - let lp1 = end + n * offset; - let l0 = if offset > 0. { lp0 } else { lp1 }; - let l1 = if offset > 0. { lp1 } else { lp0 }; - log!("@@@ output line: {:?}, {:?}, tan: {:?}", l0, l1, tan); - output_line_with_transform(path_ix, l0, l1, &transform, line_ix, lines, bbox); - } else { - log!("@@@ drop line: {:?}, {:?}, tan: {:?}", start, end, tan); - } - return; - } - let tol: f32 = 0.25; let mut t0_u: u32 = 0; let mut dt: f32 = 1.; @@ -506,6 +471,7 @@ fn flatten_euler( let mut last_t = 0.; loop { + // TODO: this isn't right, it drops nasty lines; we should just limit subdivision if dt < ROBUST_EPSILON { break; } @@ -594,25 +560,34 @@ fn flatten_euler( // Skip the segment if `n` is NaN. This is for debugging purposes only log!("@@@ NaN: parameters:\n es: {:#?}\n k0: {k0}, k1: {k1}\n dist_scaled: {dist_scaled}\n es_scale: {es_scale}\n a: {a}\n b: {b}\n int0: {int0}, int1: {int1}, integral: {integral}\n k_peak: {k_peak}\n integrand_peak: {integrand_peak}\n scaled_int: {scaled_int}\n n_frac: {n_frac}", es); } else { - let mut lp0 = es.eval_with_offset(0., offset); + let mut lp0 = if t0 == 0.0 { + t_start + } else { + es.eval_with_offset(0.0, offset) + }; for i in 0..n as usize { - let t = (i + 1) as f32 / n; - let s = match robust { - EspcRobust::LowK1 => t, - // Note opportunities to minimize divergence - EspcRobust::LowDist => { - let c = (integral * t + int0).cbrt(); - let inv = c * c.abs(); - (inv - b) / a - } - EspcRobust::Normal => { - let inv = espc_int_inv_approx(integral * t + int0); - (inv - b) / a - } - }; - let lp1 = es.eval_with_offset(s, offset); - let l0 = if offset > 0. { lp0 } else { lp1 }; - let l1 = if offset > 0. { lp1 } else { lp0 }; + let lp1; + if i == n as usize - 1 && t1 == 1.0 { + lp1 = t_end; + } else { + let t = (i + 1) as f32 / n; + let s = match robust { + EspcRobust::LowK1 => t, + // Note opportunities to minimize divergence + EspcRobust::LowDist => { + let c = (integral * t + int0).cbrt(); + let inv = c * c.abs(); + (inv - b) / a + } + EspcRobust::Normal => { + let inv = espc_int_inv_approx(integral * t + int0); + (inv - b) / a + } + }; + lp1 = es.eval_with_offset(s, offset); + } + let l0 = if offset >= 0. { lp0 } else { lp1 }; + let l1 = if offset >= 0. 
{ lp1 } else { lp0 }; output_line_with_transform( path_ix, l0, l1, &transform, line_ix, lines, bbox, ); @@ -970,27 +945,6 @@ fn flatten_main( } } else { let is_line = seg_type == PATH_TAG_LINETO; - // Render offset curves - flatten_euler( - &pts, - path_ix, - &transform, - offset, - is_line, - &mut line_ix, - lines, - &mut bbox, - ); - flatten_euler( - &pts, - path_ix, - &transform, - -offset, - is_line, - &mut line_ix, - lines, - &mut bbox, - ); // Read the neighboring segment. let neighbor = @@ -1019,6 +973,33 @@ fn flatten_main( let n_next = offset * Vec2::new(-tan_next_norm.y, tan_next_norm.x); log!("@ tan_prev: {:#?}", tan_prev); log!("@ tan_next: {:#?}", tan_next); + + // Render offset curves + flatten_euler( + &pts, + path_ix, + &transform, + offset, + is_line, + pts.p0 + n_prev, + pts.p3 + n_prev, + &mut line_ix, + lines, + &mut bbox, + ); + flatten_euler( + &pts, + path_ix, + &transform, + -offset, + is_line, + pts.p0 - n_prev, + pts.p3 - n_prev, + &mut line_ix, + lines, + &mut bbox, + ); + if neighbor.do_join { draw_join( path_ix, @@ -1050,11 +1031,16 @@ fn flatten_main( } } } else { - flatten_cubic( + // TODO: determine if we still need this; we shouldn't for correctness + let is_line = seg_type == PATH_TAG_LINETO; + flatten_euler( &pts, path_ix, &transform, /*offset*/ 0., + is_line, + pts.p0, + pts.p3, &mut line_ix, lines, &mut bbox, From 73e2c98c50ad26d671f9d25e094f68265845a1cc Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Sun, 25 Feb 2024 18:12:22 -0800 Subject: [PATCH 04/16] Don't special case lines Drop zero-length lines. This doesn't get rid of all the code, but it shouldn't be needed for correctness. In addition, zero length lines are dropped. We need to come back to this, they do have some semantics for strokes. --- src/cpu_shader/flatten.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/cpu_shader/flatten.rs b/src/cpu_shader/flatten.rs index 61711b4f1..53436840e 100644 --- a/src/cpu_shader/flatten.rs +++ b/src/cpu_shader/flatten.rs @@ -440,6 +440,14 @@ fn flatten_euler( (start_p, end_p) }; + // Drop zero length lines. + // TODO: in some stroke cases we should keep these. Should it be handled while + // encoding or here? + if p0 == p1 && p0 == p2 && p0 == p3 { + return; + } + + /* // Special-case lines. // We still have to handle colinear cubic parameters. We are special casing the line-to // encoding because floating point errors in the degree raise can cause some line-tos to slip @@ -459,6 +467,7 @@ fn flatten_euler( } return; } + */ let tol: f32 = 0.25; let mut t0_u: u32 = 0; From 0dff81e3431900f38f82bb0f364d148ee5e0e961 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Sun, 25 Feb 2024 20:25:07 -0800 Subject: [PATCH 05/16] Force style for each encoded glyph --- crates/encoding/src/glyph_cache.rs | 3 +++ src/cpu_shader/flatten.rs | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/crates/encoding/src/glyph_cache.rs b/crates/encoding/src/glyph_cache.rs index b83fd0a79..72e95423b 100644 --- a/crates/encoding/src/glyph_cache.rs +++ b/crates/encoding/src/glyph_cache.rs @@ -49,6 +49,9 @@ impl GlyphCache { Style::Fill(fill) => *fill, Style::Stroke(_) => Fill::NonZero, }; + // Make sure each glyph gets encoded with a style. 
+ // TODO: can probably optimize by setting style per run + encoding_cache.force_next_transform_and_style(); encoding_cache.encode_fill_style(fill); let mut path = encoding_cache.encode_path(true); let outline = outlines.get(GlyphId::new(key.glyph_id as u16))?; diff --git a/src/cpu_shader/flatten.rs b/src/cpu_shader/flatten.rs index 53436840e..ecce74eff 100644 --- a/src/cpu_shader/flatten.rs +++ b/src/cpu_shader/flatten.rs @@ -114,9 +114,9 @@ fn cubic_start_tangent(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2) -> Vec2 { let d01 = p1 - p0; let d02 = p2 - p0; let d03 = p3 - p0; - if d01.dot(d01) > ROBUST_EPSILON { + if d01.length_squared() > ROBUST_EPSILON { d01 - } else if d02.dot(d02) > ROBUST_EPSILON { + } else if d02.length_squared() > ROBUST_EPSILON { d02 } else { d03 @@ -127,9 +127,9 @@ fn cubic_end_tangent(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2) -> Vec2 { let d23 = p3 - p2; let d13 = p3 - p1; let d03 = p3 - p0; - if d23.dot(d23) > ROBUST_EPSILON { + if d23.length_squared() > ROBUST_EPSILON { d23 - } else if d13.dot(d13) > ROBUST_EPSILON { + } else if d13.length_squared() > ROBUST_EPSILON { d13 } else { d03 From 932b2546f2823e73dc9de5489dc446da39e3629d Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Sun, 25 Feb 2024 20:52:30 -0800 Subject: [PATCH 06/16] Better continuity of flattening Reuse endpoints of Euler segments. In theory, this should ensure watertight meshes, but we're definitely not seeing it. --- src/cpu_shader/flatten.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/cpu_shader/flatten.rs b/src/cpu_shader/flatten.rs index ecce74eff..ebcfdf9f5 100644 --- a/src/cpu_shader/flatten.rs +++ b/src/cpu_shader/flatten.rs @@ -478,6 +478,7 @@ fn flatten_euler( last_q = eval_cubic_and_deriv(p0, p1, p2, p3, DERIV_EPS).1; } let mut last_t = 0.; + let mut lp0 = t_start; loop { // TODO: this isn't right, it drops nasty lines; we should just limit subdivision @@ -569,11 +570,6 @@ fn flatten_euler( // Skip the segment if `n` is NaN. This is for debugging purposes only log!("@@@ NaN: parameters:\n es: {:#?}\n k0: {k0}, k1: {k1}\n dist_scaled: {dist_scaled}\n es_scale: {es_scale}\n a: {a}\n b: {b}\n int0: {int0}, int1: {int1}, integral: {integral}\n k_peak: {k_peak}\n integrand_peak: {integrand_peak}\n scaled_int: {scaled_int}\n n_frac: {n_frac}", es); } else { - let mut lp0 = if t0 == 0.0 { - t_start - } else { - es.eval_with_offset(0.0, offset) - }; for i in 0..n as usize { let lp1; if i == n as usize - 1 && t1 == 1.0 { From c3285fa9be324e0ddf04236e460ad9f6414e93ff Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Sun, 25 Feb 2024 22:32:59 -0800 Subject: [PATCH 07/16] Fix end normals Calculation of end normals for stroke offset was mistaken. At this point, it's starting to look fairly good. Some of the tricky strokes need investigation, and handling of zero-length lines is still a bit dodgy. 
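A minimal standalone sketch of the corrected computation (the helper
offset_normal, the array layout, and the example values are illustrative,
not part of this patch): each endpoint of the offset curve is displaced
along the normal of its own tangent, the start tangent at p0 and the end
tangent at p3, rather than reusing the end-tangent normal for both, which
was the mistake fixed here.

    // Hypothetical helper, not part of the patch: offset normal from a tangent.
    fn offset_normal(tan: [f32; 2], offset: f32) -> [f32; 2] {
        let len = (tan[0] * tan[0] + tan[1] * tan[1]).sqrt();
        // Rotate the unit tangent 90 degrees counterclockwise, scale by offset.
        [-tan[1] / len * offset, tan[0] / len * offset]
    }

    fn main() {
        let (tan_start, tan_prev) = ([1.0, 0.0], [0.0, 1.0]);
        let offset = 2.0;
        let n_start = offset_normal(tan_start, offset);
        let n_prev = offset_normal(tan_prev, offset);
        // Offset curve endpoints: p0 + n_start and p3 + n_prev, with the
        // mirrored pair p0 - n_start and p3 - n_prev for the other side.
        println!("n_start = {n_start:?}, n_prev = {n_prev:?}");
    }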
--- src/cpu_shader/flatten.rs | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/cpu_shader/flatten.rs b/src/cpu_shader/flatten.rs index ebcfdf9f5..86f4cb4a3 100644 --- a/src/cpu_shader/flatten.rs +++ b/src/cpu_shader/flatten.rs @@ -674,9 +674,11 @@ fn draw_join( match style_flags & Style::FLAGS_JOIN_MASK { Style::FLAGS_JOIN_BITS_BEVEL => { - output_two_lines_with_transform( - path_ix, front0, front1, back0, back1, transform, line_ix, lines, bbox, - ); + if front0 != front1 && back0 != back1 { + output_two_lines_with_transform( + path_ix, front0, front1, back0, back1, transform, line_ix, lines, bbox, + ); + } } Style::FLAGS_JOIN_BITS_MITER => { let hypot = cr.hypot(d); @@ -956,6 +958,8 @@ fn flatten_main( read_neighboring_segment(ix + 1, pathtags, pathdata, tag_monoids); let tan_prev = cubic_end_tangent(pts.p0, pts.p1, pts.p2, pts.p3); let tan_next = neighbor.tangent; + let tan_start = cubic_start_tangent(pts.p0, pts.p1, pts.p2, pts.p3); + // TODO: be consistent w/ robustness here // TODO: add NaN assertions to CPU shaders PR (when writing lines) // TODO: not all zero-length segments are getting filtered out @@ -972,6 +976,7 @@ fn flatten_main( tan_next }; + let n_start = offset * Vec2::new(-tan_start.y, tan_start.x).normalize(); let offset_tangent = offset * tan_prev.normalize(); let n_prev = Vec2::new(-offset_tangent.y, offset_tangent.x); let tan_next_norm = tan_next.normalize(); @@ -986,7 +991,7 @@ fn flatten_main( &transform, offset, is_line, - pts.p0 + n_prev, + pts.p0 + n_start, pts.p3 + n_prev, &mut line_ix, lines, @@ -998,7 +1003,7 @@ fn flatten_main( &transform, -offset, is_line, - pts.p0 - n_prev, + pts.p0 - n_start, pts.p3 - n_prev, &mut line_ix, lines, From d7325aa7df588d295f6bdf679f83728f73917ec7 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 5 Mar 2024 17:00:40 -0800 Subject: [PATCH 08/16] Checkpoint WGSL work Euler flattening seems to be almost working as long as subdivision is disabled, at least for fills. This commit has that disabled. --- shader/flatten.wgsl | 411 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 404 insertions(+), 7 deletions(-) diff --git a/shader/flatten.wgsl b/shader/flatten.wgsl index 65cb6545a..7ce14940c 100644 --- a/shader/flatten.wgsl +++ b/shader/flatten.wgsl @@ -53,6 +53,211 @@ fn approx_parabola_inv_integral(x: f32) -> f32 { return x * sqrt(1.0 - B + (B * B + 0.5 * x * x)); } +// Functions for Euler spirals + +struct CubicParams { + th0: f32, + th1: f32, + d0: f32, + d1: f32, +} + +struct EulerParams { + th0: f32, + th1: f32, + k0: f32, + k1: f32, + ch: f32, +} + +struct EulerSeg { + p0: vec2f, + p1: vec2f, + params: EulerParams, +} + +/// Compute cubic parameters from endpoints and derivatives. +fn cubic_from_points_derivs(p0: vec2f, p1: vec2f, q0: vec2f, q1: vec2f, dt: f32) -> CubicParams { + let chord = p1 - p0; + let scale = dt / dot(chord, chord); + let h0 = vec2(q0.x * chord.x + q0.y * chord.y, q0.y * chord.x - q0.x * chord.y); + let th0 = atan2(h0.y, h0.x); + let d0 = length(h0) * scale; + let h1 = vec2(q1.x * chord.x + q1.y * chord.y, q1.x * chord.y - q1.y * chord.x); + let th1 = atan2(h1.y, h1.x); + let d1 = length(h1) * scale; + return CubicParams(th0, th1, d0, d1); +} + +// Estimate error of geometric Hermite interpolation to Euler spiral. +fn est_euler_err(cparams: CubicParams) -> f32 { + let cth0 = cos(cparams.th0); + let cth1 = cos(cparams.th1); + if cth0 * cth1 < 0.0 { + return 2.0; + } + let e0 = (2. / 3.) / (1.0 + cth0); + let e1 = (2. / 3.) 
/ (1.0 + cth1); + let s0 = sin(cparams.th0); + let s1 = sin(cparams.th1); + let s01 = cth0 * s1 + cth1 * s0; + let amin = 0.15 * (2. * e0 * s0 + 2. * e1 * s1 - e0 * e1 * s01); + let a = 0.15 * (2. * cparams.d0 * s0 + 2. * cparams.d1 * s1 - cparams.d0 * cparams.d1 * s01); + let aerr = abs(a - amin); + let symm = abs(cparams.th0 + cparams.th1); + let asymm = abs(cparams.th0 - cparams.th1); + let dist = length(vec2(cparams.d0 - e0, cparams.d1 - e1)); + let symm2 = symm * symm; + let ctr = (4.625e-6 * symm * symm2 + 7.5e-3 * asymm) * symm2; + let halo = (5e-3 * symm + 7e-2 * asymm) * dist; + return ctr + 1.55 * aerr + halo; +} + +fn es_params_from_angles(th0: f32, th1: f32) -> EulerParams { + let k0 = th0 + th1; + let dth = th1 - th0; + let d2 = dth * dth; + let k2 = k0 * k0; + var a = 6.0; + a -= d2 * (1. / 70.); + a -= (d2 * d2) * (1. / 10780.); + a += (d2 * d2 * d2) * 2.769178184818219e-07; + let b = -0.1 + d2 * (1. / 4200.) + d2 * d2 * 1.6959677820260655e-05; + let c = -1. / 1400. + d2 * 6.84915970574303e-05 - k2 * 7.936475029053326e-06; + a += (b + c * k2) * k2; + let k1 = dth * a; + + // calculation of chord + var ch = 1.0; + ch -= d2 * (1. / 40.); + ch += (d2 * d2) * 0.00034226190482569864; + ch -= (d2 * d2 * d2) * 1.9349474568904524e-06; + let b_ = -1. / 24. + d2 * 0.0024702380951963226 - d2 * d2 * 3.7297408997537985e-05; + let c_ = 1. / 1920. - d2 * 4.87350869747975e-05 - k2 * 3.1001936068463107e-06; + ch += (b_ + c_ * k2) * k2; + return EulerParams(th0, th1, k0, k1, ch); +} + +fn es_params_eval_th(params: EulerParams, t: f32) -> f32 { + return (params.k0 + 0.5 * params.k1 * (t - 1.0)) * t - params.th0; +} + +// Integrate Euler spiral. +fn integ_euler_10(k0: f32, k1: f32) -> vec2f { + let t1_1 = k0; + let t1_2 = 0.5 * k1; + let t2_2 = t1_1 * t1_1; + let t2_3 = 2. * (t1_1 * t1_2); + let t2_4 = t1_2 * t1_2; + let t3_4 = t2_2 * t1_2 + t2_3 * t1_1; + let t3_6 = t2_4 * t1_2; + let t4_4 = t2_2 * t2_2; + let t4_5 = 2. * (t2_2 * t2_3); + let t4_6 = 2. * (t2_2 * t2_4) + t2_3 * t2_3; + let t4_7 = 2. * (t2_3 * t2_4); + let t4_8 = t2_4 * t2_4; + let t5_6 = t4_4 * t1_2 + t4_5 * t1_1; + let t5_8 = t4_6 * t1_2 + t4_7 * t1_1; + let t6_6 = t4_4 * t2_2; + let t6_7 = t4_4 * t2_3 + t4_5 * t2_2; + let t6_8 = t4_4 * t2_4 + t4_5 * t2_3 + t4_6 * t2_2; + let t7_8 = t6_6 * t1_2 + t6_7 * t1_1; + let t8_8 = t6_6 * t2_2; + var u = 1.; + u -= (1. / 24.) * t2_2 + (1. / 160.) * t2_4; + u += (1. / 1920.) * t4_4 + (1. / 10752.) * t4_6 + (1. / 55296.) * t4_8; + u -= (1. / 322560.) * t6_6 + (1. / 1658880.) * t6_8; + u += (1. / 92897280.) * t8_8; + var v = (1. / 12.) * t1_2; + v -= (1. / 480.) * t3_4 + (1. / 2688.) * t3_6; + v += (1. / 53760.) * t5_6 + (1. / 276480.) * t5_8; + v -= (1. / 11612160.) 
* t7_8; + return vec2(u, v); +} + +fn es_params_eval(params: EulerParams, t: f32) -> vec2f { + let thm = es_params_eval_th(params, t * 0.5); + let k0 = params.k0; + let k1 = params.k1; + let uv = integ_euler_10((k0 + k1 * (0.5 * t - 0.5)) * t, k1 * t * t); + let scale = t / params.ch; + let s = scale * sin(thm); + let c = scale * cos(thm); + let x = uv.x * c - uv.y * s; + let y = -uv.y * c - uv.x * s; + return vec2(x, y); +} + +fn es_params_eval_with_offset(params: EulerParams, t: f32, offset: f32) -> vec2f { + let th = es_params_eval_th(params, t); + let v = offset * vec2f(sin(th), cos(th)); + return es_params_eval(params, t) + v; +} + +fn es_seg_from_params(p0: vec2f, p1: vec2f, params: EulerParams) -> EulerSeg { + return EulerSeg(p0, p1, params); +} + +fn es_seg_eval_with_offset(es: EulerSeg, t: f32, offset: f32) -> vec2f { + let chord = es.p1 - es.p0; + let scaled = offset / length(chord); + let xy = es_params_eval_with_offset(es.params, t, scaled); + return es.p0 + vec2f(chord.x * xy.x - chord.y * xy.y, chord.x * xy.y + chord.y * xy.x); +} + +fn pow_1_5_signed(x: f32) -> f32 { + return x * sqrt(abs(x)); +} + +const BREAK1: f32 = 0.8; +const BREAK2: f32 = 1.25; +const BREAK3: f32 = 2.1; +const SIN_SCALE: f32 = 1.0976991822760038; +const QUAD_A1: f32 = 0.6406; +const QUAD_B1: f32 = -0.81; +const QUAD_C1: f32 = 0.9148117935952064; +const QUAD_A2: f32 = 0.5; +const QUAD_B2: f32 = -0.156; +const QUAD_C2: f32 = 0.16145779359520596; +const QUAD_W1: f32 = 0.5 * QUAD_B1 / QUAD_A1; +const QUAD_V1: f32 = 1.0 / QUAD_A1; +const QUAD_U1: f32 = QUAD_W1 * QUAD_W1 - QUAD_C1 / QUAD_A1; +const QUAD_W2: f32 = 0.5 * QUAD_B2 / QUAD_A2; +const QUAD_V2: f32 = 1.0 / QUAD_A2; +const QUAD_U2: f32 = QUAD_W2 * QUAD_W2 - QUAD_C2 / QUAD_A2; +const FRAC_PI_4: f32 = 0.7853981633974483; +const CBRT_9_8: f32 = 1.040041911525952; + +fn espc_int_approx(x: f32) -> f32 { + let y = abs(x); + var a: f32; + if y < BREAK1 { + a = sin(SIN_SCALE * y) * (1.0 / SIN_SCALE); + } else if y < BREAK2 { + a = (sqrt(8.0) / 3.0) * pow_1_5_signed(y - 1.0) + FRAC_PI_4; + } else { + let abc = select(vec3(QUAD_A2, QUAD_B2, QUAD_C2), vec3(QUAD_A1, QUAD_B1, QUAD_C1), y < BREAK3); + a = (abc.x * y + abc.y) * y + abc.z; + }; + return a * sign(x); +} + +fn espc_int_inv_approx(x: f32) -> f32 { + let y = abs(x); + var a: f32; + if y < 0.7010707591262915 { + a = asin(x * SIN_SCALE) * (1.0 / SIN_SCALE); + } else if y < 0.903249293595206 { + let b = y - FRAC_PI_4; + let u = pow(abs(b), 2. / 3.) * sign(b); + a = u * CBRT_9_8 + 1.0; + } else { + let uvw = select(vec3(QUAD_U2, QUAD_V2, QUAD_W2), vec3(QUAD_U1, QUAD_V1, QUAD_W1), y < 2.038857793595206); + a = sqrt(uvw.x + uvw.y * y) - uvw.z; + } + return a * sign(x); +} + // Notes on fractional subdivision: // -------------------------------- // The core of the existing flattening algorithm (see `flatten_cubic` below) is to approximate the @@ -107,6 +312,21 @@ fn eval_cubic(p0: vec2f, p1: vec2f, p2: vec2f, p3: vec2f, t: f32) -> vec2f { return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t; } +struct PointDeriv { + point: vec2f, + deriv: vec2f, +} + +fn eval_cubic_and_deriv(p0: vec2f, p1: vec2f, p2: vec2f, p3: vec2f, t: f32) -> PointDeriv { + let m = 1.0 - t; + let mm = m * m; + let mt = m * t; + let tt = t * t; + let p = p0 * (mm * m) + (p1 * (3.0 * mm) + p2 * (3.0 * mt) + p3 * tt) * t; + let q = (p1 - p0) * mm + (p2 - p1) * (2.0 * mt) + (p3 - p2) * tt; + return PointDeriv(p, q); +} + fn eval_quad_tangent(p0: vec2f, p1: vec2f, p2: vec2f, t: f32) -> vec2f { let dp0 = 2. 
* (p1 - p0); let dp1 = 2. * (p2 - p1); @@ -144,6 +364,177 @@ fn cubic_end_normal(p0: vec2f, p1: vec2f, p2: vec2f, p3: vec2f) -> vec2f { return vec2(-tangent.y, tangent.x); } +const ESPC_ROBUST_NORMAL = 0; +const ESPC_ROBUST_LOW_K1 = 1; +const ESPC_ROBUST_LOW_DIST = 2; + +// Threshold below which a derivative is considered too small. +const DERIV_THRESH: f32 = 1e-6; +// Amount to nudge t when derivative is near-zero. +const DERIV_EPS: f32 = 1e-6; +// Limit for subdivision of cubic Béziers. +const SUBDIV_LIMIT: f32 = 1.0 / 65536.0; +// Robust ESPC computation: below this value, treat curve as circular arc +const K1_THRESH: f32 = 1e-3; +// Robust ESPC: below this value, evaluate ES rather than parallel curve +const DIST_THRESH: f32 = 1e-3; + +// This function flattens a cubic Bézier by first converting it into Euler spiral +// segments, and then computes a near-optimal flattening of the parallel curves of +// the Euler spiral segments. +fn flatten_euler( + cubic: CubicPoints, + path_ix: u32, + local_to_device: Transform, + offset: f32, + start_p: vec2f, + end_p: vec2f, +) { + var p0: vec2f; + var p1: vec2f; + var p2: vec2f; + var p3: vec2f; + var scale: f32; + var transform: Transform; + var t_start = start_p; + var t_end = end_p; + if offset == 0. { + let t = local_to_device; + p0 = transform_apply(t, cubic.p0); + p1 = transform_apply(t, cubic.p1); + p2 = transform_apply(t, cubic.p2); + p3 = transform_apply(t, cubic.p3); + scale = 1.; + transform = transform_identity(); + t_start = p0; + t_end = p3; + } else { + p0 = cubic.p0; + p1 = cubic.p1; + p2 = cubic.p2; + p3 = cubic.p3; + + transform = local_to_device; + let mat = transform.mat; + scale = 0.5 * length(vec2(mat.x + mat.w, mat.y - mat.z)) + + length(vec2(mat.x - mat.w, mat.y + mat.z)); + } + + // Drop zero length lines. 
+ if all(p0 == p1) && all(p0 == p2) && all(p0 == p3) { + return; + } + //output_line_with_transform(path_ix, t_start, t_end, transform); + //if true { return; } + + let tol = 0.25; + var t0_u = 0u; + var dt = 1.0; + var last_p = p0; + var last_q = p1 - p0; + if dot(last_q, last_q) < DERIV_THRESH * DERIV_THRESH { + last_q = eval_cubic_and_deriv(p0, p1, p2, p3, DERIV_EPS).deriv; + } + var last_t = 0.0; + var lp0 = t_start; + loop { + let t0 = f32(t0_u) * dt; + if t0 == 1.0 { + break; + } + loop { + var t1 = t0 + dt; + let this_p0 = last_p; + let this_q0 = last_q; + var this_pq1 = eval_cubic_and_deriv(p0, p1, p2, p3, t1); + if dot(this_pq1.deriv, this_pq1.deriv) < DERIV_THRESH * DERIV_THRESH { + let new_pq1 = eval_cubic_and_deriv(p0, p1, p2, p3, t1 - DERIV_EPS); + this_pq1.deriv = new_pq1.deriv; + if t1 < 1.0 { + this_pq1.point = new_pq1.point; + t1 = t1 - DERIV_EPS; + } + } + let actual_dt = t1 - last_t; + let cubic_params = cubic_from_points_derivs(this_p0, this_pq1.point, this_q0, this_pq1.deriv, actual_dt); + let est_err = est_euler_err(cubic_params); + let chord_len = length(this_pq1.point - this_p0); + let err = est_err * chord_len; + if true || err * scale <= tol || dt <= SUBDIV_LIMIT { + t0_u += 1u; + let shift = countTrailingZeros(t0_u); + t0_u >>= shift; + dt = f32(1u << shift); + let euler_params = es_params_from_angles(cubic_params.th0, cubic_params.th1); + let es = es_seg_from_params(this_p0, this_pq1.point, euler_params); + let k0 = es.params.k0 - 0.5 * es.params.k1; + let k1 = es.params.k1; + let dist_scaled = offset * es.params.ch / chord_len; + let scale_multiplier = sqrt(0.125 * scale * chord_len / (es.params.ch * tol)); + var a = 0.0; + var b = 0.0; + var integral = 0.0; + var int0 = 0.0; + var n_frac: f32; + var robust = ESPC_ROBUST_NORMAL; + if abs(k1) < K1_THRESH { + let k = es.params.k0; + n_frac = sqrt(abs(k * (k * dist_scaled + 1.0))); + robust = ESPC_ROBUST_LOW_K1; + } else if abs(dist_scaled) < DIST_THRESH { + a = k1; + b = k0; + int0 = pow_1_5_signed(b); + let int1 = pow_1_5_signed(a + b); + integral = int1 - int0; + n_frac = (2. / 3.) * integral / a; + robust = ESPC_ROBUST_LOW_DIST; + } else { + a = -2.0 * dist_scaled * k1; + b = -1.0 - 2.0 * dist_scaled * k0; + int0 = espc_int_approx(b); + let int1 = espc_int_approx(a + b); + integral = int1 - int0; + let k_peak = k0 - k1 * b / a; + let integrand_peak = sqrt(abs(k_peak * (k_peak * dist_scaled + 1.0))); + n_frac = integral * integrand_peak / a; + } + let n = max(ceil(n_frac * scale_multiplier), 1.0); + for (var i = 0u; i < u32(n); i++) { + var lp1: vec2f; + if i + 1u == u32(n) && t1 == 1.0 { + lp1 = t_end; + } else { + let t = f32(i + 1u) / n; + var s = t; + if robust != ESPC_ROBUST_LOW_K1 { + let u = integral * t + int0; + var inv: f32; + if robust == ESPC_ROBUST_LOW_DIST { + inv = pow(abs(u), 2. / 3.) 
* sign(u); + } else { + inv = espc_int_inv_approx(u); + } + s = (inv - b) / a; + } + lp1 = es_seg_eval_with_offset(es, s, offset); + } + let l0 = select(lp1, lp0, offset >= 0.); + let l1 = select(lp0, lp1, offset >= 0.); + output_line_with_transform(path_ix, lp0, lp1, transform); + lp0 = lp1; + } + last_p = this_pq1.point; + last_q = this_pq1.deriv; + last_t = t1; + break; + } + t0_u = t0_u * 2u; + dt *= 0.5; + } + } +} + let MAX_QUADS = 16u; // This function flattens a cubic Bézier by first converting it into quadratics and @@ -362,10 +753,10 @@ fn draw_join( let d = dot(tan_prev, tan_next); switch style_flags & STYLE_FLAGS_JOIN_MASK { - case /*STYLE_FLAGS_JOIN_BEVEL*/0u: { + case STYLE_FLAGS_JOIN_BEVEL: { output_two_lines_with_transform(path_ix, front0, front1, back0, back1, transform); } - case /*STYLE_FLAGS_JOIN_MITER*/0x10000000u: { + case STYLE_FLAGS_JOIN_MITER: { let hypot = length(vec2f(cr, d)); let miter_limit = unpack2x16float(style_flags & STYLE_MITER_LIMIT_MASK)[0]; @@ -395,7 +786,7 @@ fn draw_join( write_line_with_transform(line_ix, path_ix, front0, front1, transform); write_line_with_transform(line_ix + 1u, path_ix, back0, back1, transform); } - case /*STYLE_FLAGS_JOIN_ROUND*/0x20000000u: { + case STYLE_FLAGS_JOIN_ROUND: { var arc0: vec2f; var arc1: vec2f; var other0: vec2f; @@ -656,16 +1047,21 @@ fn main( // Don't draw anything if the path is closed. } } else { - // Render offset curves - flatten_cubic(pts, path_ix, transform, offset); - // Read the neighboring segment. let neighbor = read_neighboring_segment(ix + 1u); let tan_prev = cubic_end_tangent(pts.p0, pts.p1, pts.p2, pts.p3); let tan_next = neighbor.tangent; + let tan_start = cubic_start_tangent(pts.p0, pts.p1, pts.p3, pts.p3); + // TODO: probably have special casing of zero tangents here + let n_start = offset * normalize(vec2(-tan_start.y, tan_start.x)); let offset_tangent = offset * normalize(tan_prev); let n_prev = offset_tangent.yx * vec2f(-1., 1.); let n_next = offset * normalize(tan_next).yx * vec2f(-1., 1.); + + // Render offset curves + flatten_euler(pts, path_ix, transform, offset, pts.p0 + n_start, pts.p3 + n_prev); + flatten_euler(pts, path_ix, transform, -offset, pts.p0 - n_start, pts.p3 - n_prev); + if neighbor.do_join { draw_join(path_ix, style_flags, pts.p3, tan_prev, tan_next, n_prev, n_next, transform); @@ -676,7 +1072,8 @@ fn main( } } } else { - flatten_cubic(pts, path_ix, transform, /*offset*/ 0.); + let offset = 0.; + flatten_euler(pts, path_ix, transform, offset, pts.p0, pts.p3); } // Update bounding box using atomics only. Computing a monoid is a // potential future optimization. From f256921e9c137ec56876f176a5370eaa33026be1 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Thu, 7 Mar 2024 13:36:21 -0800 Subject: [PATCH 09/16] Fix bugs Simple typo in subdivision logic (and re-enable subdivision, which was disabled in previous commit). Sign wrong in inverse integral. Typo in calculation of start tangent (repeated p3 twice instead of p2, p3). 
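As an illustration of the subdivision fix, here is a minimal standalone Rust sketch (not part of the patch) of the dyadic bookkeeping in question. The `acceptable` closure stands in for the real test (`err * scale <= tol || dt <= SUBDIV_LIMIT`); the loop skeleton mirrors `flatten_euler`. The invariant is `t0 == t0_u as f32 * dt`, with `dt` of the form `1 / 2^k`; on accepting a segment, trailing zeros of `t0_u` are popped so `dt` can grow back, which requires multiplying `dt` by `2^shift`. Assigning `dt = 2^shift` instead (the typo fixed here) breaks the invariant: after the first segment accepted at `t0 = 0`, `t0_u` becomes 1 and `dt` becomes 1.0, so the outer loop exits having flattened only the first sub-interval.

    // Illustrative sketch of the adaptive, dyadic subdivision loop.
    fn main() {
        // Stand-in for the error test; pretend only intervals of 1/4 or less pass.
        let acceptable = |dt: f32| dt <= 0.25;
        let mut t0_u: u32 = 0;
        let mut dt: f32 = 1.0;
        loop {
            let t0 = t0_u as f32 * dt;
            if t0 == 1.0 {
                break;
            }
            loop {
                let t1 = t0 + dt;
                if acceptable(dt) {
                    // Accept [t0, t1]: advance, then pop completed dyadic
                    // intervals so dt can grow back toward 1.
                    t0_u += 1;
                    let shift = t0_u.trailing_zeros();
                    t0_u >>= shift;
                    dt *= (1u32 << shift) as f32; // the fix: multiply, don't assign
                    println!("flatten [{t0}, {t1}]");
                    break;
                }
                // Reject: halve the interval and retry (t0 is unchanged).
                t0_u *= 2;
                dt *= 0.5;
            }
        }
    }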
Note: also enables GPU-side stroking --- shader/flatten.wgsl | 10 +++++----- src/scene.rs | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/shader/flatten.wgsl b/shader/flatten.wgsl index 7ce14940c..0a34d4584 100644 --- a/shader/flatten.wgsl +++ b/shader/flatten.wgsl @@ -246,7 +246,7 @@ fn espc_int_inv_approx(x: f32) -> f32 { let y = abs(x); var a: f32; if y < 0.7010707591262915 { - a = asin(x * SIN_SCALE) * (1.0 / SIN_SCALE); + a = asin(y * SIN_SCALE) * (1.0 / SIN_SCALE); } else if y < 0.903249293595206 { let b = y - FRAC_PI_4; let u = pow(abs(b), 2. / 3.) * sign(b); @@ -460,11 +460,11 @@ fn flatten_euler( let est_err = est_euler_err(cubic_params); let chord_len = length(this_pq1.point - this_p0); let err = est_err * chord_len; - if true || err * scale <= tol || dt <= SUBDIV_LIMIT { + if err * scale <= tol || dt <= SUBDIV_LIMIT { t0_u += 1u; let shift = countTrailingZeros(t0_u); t0_u >>= shift; - dt = f32(1u << shift); + dt *= f32(1u << shift); let euler_params = es_params_from_angles(cubic_params.th0, cubic_params.th1); let es = es_seg_from_params(this_p0, this_pq1.point, euler_params); let k0 = es.params.k0 - 0.5 * es.params.k1; @@ -521,7 +521,7 @@ fn flatten_euler( } let l0 = select(lp1, lp0, offset >= 0.); let l1 = select(lp0, lp1, offset >= 0.); - output_line_with_transform(path_ix, lp0, lp1, transform); + output_line_with_transform(path_ix, l0, l1, transform); lp0 = lp1; } last_p = this_pq1.point; @@ -1051,7 +1051,7 @@ fn main( let neighbor = read_neighboring_segment(ix + 1u); let tan_prev = cubic_end_tangent(pts.p0, pts.p1, pts.p2, pts.p3); let tan_next = neighbor.tangent; - let tan_start = cubic_start_tangent(pts.p0, pts.p1, pts.p3, pts.p3); + let tan_start = cubic_start_tangent(pts.p0, pts.p1, pts.p2, pts.p3); // TODO: probably have special casing of zero tangents here let n_start = offset * normalize(vec2(-tan_start.y, tan_start.x)); let offset_tangent = offset * normalize(tan_prev); diff --git a/src/scene.rs b/src/scene.rs index 136dc8b2c..c89097e85 100644 --- a/src/scene.rs +++ b/src/scene.rs @@ -104,7 +104,7 @@ impl Scene { const SHAPE_TOLERANCE: f64 = 0.01; const STROKE_TOLERANCE: f64 = SHAPE_TOLERANCE; - const GPU_STROKES: bool = false; // Set this to `true` to enable GPU-side stroking + const GPU_STROKES: bool = true; // Set this to `true` to enable GPU-side stroking if GPU_STROKES { self.encoding .encode_transform(Transform::from_kurbo(&transform)); From 9c7535f36693151bf4116c8a4d9b92254a8b3ef5 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 12 Mar 2024 08:36:19 -0700 Subject: [PATCH 10/16] Small fixes and cleanups Make handling of small tangent consistent. Remove cubic flattening. Make subdivision limiting logic of CPU shader consistent with GPU version. Get rid of unused is_line predicate on CPU side. --- shader/flatten.wgsl | 220 +++-------------------------------- src/cpu_shader/flatten.rs | 235 +++----------------------------------- 2 files changed, 29 insertions(+), 426 deletions(-) diff --git a/shader/flatten.wgsl b/shader/flatten.wgsl index 0a34d4584..27bdcf76e 100644 --- a/shader/flatten.wgsl +++ b/shader/flatten.wgsl @@ -258,60 +258,6 @@ fn espc_int_inv_approx(x: f32) -> f32 { return a * sign(x); } -// Notes on fractional subdivision: -// -------------------------------- -// The core of the existing flattening algorithm (see `flatten_cubic` below) is to approximate the -// original cubic Bézier into a simpler curve (quadratic Bézier), subdivided to meet the error -// bound, then apply flattening to that. 
Doing this the simplest way would put a subdivision point -// in the output at each subdivision point here. That in general does not match where the -// subdivision points would go in an optimal flattening. Fractional subdivision addresses that -// problem. -// -// The return value of this function (`val`) represents this fractional subdivision count and has -// the following meaning: an optimal subdivision of the quadratic into `val / 2` subdivisions -// will have an error `sqrt_tol^2` (i.e. the desired tolerance). -// -// In the non-cusp case, the error scales as the inverse square of `val` (doubling `val` causes the -// error to be one fourth), so the tolerance is actually not needed for the calculation (and gets -// applied in the caller). In the cusp case, this scaling breaks down and the tolerance parameter -// is needed to compute the correct result. -fn estimate_subdiv(p0: vec2f, p1: vec2f, p2: vec2f, sqrt_tol: f32) -> SubdivResult { - let d01 = p1 - p0; - let d12 = p2 - p1; - let dd = d01 - d12; - let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x; - let cross_inv = select(1.0 / cross, 1.0e9, abs(cross) < 1.0e-9); - let x0 = dot(d01, dd) * cross_inv; - let x2 = dot(d12, dd) * cross_inv; - let scale = abs(cross / (length(dd) * (x2 - x0))); - - let a0 = approx_parabola_integral(x0); - let a2 = approx_parabola_integral(x2); - var val = 0.0; - if scale < 1e9 { - let da = abs(a2 - a0); - let sqrt_scale = sqrt(scale); - if sign(x0) == sign(x2) { - val = sqrt_scale; - } else { - let xmin = sqrt_tol / sqrt_scale; - val = sqrt_tol / approx_parabola_integral(xmin); - } - val *= da; - } - return SubdivResult(val, a0, a2); -} - -fn eval_quad(p0: vec2f, p1: vec2f, p2: vec2f, t: f32) -> vec2f { - let mt = 1.0 - t; - return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t; -} - -fn eval_cubic(p0: vec2f, p1: vec2f, p2: vec2f, p3: vec2f, t: f32) -> vec2f { - let mt = 1.0 - t; - return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t; -} - struct PointDeriv { point: vec2f, deriv: vec2f, @@ -327,17 +273,6 @@ fn eval_cubic_and_deriv(p0: vec2f, p1: vec2f, p2: vec2f, p3: vec2f, t: f32) -> P return PointDeriv(p, q); } -fn eval_quad_tangent(p0: vec2f, p1: vec2f, p2: vec2f, t: f32) -> vec2f { - let dp0 = 2. * (p1 - p0); - let dp1 = 2. * (p2 - p1); - return mix(dp0, dp1, t); -} - -fn eval_quad_normal(p0: vec2f, p1: vec2f, p2: vec2f, t: f32) -> vec2f { - let tangent = normalize(eval_quad_tangent(p0, p1, p2, t)); - return vec2(-tangent.y, tangent.x); -} - fn cubic_start_tangent(p0: vec2f, p1: vec2f, p2: vec2f, p3: vec2f) -> vec2f { let EPS = 1e-12; let d01 = p1 - p0; @@ -378,6 +313,8 @@ const SUBDIV_LIMIT: f32 = 1.0 / 65536.0; const K1_THRESH: f32 = 1e-3; // Robust ESPC: below this value, evaluate ES rather than parallel curve const DIST_THRESH: f32 = 1e-3; +// Threshold for tangents to be considered near zero length +const TANGENT_THRESH: f32 = 1e-6; // This function flattens a cubic Bézier by first converting it into Euler spiral // segments, and then computes a near-optimal flattening of the parallel curves of @@ -535,143 +472,6 @@ fn flatten_euler( } } -let MAX_QUADS = 16u; - -// This function flattens a cubic Bézier by first converting it into quadratics and -// approximates the optimal flattening of those using a variation of the method described in -// https://raphlinus.github.io/graphics/curves/2019/12/23/flatten-quadbez.html. -// -// When the `offset` parameter is zero (i.e. 
the path is a "fill"), the flattening is performed -// directly on the transformed (device-space) control points as this produces near-optimal -// flattening even in the presence of a non-angle-preserving transform. -// -// When the `offset` is non-zero, the flattening is performed in the curve's local coordinate space -// and the offset curve gets transformed to device-space post-flattening. This handles -// non-angle-preserving transforms well while keeping the logic simple. -// -// When subdividing the cubic in its local coordinate space, the scale factor gets decomposed out of -// the local-to-device transform and gets factored into the tolerance threshold when estimating -// subdivisions. -fn flatten_cubic(cubic: CubicPoints, path_ix: u32, local_to_device: Transform, offset: f32) { - var p0: vec2f; - var p1: vec2f; - var p2: vec2f; - var p3: vec2f; - var scale: f32; - var transform: Transform; - if offset == 0. { - let t = local_to_device; - p0 = transform_apply(t, cubic.p0); - p1 = transform_apply(t, cubic.p1); - p2 = transform_apply(t, cubic.p2); - p3 = transform_apply(t, cubic.p3); - scale = 1.; - transform = transform_identity(); - } else { - p0 = cubic.p0; - p1 = cubic.p1; - p2 = cubic.p2; - p3 = cubic.p3; - - transform = local_to_device; - let mat = transform.mat; - scale = 0.5 * length(vec2(mat.x + mat.w, mat.y - mat.z)) + - length(vec2(mat.x - mat.w, mat.y + mat.z)); - } - - let err_v = 3.0 * (p2 - p1) + p0 - p3; - let err = dot(err_v, err_v); - let ACCURACY = 0.25; - let Q_ACCURACY = ACCURACY * 0.1; - let REM_ACCURACY = ACCURACY - Q_ACCURACY; - let MAX_HYPOT2 = 432.0 * Q_ACCURACY * Q_ACCURACY; - let scaled_sqrt_tol = sqrt(REM_ACCURACY / scale); - // Fudge the subdivision count metric to account for `scale` when the subdivision is done in local - // coordinates. - var n_quads = max(u32(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0)) * scale), 1u); - n_quads = min(n_quads, MAX_QUADS); - - var keep_params: array; - var val = 0.0; - var qp0 = p0; - let step = 1.0 / f32(n_quads); - for (var i = 0u; i < n_quads; i += 1u) { - let t = f32(i + 1u) * step; - let qp2 = eval_cubic(p0, p1, p2, p3, t); - var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); - qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2); - - // TODO: Estimate an accurate subdivision count for strokes - let params = estimate_subdiv(qp0, qp1, qp2, scaled_sqrt_tol); - keep_params[i] = params; - val += params.val; - qp0 = qp2; - } - - // Normal vector to calculate the start point of the offset curve. 
- var n0 = offset * cubic_start_normal(p0, p1, p2, p3); - - let n = max(u32(ceil(val * (0.5 / scaled_sqrt_tol))), 1u); - var lp0 = p0; - qp0 = p0; - let v_step = val / f32(n); - var n_out = 1u; - var val_sum = 0.0; - for (var i = 0u; i < n_quads; i += 1u) { - let t = f32(i + 1u) * step; - let qp2 = eval_cubic(p0, p1, p2, p3, t); - var qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); - qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2); - let params = keep_params[i]; - let u0 = approx_parabola_inv_integral(params.a0); - let u2 = approx_parabola_inv_integral(params.a2); - let uscale = 1.0 / (u2 - u0); - var val_target = f32(n_out) * v_step; - while n_out == n || val_target < val_sum + params.val { - var lp1: vec2f; - var t1: f32; - if n_out == n { - lp1 = p3; - t1 = 1.; - } else { - let u = (val_target - val_sum) / params.val; - let a = mix(params.a0, params.a2, u); - let au = approx_parabola_inv_integral(a); - let t = (au - u0) * uscale; - t1 = t; - lp1 = eval_quad(qp0, qp1, qp2, t); - } - - // TODO: Instead of outputting two offset segments here, restructure this function as - // "flatten_cubic_at_offset" such that it outputs one cubic at an offset. That should - // more closely resemble the end state of this shader which will work like a state - // machine. - if offset > 0. { - var n1: vec2f; - if all(lp1 == p3) { - n1 = cubic_end_normal(p0, p1, p2, p3); - } else { - n1 = eval_quad_normal(qp0, qp1, qp2, t1); - } - n1 *= offset; - output_two_lines_with_transform(path_ix, - lp0 + n0, lp1 + n1, - lp1 - n1, lp0 - n0, - transform); - n0 = n1; - } else { - // Output line segment lp0..lp1 - output_line_with_transform(path_ix, lp0, lp1, transform); - } - n_out += 1u; - val_target += v_step; - lp0 = lp1; - } - val_sum += params.val; - qp0 = qp2; - } -} - // Flattens the circular arc that subtends the angle begin-center-end. It is assumed that // ||begin - center|| == ||end - center||. `begin`, `end`, and `center` are defined in the path's // local coordinate space. @@ -1049,10 +849,18 @@ fn main( } else { // Read the neighboring segment. let neighbor = read_neighboring_segment(ix + 1u); - let tan_prev = cubic_end_tangent(pts.p0, pts.p1, pts.p2, pts.p3); - let tan_next = neighbor.tangent; - let tan_start = cubic_start_tangent(pts.p0, pts.p1, pts.p2, pts.p3); - // TODO: probably have special casing of zero tangents here + var tan_start = cubic_start_tangent(pts.p0, pts.p1, pts.p2, pts.p3); + if dot(tan_start, tan_start) < TANGENT_THRESH * TANGENT_THRESH { + tan_start = vec2(TANGENT_THRESH, 0.); + } + var tan_prev = cubic_end_tangent(pts.p0, pts.p1, pts.p2, pts.p3); + if dot(tan_prev, tan_prev) < TANGENT_THRESH * TANGENT_THRESH { + tan_prev = vec2(TANGENT_THRESH, 0.); + } + var tan_next = neighbor.tangent; + if dot(tan_next, tan_next) < TANGENT_THRESH * TANGENT_THRESH { + tan_next = vec2(TANGENT_THRESH, 0.); + } let n_start = offset * normalize(vec2(-tan_start.y, tan_start.x)); let offset_tangent = offset * normalize(tan_prev); let n_prev = offset_tangent.yx * vec2f(-1., 1.); diff --git a/src/cpu_shader/flatten.rs b/src/cpu_shader/flatten.rs index 86f4cb4a3..757720671 100644 --- a/src/cpu_shader/flatten.rs +++ b/src/cpu_shader/flatten.rs @@ -21,72 +21,12 @@ macro_rules! 
log { }}; } -fn to_minus_one_quarter(x: f32) -> f32 { - // could also be written x.powf(-0.25) - x.sqrt().sqrt().recip() -} - -const D: f32 = 0.67; -fn approx_parabola_integral(x: f32) -> f32 { - x * to_minus_one_quarter(1.0 - D + (D * D * D * D + 0.25 * x * x)) -} - -const B: f32 = 0.39; -fn approx_parabola_inv_integral(x: f32) -> f32 { - x * (1.0 - B + (B * B + 0.5 * x * x)).sqrt() -} - -#[derive(Clone, Copy, Default)] -struct SubdivResult { - val: f32, - a0: f32, - a2: f32, -} - /// Threshold below which a derivative is considered too small. const DERIV_THRESH: f32 = 1e-6; /// Amount to nudge t when derivative is near-zero. const DERIV_EPS: f32 = 1e-6; - -fn estimate_subdiv(p0: Vec2, p1: Vec2, p2: Vec2, sqrt_tol: f32) -> SubdivResult { - let d01 = p1 - p0; - let d12 = p2 - p1; - let dd = d01 - d12; - let cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x; - let cross_inv = if cross.abs() < 1.0e-9 { - 1.0e9 - } else { - cross.recip() - }; - let x0 = d01.dot(dd) * cross_inv; - let x2 = d12.dot(dd) * cross_inv; - let scale = (cross / (dd.length() * (x2 - x0))).abs(); - let a0 = approx_parabola_integral(x0); - let a2 = approx_parabola_integral(x2); - let mut val = 0.0; - if scale < 1e9 { - let da = (a2 - a0).abs(); - let sqrt_scale = scale.sqrt(); - if x0.signum() == x2.signum() { - val = sqrt_scale; - } else { - let xmin = sqrt_tol / sqrt_scale; - val = sqrt_tol / approx_parabola_integral(xmin); - } - val *= da; - } - SubdivResult { val, a0, a2 } -} - -fn eval_quad(p0: Vec2, p1: Vec2, p2: Vec2, t: f32) -> Vec2 { - let mt = 1.0 - t; - p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t -} - -fn eval_cubic(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2, t: f32) -> Vec2 { - let mt = 1.0 - t; - p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t -} +// Limit for subdivision of cubic Béziers. +const SUBDIV_LIMIT: f32 = 1.0 / 65536.0; /// Evaluate both the point and derivative of a cubic bezier. fn eval_cubic_and_deriv(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2, t: f32) -> (Vec2, Vec2) { @@ -99,17 +39,6 @@ fn eval_cubic_and_deriv(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2, t: f32) -> (Vec2 (p, q) } -fn eval_quad_tangent(p0: Vec2, p1: Vec2, p2: Vec2, t: f32) -> Vec2 { - let dp0 = 2. * (p1 - p0); - let dp1 = 2. * (p2 - p1); - dp0.mix(dp1, t) -} - -fn eval_quad_normal(p0: Vec2, p1: Vec2, p2: Vec2, t: f32) -> Vec2 { - let tangent = eval_quad_tangent(p0, p1, p2, t).normalize(); - Vec2::new(-tangent.y, tangent.x) -} - fn cubic_start_tangent(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2) -> Vec2 { let d01 = p1 - p0; let d02 = p2 - p0; @@ -136,20 +65,6 @@ fn cubic_end_tangent(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2) -> Vec2 { } } -fn cubic_start_normal(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2) -> Vec2 { - let tangent = cubic_start_tangent(p0, p1, p2, p3).normalize(); - Vec2::new(-tangent.y, tangent.x) -} - -fn cubic_end_normal(p0: Vec2, p1: Vec2, p2: Vec2, p3: Vec2) -> Vec2 { - let tangent = cubic_end_tangent(p0, p1, p2, p3).normalize(); - Vec2::new(-tangent.y, tangent.x) -} - -fn check_colinear(p0: Vec2, p1: Vec2, p2: Vec2) -> bool { - (p1 - p0).cross(p2 - p0).abs() < 1e-9 -} - fn write_line( line_ix: usize, path_ix: u32, @@ -234,121 +149,6 @@ fn output_two_lines_with_transform( *line_ix += 2; } -const MAX_QUADS: u32 = 16; - -fn flatten_cubic( - cubic: &CubicPoints, - path_ix: u32, - local_to_device: &Transform, - offset: f32, - line_ix: &mut usize, - lines: &mut [LineSoup], - bbox: &mut IntBbox, -) { - let (p0, p1, p2, p3, scale, transform) = if offset == 0. 
{ - ( - local_to_device.apply(cubic.p0), - local_to_device.apply(cubic.p1), - local_to_device.apply(cubic.p2), - local_to_device.apply(cubic.p3), - 1., - Transform::identity(), - ) - } else { - let t = local_to_device.0; - let scale = 0.5 * Vec2::new(t[0] + t[3], t[1] - t[2]).length() - + Vec2::new(t[0] - t[3], t[1] + t[2]).length(); - ( - cubic.p0, - cubic.p1, - cubic.p2, - cubic.p3, - scale, - local_to_device.clone(), - ) - }; - let err_v = (p2 - p1) * 3.0 + p0 - p3; - let err = err_v.dot(err_v); - const ACCURACY: f32 = 0.25; - const Q_ACCURACY: f32 = ACCURACY * 0.1; - const REM_ACCURACY: f32 = ACCURACY - Q_ACCURACY; - const MAX_HYPOT2: f32 = 432.0 * Q_ACCURACY * Q_ACCURACY; - let scaled_sqrt_tol = (REM_ACCURACY / scale).sqrt(); - let mut n_quads = (((err * (1.0 / MAX_HYPOT2)).powf(1.0 / 6.0).ceil() * scale) as u32).max(1); - n_quads = n_quads.min(MAX_QUADS); - - let mut keep_params = [SubdivResult::default(); MAX_QUADS as usize]; - let mut val = 0.0; - let mut qp0 = p0; - let step = (n_quads as f32).recip(); - for i in 0..n_quads { - let t = (i + 1) as f32 * step; - let qp2 = eval_cubic(p0, p1, p2, p3, t); - let mut qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); - qp1 = qp1 * 2.0 - (qp0 + qp2) * 0.5; - let params = estimate_subdiv(qp0, qp1, qp2, scaled_sqrt_tol); - keep_params[i as usize] = params; - val += params.val; - qp0 = qp2; - } - - let mut n0 = offset * cubic_start_normal(p0, p1, p2, p3); - let n = ((val * (0.5 / scaled_sqrt_tol)).ceil() as u32).max(1); - let mut lp0 = p0; - qp0 = p0; - let v_step = val / (n as f32); - let mut n_out = 1; - let mut val_sum = 0.0; - for i in 0..n_quads { - let t = (i + 1) as f32 * step; - let qp2 = eval_cubic(p0, p1, p2, p3, t); - let mut qp1 = eval_cubic(p0, p1, p2, p3, t - 0.5 * step); - qp1 = qp1 * 2.0 - (qp0 + qp2) * 0.5; - let params = keep_params[i as usize]; - let u0 = approx_parabola_inv_integral(params.a0); - let u2 = approx_parabola_inv_integral(params.a2); - let uscale = (u2 - u0).recip(); - let mut val_target = (n_out as f32) * v_step; - while n_out == n || val_target < val_sum + params.val { - let (lp1, t1) = if n_out == n { - (p3, 1.) - } else { - let u = (val_target - val_sum) / params.val; - let a = params.a0 + (params.a2 - params.a0) * u; - let au = approx_parabola_inv_integral(a); - let t = (au - u0) * uscale; - (eval_quad(qp0, qp1, qp2, t), t) - }; - if offset > 0. { - let n1 = if lp1 == p3 { - cubic_end_normal(p0, p1, p2, p3) - } else { - eval_quad_normal(qp0, qp1, qp2, t1) - } * offset; - output_two_lines_with_transform( - path_ix, - lp0 + n0, - lp1 + n1, - lp1 - n1, - lp0 - n0, - &transform, - line_ix, - lines, - bbox, - ); - n0 = n1; - } else { - output_line_with_transform(path_ix, lp0, lp1, &transform, line_ix, lines, bbox); - } - n_out += 1; - val_target += v_step; - lp0 = lp1; - } - val_sum += params.val; - qp0 = qp2; - } -} - fn flatten_arc( path_ix: u32, begin: Vec2, @@ -403,7 +203,6 @@ fn flatten_euler( path_ix: u32, local_to_device: &Transform, offset: f32, - is_line: bool, start_p: Vec2, end_p: Vec2, line_ix: &mut usize, @@ -481,10 +280,6 @@ fn flatten_euler( let mut lp0 = t_start; loop { - // TODO: this isn't right, it drops nasty lines; we should just limit subdivision - if dt < ROBUST_EPSILON { - break; - } let t0 = (t0_u as f32) * dt; if t0 == 1. 
{ break; @@ -510,7 +305,7 @@ fn flatten_euler( let chord_len = (this_p1 - this_p0).length(); let err = est_err * chord_len; log!("@@@ loop2: sub:{:?}, {:?} t0: {t0}, t1: {t1}, dt: {dt}, est_err: {est_err}, err: {err}", subcubic, cubic_params); - if err * scale <= tol { + if err * scale <= tol || dt <= SUBDIV_LIMIT { log!("@@@ error within tolerance"); t0_u += 1; let shift = t0_u.trailing_zeros(); @@ -521,7 +316,6 @@ fn flatten_euler( let (k0, k1) = (es.params.k0 - 0.5 * es.params.k1, es.params.k1); - // TODO (raph, important): figure out how scale applies. // compute forward integral to determine number of subdivisions let dist_scaled = offset * es.params.ch / chord_len; // The number of subdivisions for curvature = 1 @@ -886,6 +680,9 @@ const PATH_TAG_QUADTO: u8 = 2; const PATH_TAG_CUBICTO: u8 = 3; const PATH_TAG_F32: u8 = 8; +// Threshold for tangents to be considered near zero length +const TANGENT_THRESH: f32 = 1e-6; + fn flatten_main( n_wg: u32, config: &ConfigUniform, @@ -951,8 +748,6 @@ fn flatten_main( // Don't draw anything if the path is closed. } } else { - let is_line = seg_type == PATH_TAG_LINETO; - // Read the neighboring segment. let neighbor = read_neighboring_segment(ix + 1, pathtags, pathdata, tag_monoids); @@ -965,13 +760,18 @@ fn flatten_main( // TODO: not all zero-length segments are getting filtered out // TODO: this is a hack. How to handle caps on degenerate stroke? // TODO: debug tricky stroke by isolation - let tan_prev = if tan_prev.length_squared() < ROBUST_EPSILON { - Vec2::new(0.01, 0.) + let tan_start = if tan_start.length_squared() < TANGENT_THRESH.powi(2) { + Vec2::new(TANGENT_THRESH, 0.) + } else { + tan_start + }; + let tan_prev = if tan_prev.length_squared() < TANGENT_THRESH.powi(2) { + Vec2::new(TANGENT_THRESH, 0.) } else { tan_prev }; - let tan_next = if tan_next.length_squared() < ROBUST_EPSILON { - Vec2::new(0.01, 0.) + let tan_next = if tan_next.length_squared() < TANGENT_THRESH.powi(2) { + Vec2::new(TANGENT_THRESH, 0.) } else { tan_next }; @@ -990,7 +790,6 @@ fn flatten_main( path_ix, &transform, offset, - is_line, pts.p0 + n_start, pts.p3 + n_prev, &mut line_ix, @@ -1002,7 +801,6 @@ fn flatten_main( path_ix, &transform, -offset, - is_line, pts.p0 - n_start, pts.p3 - n_prev, &mut line_ix, @@ -1041,14 +839,11 @@ fn flatten_main( } } } else { - // TODO: determine if we still need this; we shouldn't for correctness - let is_line = seg_type == PATH_TAG_LINETO; flatten_euler( &pts, path_ix, &transform, /*offset*/ 0., - is_line, pts.p0, pts.p3, &mut line_ix, From f4a843481a6ea6c46ecf3b40bba419b3b826ec23 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 12 Mar 2024 12:41:48 -0700 Subject: [PATCH 11/16] Clippy fixes --- src/cpu_shader/euler.rs | 3 +++ src/cpu_shader/flatten.rs | 11 +++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/cpu_shader/euler.rs b/src/cpu_shader/euler.rs index ca3734f15..cacd547cb 100644 --- a/src/cpu_shader/euler.rs +++ b/src/cpu_shader/euler.rs @@ -3,6 +3,9 @@ //! Utility functions for Euler Spiral based stroke expansion. +// Use the same constants as the f64 version. +#![allow(clippy::excessive_precision)] + use super::util::Vec2; use std::f32::consts::FRAC_PI_4; diff --git a/src/cpu_shader/flatten.rs b/src/cpu_shader/flatten.rs index 757720671..6bd93a38d 100644 --- a/src/cpu_shader/flatten.rs +++ b/src/cpu_shader/flatten.rs @@ -295,7 +295,7 @@ fn flatten_euler( this_q1 = new_q1; if t1 < 1. 
{ this_p1 = new_p1; - t1 = t1 - DERIV_EPS; + t1 -= DERIV_EPS; } } let actual_dt = t1 - last_t; @@ -365,9 +365,8 @@ fn flatten_euler( log!("@@@ NaN: parameters:\n es: {:#?}\n k0: {k0}, k1: {k1}\n dist_scaled: {dist_scaled}\n es_scale: {es_scale}\n a: {a}\n b: {b}\n int0: {int0}, int1: {int1}, integral: {integral}\n k_peak: {k_peak}\n integrand_peak: {integrand_peak}\n scaled_int: {scaled_int}\n n_frac: {n_frac}", es); } else { for i in 0..n as usize { - let lp1; - if i == n as usize - 1 && t1 == 1.0 { - lp1 = t_end; + let lp1 = if i == n as usize - 1 && t1 == 1.0 { + t_end } else { let t = (i + 1) as f32 / n; let s = match robust { @@ -383,8 +382,8 @@ fn flatten_euler( (inv - b) / a } }; - lp1 = es.eval_with_offset(s, offset); - } + es.eval_with_offset(s, offset) + }; let l0 = if offset >= 0. { lp0 } else { lp1 }; let l1 = if offset >= 0. { lp1 } else { lp0 }; output_line_with_transform( From f1faf9b7acef18b6374afb5c1a2e1ba27d6d78f5 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 12 Mar 2024 12:48:45 -0700 Subject: [PATCH 12/16] Fix copyright headers (and I also added this to my local pre-push git hook) --- src/cpu_shader/euler.rs | 2 +- src/cpu_shader/flatten.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpu_shader/euler.rs b/src/cpu_shader/euler.rs index cacd547cb..e2681de6a 100644 --- a/src/cpu_shader/euler.rs +++ b/src/cpu_shader/euler.rs @@ -1,4 +1,4 @@ -// Copyright 2023 The Vello Authors +// Copyright 2023 the Vello Authors // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense //! Utility functions for Euler Spiral based stroke expansion. diff --git a/src/cpu_shader/flatten.rs b/src/cpu_shader/flatten.rs index 6bd93a38d..434ec92e5 100644 --- a/src/cpu_shader/flatten.rs +++ b/src/cpu_shader/flatten.rs @@ -1,4 +1,4 @@ -// Copyright 2023 The Vello Authors +// Copyright 2023 the Vello Authors // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense use std::f32::consts::FRAC_1_SQRT_2; From 48a402029c1abf42164ebbf61d0c48975a2112d3 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 12 Mar 2024 17:02:03 -0700 Subject: [PATCH 13/16] Review suggestions Mostly doc improvements, but also improvements to robustness for short chords. --- shader/flatten.wgsl | 149 ++++++++++++----------- src/cpu_shader/euler.rs | 36 +++++- src/cpu_shader/flatten.rs | 246 ++++++++++++++++++-------------------- 3 files changed, 224 insertions(+), 207 deletions(-) diff --git a/shader/flatten.wgsl b/shader/flatten.wgsl index 27bdcf76e..52ec79fc1 100644 --- a/shader/flatten.wgsl +++ b/shader/flatten.wgsl @@ -305,6 +305,7 @@ const ESPC_ROBUST_LOW_DIST = 2; // Threshold below which a derivative is considered too small. const DERIV_THRESH: f32 = 1e-6; +const DERIV_THRESH_SQUARED: f32 = DERIV_THRESH * DERIV_THRESH; // Amount to nudge t when derivative is near-zero. const DERIV_EPS: f32 = 1e-6; // Limit for subdivision of cubic Béziers. @@ -357,19 +358,18 @@ fn flatten_euler( length(vec2(mat.x - mat.w, mat.y + mat.z)); } - // Drop zero length lines. + // Drop zero length lines. This is an exact equality test because dropping very short + // line segments may result in loss of watertightness. 
if all(p0 == p1) && all(p0 == p2) && all(p0 == p3) { return; } - //output_line_with_transform(path_ix, t_start, t_end, transform); - //if true { return; } let tol = 0.25; var t0_u = 0u; var dt = 1.0; var last_p = p0; var last_q = p1 - p0; - if dot(last_q, last_q) < DERIV_THRESH * DERIV_THRESH { + if dot(last_q, last_q) < DERIV_THRESH_SQUARED { last_q = eval_cubic_and_deriv(p0, p1, p2, p3, DERIV_EPS).deriv; } var last_t = 0.0; @@ -384,7 +384,7 @@ fn flatten_euler( let this_p0 = last_p; let this_q0 = last_q; var this_pq1 = eval_cubic_and_deriv(p0, p1, p2, p3, t1); - if dot(this_pq1.deriv, this_pq1.deriv) < DERIV_THRESH * DERIV_THRESH { + if dot(this_pq1.deriv, this_pq1.deriv) < DERIV_THRESH_SQUARED { let new_pq1 = eval_cubic_and_deriv(p0, p1, p2, p3, t1 - DERIV_EPS); this_pq1.deriv = new_pq1.deriv; if t1 < 1.0 { @@ -393,78 +393,85 @@ fn flatten_euler( } } let actual_dt = t1 - last_t; - let cubic_params = cubic_from_points_derivs(this_p0, this_pq1.point, this_q0, this_pq1.deriv, actual_dt); - let est_err = est_euler_err(cubic_params); let chord_len = length(this_pq1.point - this_p0); - let err = est_err * chord_len; - if err * scale <= tol || dt <= SUBDIV_LIMIT { - t0_u += 1u; - let shift = countTrailingZeros(t0_u); - t0_u >>= shift; - dt *= f32(1u << shift); - let euler_params = es_params_from_angles(cubic_params.th0, cubic_params.th1); - let es = es_seg_from_params(this_p0, this_pq1.point, euler_params); - let k0 = es.params.k0 - 0.5 * es.params.k1; - let k1 = es.params.k1; - let dist_scaled = offset * es.params.ch / chord_len; - let scale_multiplier = sqrt(0.125 * scale * chord_len / (es.params.ch * tol)); - var a = 0.0; - var b = 0.0; - var integral = 0.0; - var int0 = 0.0; - var n_frac: f32; - var robust = ESPC_ROBUST_NORMAL; - if abs(k1) < K1_THRESH { - let k = es.params.k0; - n_frac = sqrt(abs(k * (k * dist_scaled + 1.0))); - robust = ESPC_ROBUST_LOW_K1; - } else if abs(dist_scaled) < DIST_THRESH { - a = k1; - b = k0; - int0 = pow_1_5_signed(b); - let int1 = pow_1_5_signed(a + b); - integral = int1 - int0; - n_frac = (2. / 3.) * integral / a; - robust = ESPC_ROBUST_LOW_DIST; - } else { - a = -2.0 * dist_scaled * k1; - b = -1.0 - 2.0 * dist_scaled * k0; - int0 = espc_int_approx(b); - let int1 = espc_int_approx(a + b); - integral = int1 - int0; - let k_peak = k0 - k1 * b / a; - let integrand_peak = sqrt(abs(k_peak * (k_peak * dist_scaled + 1.0))); - n_frac = integral * integrand_peak / a; - } - let n = max(ceil(n_frac * scale_multiplier), 1.0); - for (var i = 0u; i < u32(n); i++) { - var lp1: vec2f; - if i + 1u == u32(n) && t1 == 1.0 { - lp1 = t_end; + // Subdivide the loop case when the chord is short, but don't subdivide when it is + // simply a very short segment. 
+ if chord_len >= TANGENT_THRESH + || (dot(this_q0, this_q0) * actual_dt * actual_dt < DERIV_THRESH_SQUARED + && dot(this_pq1.deriv, this_pq1.deriv) * actual_dt * actual_dt < DERIV_THRESH_SQUARED) + { + let cubic_params = cubic_from_points_derivs(this_p0, this_pq1.point, this_q0, this_pq1.deriv, actual_dt); + let est_err = est_euler_err(cubic_params); + let err = est_err * chord_len; + if err * scale <= tol || dt <= SUBDIV_LIMIT { + t0_u += 1u; + let shift = countTrailingZeros(t0_u); + t0_u >>= shift; + dt *= f32(1u << shift); + let euler_params = es_params_from_angles(cubic_params.th0, cubic_params.th1); + let es = es_seg_from_params(this_p0, this_pq1.point, euler_params); + let k0 = es.params.k0 - 0.5 * es.params.k1; + let k1 = es.params.k1; + let dist_scaled = offset * es.params.ch / chord_len; + let scale_multiplier = sqrt(0.125 * scale * chord_len / (es.params.ch * tol)); + var a = 0.0; + var b = 0.0; + var integral = 0.0; + var int0 = 0.0; + var n_frac: f32; + var robust = ESPC_ROBUST_NORMAL; + if abs(k1) < K1_THRESH { + let k = es.params.k0; + n_frac = sqrt(abs(k * (k * dist_scaled + 1.0))); + robust = ESPC_ROBUST_LOW_K1; + } else if abs(dist_scaled) < DIST_THRESH { + a = k1; + b = k0; + int0 = pow_1_5_signed(b); + let int1 = pow_1_5_signed(a + b); + integral = int1 - int0; + n_frac = (2. / 3.) * integral / a; + robust = ESPC_ROBUST_LOW_DIST; } else { - let t = f32(i + 1u) / n; - var s = t; - if robust != ESPC_ROBUST_LOW_K1 { - let u = integral * t + int0; - var inv: f32; - if robust == ESPC_ROBUST_LOW_DIST { - inv = pow(abs(u), 2. / 3.) * sign(u); - } else { - inv = espc_int_inv_approx(u); + a = -2.0 * dist_scaled * k1; + b = -1.0 - 2.0 * dist_scaled * k0; + int0 = espc_int_approx(b); + let int1 = espc_int_approx(a + b); + integral = int1 - int0; + let k_peak = k0 - k1 * b / a; + let integrand_peak = sqrt(abs(k_peak * (k_peak * dist_scaled + 1.0))); + n_frac = integral * integrand_peak / a; + } + let n = max(ceil(n_frac * scale_multiplier), 1.0); + for (var i = 0u; i < u32(n); i++) { + var lp1: vec2f; + if i + 1u == u32(n) && t1 == 1.0 { + lp1 = t_end; + } else { + let t = f32(i + 1u) / n; + var s = t; + if robust != ESPC_ROBUST_LOW_K1 { + let u = integral * t + int0; + var inv: f32; + if robust == ESPC_ROBUST_LOW_DIST { + inv = pow(abs(u), 2. / 3.) * sign(u); + } else { + inv = espc_int_inv_approx(u); + } + s = (inv - b) / a; } - s = (inv - b) / a; + lp1 = es_seg_eval_with_offset(es, s, offset); } - lp1 = es_seg_eval_with_offset(es, s, offset); + let l0 = select(lp1, lp0, offset >= 0.); + let l1 = select(lp0, lp1, offset >= 0.); + output_line_with_transform(path_ix, l0, l1, transform); + lp0 = lp1; } - let l0 = select(lp1, lp0, offset >= 0.); - let l1 = select(lp0, lp1, offset >= 0.); - output_line_with_transform(path_ix, l0, l1, transform); - lp0 = lp1; + last_p = this_pq1.point; + last_q = this_pq1.deriv; + last_t = t1; + break; } - last_p = this_pq1.point; - last_q = this_pq1.deriv; - last_t = t1; - break; } t0_u = t0_u * 2u; dt *= 0.5; diff --git a/src/cpu_shader/euler.rs b/src/cpu_shader/euler.rs index e2681de6a..08c8eddcf 100644 --- a/src/cpu_shader/euler.rs +++ b/src/cpu_shader/euler.rs @@ -9,11 +9,24 @@ use super::util::Vec2; use std::f32::consts::FRAC_PI_4; +/// This struct contains parameters derived from a cubic Bézier for the +/// purpose of fitting a G1 continuous Euler spiral segment and estimating +/// the Fréchet distance. 
+/// +/// The tangent angles represent deviation from the chord, so that when they +/// are equal, the corresponding Euler spiral is a circular arc. +/// +/// Similar, the control point distances are normalized to a chord, so that +/// for small angles values near 1/3 represent a smooth curve. #[derive(Debug)] pub struct CubicParams { + /// Tangent angle relative to chord at start. pub th0: f32, + /// Tangent angle relative to chord at end. pub th1: f32, + /// Distance of first control point. pub d0: f32, + /// Distance of second control point. pub d1: f32, } @@ -35,11 +48,14 @@ pub struct EulerSeg { impl CubicParams { /// Compute parameters from endpoints and derivatives. + /// + /// Robustness note: this function must be protected from being called when the + /// chord is near zero. pub fn from_points_derivs(p0: Vec2, p1: Vec2, q0: Vec2, q1: Vec2, dt: f32) -> Self { let chord = p1 - p0; - // Robustness note: we must protect this function from being called when the - // chord length is (near-)zero. - let scale = dt / chord.length_squared(); + let length_squared = chord.length_squared(); + assert_ne!(length_squared, 0.0); + let scale = dt / length_squared; let h0 = Vec2::new( q0.x * chord.x + q0.y * chord.y, q0.y * chord.x - q0.x * chord.y, @@ -193,8 +209,18 @@ impl EulerSeg { /// Integrate Euler spiral. /// -/// TODO: investigate needed accuracy. We might be able to get away -/// with 8th order. +/// This is a 10th order polynomial. The evaluation method is explained in +/// Raph's thesis in section 8.1.2. +/// +/// This choice of polynomial is fairly conservative, as it will produce +/// very good accuracy for angles up to about 1 radian, and those angles +/// should almost never happen (the exception being cusps). One possibility +/// to explore is using a lower degree polynomial here, but changing the +/// tuning parameters for subdivision so the additional error here is also +/// factored into the error threshold. However, two cautions against that: +/// First, it doesn't really address the cusp case, where angles will remain +/// large even after further subdivision, and second, the cost of even this +/// more conservative choice is modest; it's just some multiply-adds. fn integ_euler_10(k0: f32, k1: f32) -> (f32, f32) { let t1_1 = k0; let t1_2 = 0.5 * k1; diff --git a/src/cpu_shader/flatten.rs b/src/cpu_shader/flatten.rs index 434ec92e5..aa2a77a85 100644 --- a/src/cpu_shader/flatten.rs +++ b/src/cpu_shader/flatten.rs @@ -160,22 +160,18 @@ fn flatten_arc( lines: &mut [LineSoup], bbox: &mut IntBbox, ) { + const MIN_THETA: f32 = 0.0001; + let mut p0 = transform.apply(begin); let mut r = begin - center; - let tol: f32 = 0.5; + let tol: f32 = 0.1; let radius = tol.max((p0 - transform.apply(center)).length()); - let x = 1. - tol / radius; - let theta = (2. * x * x - 1.).clamp(-1., 1.).acos(); - const MAX_LINES: u32 = 1000; - let n_lines = if theta <= ROBUST_EPSILON { - MAX_LINES - } else { - MAX_LINES.min((std::f32::consts::TAU / theta).ceil() as u32) - }; + let theta = (2. * (1. - tol / radius).acos()).max(MIN_THETA); + + // Always output at least one line so that we always draw the chord. 
+ let n_lines = ((angle / theta).ceil() as u32).max(1); - let th = angle / (n_lines as f32); - let c = th.cos(); - let s = th.sin(); + let (s, c) = theta.sin_cos(); let rot = Transform([c, -s, s, c, 0., 0.]); for _ in 0..(n_lines - 1) { @@ -209,7 +205,6 @@ fn flatten_euler( lines: &mut [LineSoup], bbox: &mut IntBbox, ) { - log!("@@@ flatten_euler: {:#?}", cubic); // Flatten in local coordinates if this is a stroke. Flatten in device space otherwise. let (p0, p1, p2, p3, scale, transform) = if offset == 0. { ( @@ -239,40 +234,20 @@ fn flatten_euler( (start_p, end_p) }; - // Drop zero length lines. - // TODO: in some stroke cases we should keep these. Should it be handled while - // encoding or here? + // Drop zero length lines. This is an exact equality test because dropping very short + // line segments may result in loss of watertightness. The parallel curves of zero + // length lines add nothing to stroke outlines, but we still may need to draw caps. if p0 == p1 && p0 == p2 && p0 == p3 { return; } - /* - // Special-case lines. - // We still have to handle colinear cubic parameters. We are special casing the line-to - // encoding because floating point errors in the degree raise can cause some line-tos to slip - // through the epsilon threshold in check_colinear. - //if check_colinear(p0, p1, p3) && check_colinear(p0, p2, p3) { - if is_line { - let tan = p3 - p0; - if tan.length() > ROBUST_EPSILON { - let lp0 = t_start; - let lp1 = t_end; - let l0 = if offset >= 0. { lp0 } else { lp1 }; - let l1 = if offset >= 0. { lp1 } else { lp0 }; - log!("@@@ output line: {:?}, {:?}, tan: {:?}", l0, l1, tan); - output_line_with_transform(path_ix, l0, l1, &transform, line_ix, lines, bbox); - } else { - log!("@@@ drop line: {:?}, {:?}, tan: {:?}", p0, p3, tan); - } - return; - } - */ - let tol: f32 = 0.25; let mut t0_u: u32 = 0; let mut dt: f32 = 1.; let mut last_p = p0; let mut last_q = p1 - p0; + // We want to avoid near zero derivatives, so the general technique is to + // detect, then sample a nearby t value if it fails to meet the threshold. if last_q.length_squared() < DERIV_THRESH.powi(2) { last_q = eval_cubic_and_deriv(p0, p1, p2, p3, DERIV_EPS).1; } @@ -293,109 +268,118 @@ fn flatten_euler( if this_q1.length_squared() < DERIV_THRESH.powi(2) { let (new_p1, new_q1) = eval_cubic_and_deriv(p0, p1, p2, p3, t1 - DERIV_EPS); this_q1 = new_q1; + // Change just the derivative at the endpoint, but also move the point so it + // matches the derivative exactly if in the interior. if t1 < 1. 
{ this_p1 = new_p1; t1 -= DERIV_EPS; } } let actual_dt = t1 - last_t; - let cubic_params = - CubicParams::from_points_derivs(this_p0, this_p1, this_q0, this_q1, actual_dt); - let est_err = cubic_params.est_euler_err(); let chord_len = (this_p1 - this_p0).length(); - let err = est_err * chord_len; - log!("@@@ loop2: sub:{:?}, {:?} t0: {t0}, t1: {t1}, dt: {dt}, est_err: {est_err}, err: {err}", subcubic, cubic_params); - if err * scale <= tol || dt <= SUBDIV_LIMIT { - log!("@@@ error within tolerance"); - t0_u += 1; - let shift = t0_u.trailing_zeros(); - t0_u >>= shift; - dt *= (1 << shift) as f32; - let euler_params = EulerParams::from_angles(cubic_params.th0, cubic_params.th1); - let es = EulerSeg::from_params(this_p0, this_p1, euler_params); - - let (k0, k1) = (es.params.k0 - 0.5 * es.params.k1, es.params.k1); - - // compute forward integral to determine number of subdivisions - let dist_scaled = offset * es.params.ch / chord_len; - // The number of subdivisions for curvature = 1 - let scale_multiplier = - 0.5 * FRAC_1_SQRT_2 * (scale * chord_len / (es.params.ch * tol)).sqrt(); - // TODO: tune these thresholds - const K1_THRESH: f32 = 1e-3; - const DIST_THRESH: f32 = 1e-3; - let mut a = 0.0; - let mut b = 0.0; - let mut integral = 0.0; - let mut int0 = 0.0; - let (n_frac, robust) = if k1.abs() < K1_THRESH { - let k = k0 + 0.5 * k1; - let n_frac = (k * (k * dist_scaled + 1.0)).abs().sqrt(); - (n_frac, EspcRobust::LowK1) - } else if dist_scaled.abs() < DIST_THRESH { - let f = |x: f32| x * x.abs().sqrt(); - a = k1; - b = k0; - int0 = f(b); - let int1 = f(a + b); - integral = int1 - int0; - //println!("int0={int0}, int1={int1} a={a} b={b}"); - let n_frac = (2. / 3.) * integral / a; - (n_frac, EspcRobust::LowDist) - } else { - a = -2.0 * dist_scaled * k1; - b = -1.0 - 2.0 * dist_scaled * k0; - int0 = espc_int_approx(b); - let int1 = espc_int_approx(a + b); - integral = int1 - int0; - let k_peak = k0 - k1 * b / a; - let integrand_peak = (k_peak * (k_peak * dist_scaled + 1.0)).abs().sqrt(); - let scaled_int = integral * integrand_peak / a; - let n_frac = scaled_int; - (n_frac, EspcRobust::Normal) - }; - let n = (n_frac * scale_multiplier).ceil().max(1.0); - - // Flatten line segments - log!("@@@ loop2: lines: {n}"); - // TODO: make all computation above robust and uncomment this assertion - //assert!(!n.is_nan()); - if n.is_nan() { - // Skip the segment if `n` is NaN. This is for debugging purposes only - log!("@@@ NaN: parameters:\n es: {:#?}\n k0: {k0}, k1: {k1}\n dist_scaled: {dist_scaled}\n es_scale: {es_scale}\n a: {a}\n b: {b}\n int0: {int0}, int1: {int1}, integral: {integral}\n k_peak: {k_peak}\n integrand_peak: {integrand_peak}\n scaled_int: {scaled_int}\n n_frac: {n_frac}", es); - } else { - for i in 0..n as usize { - let lp1 = if i == n as usize - 1 && t1 == 1.0 { - t_end - } else { - let t = (i + 1) as f32 / n; - let s = match robust { - EspcRobust::LowK1 => t, - // Note opportunities to minimize divergence - EspcRobust::LowDist => { - let c = (integral * t + int0).cbrt(); - let inv = c * c.abs(); - (inv - b) / a - } - EspcRobust::Normal => { - let inv = espc_int_inv_approx(integral * t + int0); - (inv - b) / a - } + // Subdivide the loop case when the chord is short, but don't subdivide when it is + // simply a very short segment. 
+ if chord_len >= TANGENT_THRESH + || (this_q0.length_squared() * actual_dt * actual_dt < DERIV_THRESH.powi(2) + && this_q1.length_squared() * actual_dt * actual_dt < DERIV_THRESH.powi(2)) + { + let cubic_params = + CubicParams::from_points_derivs(this_p0, this_p1, this_q0, this_q1, actual_dt); + let est_err = cubic_params.est_euler_err(); + let err = est_err * chord_len; + log!("@@@ loop2: sub:{:?}, {:?} t0: {t0}, t1: {t1}, dt: {dt}, est_err: {est_err}, err: {err}", subcubic, cubic_params); + if err * scale <= tol || dt <= SUBDIV_LIMIT { + log!("@@@ error within tolerance"); + t0_u += 1; + let shift = t0_u.trailing_zeros(); + t0_u >>= shift; + dt *= (1 << shift) as f32; + let euler_params = EulerParams::from_angles(cubic_params.th0, cubic_params.th1); + let es = EulerSeg::from_params(this_p0, this_p1, euler_params); + + let (k0, k1) = (es.params.k0 - 0.5 * es.params.k1, es.params.k1); + + // compute forward integral to determine number of subdivisions + let dist_scaled = offset * es.params.ch / chord_len; + // The number of subdivisions for curvature = 1 + let scale_multiplier = + 0.5 * FRAC_1_SQRT_2 * (scale * chord_len / (es.params.ch * tol)).sqrt(); + // TODO: tune these thresholds + const K1_THRESH: f32 = 1e-3; + const DIST_THRESH: f32 = 1e-3; + let mut a = 0.0; + let mut b = 0.0; + let mut integral = 0.0; + let mut int0 = 0.0; + let (n_frac, robust) = if k1.abs() < K1_THRESH { + let k = k0 + 0.5 * k1; + let n_frac = (k * (k * dist_scaled + 1.0)).abs().sqrt(); + (n_frac, EspcRobust::LowK1) + } else if dist_scaled.abs() < DIST_THRESH { + let f = |x: f32| x * x.abs().sqrt(); + a = k1; + b = k0; + int0 = f(b); + let int1 = f(a + b); + integral = int1 - int0; + //println!("int0={int0}, int1={int1} a={a} b={b}"); + let n_frac = (2. / 3.) * integral / a; + (n_frac, EspcRobust::LowDist) + } else { + a = -2.0 * dist_scaled * k1; + b = -1.0 - 2.0 * dist_scaled * k0; + int0 = espc_int_approx(b); + let int1 = espc_int_approx(a + b); + integral = int1 - int0; + let k_peak = k0 - k1 * b / a; + let integrand_peak = (k_peak * (k_peak * dist_scaled + 1.0)).abs().sqrt(); + let scaled_int = integral * integrand_peak / a; + let n_frac = scaled_int; + (n_frac, EspcRobust::Normal) + }; + let n = (n_frac * scale_multiplier).ceil().max(1.0); + + // Flatten line segments + log!("@@@ loop2: lines: {n}"); + // TODO: make all computation above robust and uncomment this assertion + //assert!(!n.is_nan()); + if n.is_nan() { + // Skip the segment if `n` is NaN. This is for debugging purposes only + log!("@@@ NaN: parameters:\n es: {:#?}\n k0: {k0}, k1: {k1}\n dist_scaled: {dist_scaled}\n es_scale: {es_scale}\n a: {a}\n b: {b}\n int0: {int0}, int1: {int1}, integral: {integral}\n k_peak: {k_peak}\n integrand_peak: {integrand_peak}\n scaled_int: {scaled_int}\n n_frac: {n_frac}", es); + } else { + for i in 0..n as usize { + let lp1 = if i == n as usize - 1 && t1 == 1.0 { + t_end + } else { + let t = (i + 1) as f32 / n; + let s = match robust { + EspcRobust::LowK1 => t, + // Note opportunities to minimize divergence + EspcRobust::LowDist => { + let c = (integral * t + int0).cbrt(); + let inv = c * c.abs(); + (inv - b) / a + } + EspcRobust::Normal => { + let inv = espc_int_inv_approx(integral * t + int0); + (inv - b) / a + } + }; + es.eval_with_offset(s, offset) }; - es.eval_with_offset(s, offset) - }; - let l0 = if offset >= 0. { lp0 } else { lp1 }; - let l1 = if offset >= 0. 
{ lp1 } else { lp0 }; - output_line_with_transform( - path_ix, l0, l1, &transform, line_ix, lines, bbox, - ); - lp0 = lp1; + let l0 = if offset >= 0. { lp0 } else { lp1 }; + let l1 = if offset >= 0. { lp1 } else { lp0 }; + output_line_with_transform( + path_ix, l0, l1, &transform, line_ix, lines, bbox, + ); + lp0 = lp1; + } } + last_p = this_p1; + last_q = this_q1; + last_t = t1; + break; } - last_p = this_p1; - last_q = this_q1; - last_t = t1; - break; } t0_u = t0_u.saturating_mul(2); dt *= 0.5; From 8a521f7e10b0ca18e525954a39fa76bee8ddfd57 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 12 Mar 2024 19:28:02 -0700 Subject: [PATCH 14/16] Tune tolerance value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I have validated that the tolerance value for both arcs and Bézier segments is calibrated correctly. Thus, it makes sense to set both to a value of 0.25, so that caps are consistent. This comes quite close to matching the amount of subdivision in current main, and the line count is also consistent between CPU and GPU stroke expansion. --- src/cpu_shader/flatten.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpu_shader/flatten.rs b/src/cpu_shader/flatten.rs index aa2a77a85..a69fc5d68 100644 --- a/src/cpu_shader/flatten.rs +++ b/src/cpu_shader/flatten.rs @@ -164,7 +164,7 @@ fn flatten_arc( let mut p0 = transform.apply(begin); let mut r = begin - center; - let tol: f32 = 0.1; + let tol: f32 = 0.25; let radius = tol.max((p0 - transform.apply(center)).length()); let theta = (2. * (1. - tol / radius).acos()).max(MIN_THETA); From df81efe15f526fece5103dc0371f171f1a839813 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 12 Mar 2024 19:41:48 -0700 Subject: [PATCH 15/16] Make GPU tolerance consistent with CPU Oops, I made the change only on the CPU side. This brings GPU into agreement. --- shader/flatten.wgsl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shader/flatten.wgsl b/shader/flatten.wgsl index 52ec79fc1..0998cc994 100644 --- a/shader/flatten.wgsl +++ b/shader/flatten.wgsl @@ -497,7 +497,7 @@ fn flatten_arc( var r = begin - center; let MIN_THETA = 0.0001; - let tol = 0.1; + let tol = 0.25; let radius = max(tol, length(p0 - transform_apply(transform, center))); let theta = max(MIN_THETA, 2. * acos(1. - tol / radius)); From 9c35df82da5ab06443a5eae265c6ffb6f687d817 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Wed, 13 Mar 2024 14:17:11 -0700 Subject: [PATCH 16/16] Address review comments A bit more comment, but also a more robust protection against a divide by zero, which happens in the colinear double-cusp case, as exercised by tricky-strokes. --- shader/flatten.wgsl | 4 ++-- src/cpu_shader/euler.rs | 12 ++++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/shader/flatten.wgsl b/shader/flatten.wgsl index 0998cc994..6ee46b354 100644 --- a/shader/flatten.wgsl +++ b/shader/flatten.wgsl @@ -96,8 +96,8 @@ fn est_euler_err(cparams: CubicParams) -> f32 { if cth0 * cth1 < 0.0 { return 2.0; } - let e0 = (2. / 3.) / (1.0 + cth0); - let e1 = (2. / 3.) / (1.0 + cth1); + let e0 = (2. / 3.) / max(1.0 + cth0, 1e-9); + let e1 = (2. / 3.) 
/ max(1.0 + cth1, 1e-9); let s0 = sin(cparams.th0); let s1 = sin(cparams.th1); let s01 = cth0 * s1 + cth1 * s0; diff --git a/src/cpu_shader/euler.rs b/src/cpu_shader/euler.rs index 08c8eddcf..17c6bc9bb 100644 --- a/src/cpu_shader/euler.rs +++ b/src/cpu_shader/euler.rs @@ -85,10 +85,18 @@ impl CubicParams { // Rationale: this happens when fitting a cusp or near-cusp with // a near 180 degree u-turn. The actual ES is bounded in that case. // Further subdivision won't reduce the angles if actually a cusp. + // + // A value of 2.0 represents the approximate worst case distance + // from an Euler spiral with 0 and pi tangents to the chord. It + // is not very critical; doubling the value would result in one more + // subdivision in effectively a binary search for the cusp, while too + // small a value may result in the actual error exceeding the bound. return 2.0; } - let e0 = (2. / 3.) / (1.0 + cth0); - let e1 = (2. / 3.) / (1.0 + cth1); + // Protect against divide-by-zero. This happens with a double cusp, so + // should in the general case cause subdivisions. + let e0 = (2. / 3.) / (1.0 + cth0).max(1e-9); + let e1 = (2. / 3.) / (1.0 + cth1).max(1e-9); let s0 = self.th0.sin(); let s1 = self.th1.sin(); // Note: some other versions take sin of s0 + s1 instead. Those are incorrect.
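As a closing illustration of the divide-by-zero guard above: the clamp matters in the double-cusp case, where both tangent angles are near ±π, so `1 + cos(th)` underflows to zero while the `cth0 * cth1 < 0.0` early return does not trigger (the product of the cosines is positive). The snippet below is a standalone sketch, not part of the patch, showing the unguarded expression producing an infinite value that would then feed into the rest of the error estimate, versus the clamped version staying finite (and large enough that the caller still subdivides).

    use std::f32::consts::PI;

    fn main() {
        // Near-180-degree tangent, as produced by collinear double-cusp inputs.
        let th0 = PI;
        let unguarded = (2. / 3.) / (1.0 + th0.cos());          // divides by ~0
        let guarded = (2. / 3.) / (1.0 + th0.cos()).max(1e-9);  // clamped denominator
        println!("unguarded e0 = {unguarded}"); // inf (or astronomically large)
        println!("guarded   e0 = {guarded}");   // ~6.7e8, finite
    }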