diff --git a/Source/DafnyStandardLibraries/Makefile b/Source/DafnyStandardLibraries/Makefile index 231dabed7ab..7669cd5692e 100644 --- a/Source/DafnyStandardLibraries/Makefile +++ b/Source/DafnyStandardLibraries/Makefile @@ -10,6 +10,10 @@ DOO_FILE_ARITHMETIC_TARGET=binaries/DafnyStandardLibraries-arithmetic.doo all: check-binary test check-format check-examples +verify: + $(DAFNY) verify src/dfyconfig.toml + $(DAFNY) verify src/dfyconfig-arithmetic.toml + build-binary: $(DAFNY) build -t:lib src/dfyconfig.toml --output:${DOO_FILE_SOURCE} $(DAFNY) build -t:lib src/dfyconfig-arithmetic.toml --output:${DOO_FILE_ARITHMETIC_SOURCE} @@ -29,14 +33,29 @@ update-binary: build-binary # For now we only have examples and no dedicated tests. # We will likely want a test directory as well, # with deeper coverage of module functionality. -test: +build-examples: $(DAFNY) build -t:lib examples/dfyconfig.toml --output:build/stdlibexamples.doo + +test-cs: build-examples $(DAFNY) test -t:cs build/stdlibexamples.doo examples/BoundedInts/NonDefault.cs --output:build/stdlibexamples --coverage-report:build/testcoverage + +test-java: build-examples $(DAFNY) test -t:java build/stdlibexamples.doo examples/BoundedInts/Externs/NonDefault.java --output:build/stdlibexamples + +test-go: build-examples $(DAFNY) test -t:go build/stdlibexamples.doo examples/BoundedInts/NonDefault.go --output:build/stdlibexamples + +test-py: build-examples $(DAFNY) test -t:py build/stdlibexamples.doo examples/BoundedInts/NonDefault.py --output:build/stdlibexamples + +test-js: build-examples $(DAFNY) test -t:js build/stdlibexamples.doo examples/BoundedInts/NonDefault.js --output:build/stdlibexamples - + +test: test-cs test-java test-go test-py test-js + +# We only generate coverage from the C# tests +coverage: test-cs + format: $(DAFNY) format . @@ -47,4 +66,4 @@ check-examples: cd build && ../scripts/check-examples `find .. -name '*.md'` clean: - rm -rf build \ No newline at end of file + rm -rf build diff --git a/Source/DafnyStandardLibraries/binaries/DafnyStandardLibraries-arithmetic.doo b/Source/DafnyStandardLibraries/binaries/DafnyStandardLibraries-arithmetic.doo index e06c6a66bb7..8d909f67490 100644 Binary files a/Source/DafnyStandardLibraries/binaries/DafnyStandardLibraries-arithmetic.doo and b/Source/DafnyStandardLibraries/binaries/DafnyStandardLibraries-arithmetic.doo differ diff --git a/Source/DafnyStandardLibraries/binaries/DafnyStandardLibraries.doo b/Source/DafnyStandardLibraries/binaries/DafnyStandardLibraries.doo index ec6b9d3c069..a325a4b773e 100644 Binary files a/Source/DafnyStandardLibraries/binaries/DafnyStandardLibraries.doo and b/Source/DafnyStandardLibraries/binaries/DafnyStandardLibraries.doo differ diff --git a/Source/DafnyStandardLibraries/examples/CollectionsExamples.dfy b/Source/DafnyStandardLibraries/examples/CollectionsExamples.dfy index 27883101143..e50d60adbd4 100644 --- a/Source/DafnyStandardLibraries/examples/CollectionsExamples.dfy +++ b/Source/DafnyStandardLibraries/examples/CollectionsExamples.dfy @@ -1,6 +1,6 @@ /******************************************************************************* * Copyright by the contributors to the Dafny Project - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT *******************************************************************************/ module CollectionsExamples { @@ -9,7 +9,7 @@ module CollectionsExamples { // but they also use AssertAndExpect to prove some of the simpler cases as well. // If and when we have some way of measuring "specification coverage", // this pattern should help with that kind of coverage as well. - // + // // There are lots of small test methods here, // which besides following the general best practice // of testing one or perhaps a couple of related things at once, @@ -195,22 +195,4 @@ module CollectionsExamples { } } - module Helpers { - - import opened DafnyStdLibs.Wrappers - - // This should be moved somewhere more central at some point, - // perhaps even into a utilities library. - // Or tweak /testContracts to support this use case better. - method AssertAndExpect(p: bool, maybeMsg: Option := None) requires p { - match maybeMsg { - case None => { - expect p; - } - case Some(msg) => { - expect p, msg; - } - } - } - } -} \ No newline at end of file +} diff --git a/Source/DafnyStandardLibraries/examples/Helpers.dfy b/Source/DafnyStandardLibraries/examples/Helpers.dfy new file mode 100644 index 00000000000..081cdb5e954 --- /dev/null +++ b/Source/DafnyStandardLibraries/examples/Helpers.dfy @@ -0,0 +1,19 @@ +/******************************************************************************* + * Copyright by the contributors to the Dafny Project + * SPDX-License-Identifier: MIT + *******************************************************************************/ + +module Helpers { + import opened DafnyStdLibs.Wrappers + // TODO: consider tweaking /testContracts to support this use case better. + method AssertAndExpect(p: bool, maybeMsg: Option := None) requires p { + match maybeMsg { + case None => { + expect p; + } + case Some(msg) => { + expect p, msg; + } + } + } +} diff --git a/Source/DafnyStandardLibraries/examples/UnicodeExamples.dfy b/Source/DafnyStandardLibraries/examples/UnicodeExamples.dfy new file mode 100644 index 00000000000..fd841ae74c9 --- /dev/null +++ b/Source/DafnyStandardLibraries/examples/UnicodeExamples.dfy @@ -0,0 +1,193 @@ + +/******************************************************************************* + * Copyright by the contributors to the Dafny Project + * SPDX-License-Identifier: MIT + *******************************************************************************/ + +module UnicodeExamples { + module BaseExamples { + import opened DafnyStdLibs.Unicode.Base + import opened Helpers + + const TEST_ASSIGNED_PLANE_CODE_POINTS: set := { + 0x000055, // Latin capital letter U + 0x01F11D, // Parenthesized Latin capital letter N + 0x02053C, // CJK unified ideograph 𠔼 + 0x030256, // CJK unified ideograph 𰉖 + 0x0E004F, // Tag Latin capital letter O + 0x0FDDDD, // arbitrary code point in Private Use Area-A + 0x10EEEE // arbitrary code point in Private Use Area-B + } + + lemma LemmaAssignedCodePoints() + ensures forall p | p in TEST_ASSIGNED_PLANE_CODE_POINTS :: IsInAssignedPlane(p) + { + reveal IsInAssignedPlane(); + } + + method {:test} TestAssignedCodePoints() { + LemmaAssignedCodePoints(); + AssertAndExpect(forall p | p in TEST_ASSIGNED_PLANE_CODE_POINTS :: IsInAssignedPlane(p)); + } + } + + module Utf8EncodingFormExamples { + import opened DafnyStdLibs.Unicode.Base + import opened DafnyStdLibs.Unicode.Utf8EncodingForm + import opened DafnyStdLibs.Wrappers + import opened Helpers + + method {:test} TestEmptySequenceIsWellFormed() { + expect IsWellFormedCodeUnitSequence([]); + } + + const TEST_SCALAR_VALUES: seq<(ScalarValue, WellFormedCodeUnitSeq)> := [ + // One byte: dollar sign + (0x0024, [0x24]), + // Two bytes: pound sign + (0x00A3, [0xC2, 0xA3]), + // Three bytes: euro sign + (0x20AC, [0xE2, 0x82, 0xAC]), + // Four bytes: money bag emoji + (0x1F4B0, [0xF0, 0x9F, 0x92, 0xB0]) + ] + + method {:test} TestEncodeDecodeScalarValue() { + for i := 0 to |TEST_SCALAR_VALUES| { + var pair := TEST_SCALAR_VALUES[i]; + AssertAndExpect(EncodeScalarValue(pair.0) == pair.1); + AssertAndExpect(DecodeCodeUnitSequence(pair.1) == [pair.0]); + AssertAndExpect(DecodeCodeUnitSequenceChecked(pair.1) == Some([pair.0])); + } + } + + method {:test} TestEmptySequenceIsNotMinimalWellFormed() { + expect !IsMinimalWellFormedCodeUnitSubsequence([]); + } + + method {:test} TestMinimalWellFormedCodeUnitSubsequences() { + for i := 0 to |TEST_SCALAR_VALUES| { + var pair := TEST_SCALAR_VALUES[i]; + expect IsMinimalWellFormedCodeUnitSubsequence(pair.1); + } + } + + // Examples taken from description of Table 3-7. + const TEST_ILL_FORMED_SEQUENCES: seq := [ + // C0 is not well-formed as a first byte + [0xC0, 0xAF], + // 9F is not well-formed as a second byte when E0 is a well-formed first byte + [0xE0, 0x9F, 0x80] + ] + + method {:test} TestDecodeIllFormedSequence() { + for i := 0 to |TEST_ILL_FORMED_SEQUENCES| { + var s := TEST_ILL_FORMED_SEQUENCES[i]; + AssertAndExpect(DecodeCodeUnitSequenceChecked(s).None?); + } + } + } + + module Utf16EncodingFormExamples { + import opened DafnyStdLibs.Unicode.Base + import opened DafnyStdLibs.Unicode.Utf16EncodingForm + import opened DafnyStdLibs.Wrappers + import opened Helpers + + method {:test} TestEmptySequenceIsWellFormed() { + expect IsWellFormedCodeUnitSequence([]); + } + + const TEST_SCALAR_VALUES: seq<(ScalarValue, WellFormedCodeUnitSeq)> := [ + // One code unit: dollar sign + (0x0024, [0x0024]), + // Two code units: money bag emoji + (0x1F4B0, [0xD83D, 0xDCB0]) + ] + + method {:test} TestEncodeDecodeScalarValue() { + for i := 0 to |TEST_SCALAR_VALUES| { + var pair := TEST_SCALAR_VALUES[i]; + AssertAndExpect(EncodeScalarValue(pair.0) == pair.1); + AssertAndExpect(DecodeCodeUnitSequence(pair.1) == [pair.0]); + AssertAndExpect(DecodeCodeUnitSequenceChecked(pair.1) == Some([pair.0])); + } + } + + method {:test} TestEmptySequenceIsNotMinimalWellFormed() { + expect !IsMinimalWellFormedCodeUnitSubsequence([]); + } + + method {:test} TestMinimalWellFormedCodeUnitSubsequences() { + for i := 0 to |TEST_SCALAR_VALUES| { + var pair := TEST_SCALAR_VALUES[i]; + expect IsMinimalWellFormedCodeUnitSubsequence(pair.1); + } + } + + // Because surrogate code points are not Unicode scalar values, isolated UTF-16 code units in the range + // D800_16 .. DFFF_16 are ill-formed. (Section 3.9 D91) + const TEST_ILL_FORMED_SEQUENCES: seq := [ + [0xD800], + [0xDABC], + [0xDFFF] + ] + + method {:test} TestDecodeIllFormedSequence() { + for i := 0 to |TEST_ILL_FORMED_SEQUENCES| { + var s := TEST_ILL_FORMED_SEQUENCES[i]; + AssertAndExpect(DecodeCodeUnitSequenceChecked(s).None?); + } + } + } + + module UnicodeStringsWithUnicodeCharExamples { + import opened DafnyStdLibs.BoundedInts + import opened DafnyStdLibs.Unicode.Base + import opened DafnyStdLibs.Wrappers + import opened Helpers + + import UnicodeStrings = DafnyStdLibs.Unicode.UnicodeStringsWithUnicodeChar + + const currenciesStr := "\U{24}\U{A3}\U{20AC}\U{1F4B0}" + const currenciesUtf8: seq := [0x24] + [0xC2, 0xA3] + [0xE2, 0x82, 0xAC] + [0xF0, 0x9F, 0x92, 0xB0] + const currenciesUtf16: seq := [0x0024] + [0x00A3] + [0x20AC] + [0xD83D, 0xDCB0] + + method {:test} TestToUTF8Checked() { + expect UnicodeStrings.ToUTF8Checked(currenciesStr) == Some(currenciesUtf8); + } + + method {:test} TestFromUTF8Checked() { + expect UnicodeStrings.FromUTF8Checked(currenciesUtf8) == Some(currenciesStr); + expect UnicodeStrings.FromUTF8Checked(currenciesUtf8[2..]) == None; + } + + method {:test} TestToUTF16() { + expect UnicodeStrings.ToUTF16Checked(currenciesStr) == Some(currenciesUtf16); + } + + method {:test} TestFromUTF16Checked() { + expect UnicodeStrings.FromUTF16Checked(currenciesUtf16) == Some(currenciesStr); + expect UnicodeStrings.FromUTF16Checked(currenciesUtf16[..|currenciesUtf16| - 1]) == None; + } + + method {:test} TestASCIIToUnicode() { + expect UnicodeStrings.ASCIIToUTF8("foobar") == [0x66, 0x6F, 0x6F, 0x62, 0x61, 0x72]; + expect UnicodeStrings.ASCIIToUTF16("foobar") == [0x66, 0x6F, 0x6F, 0x62, 0x61, 0x72]; + } + } + + module Utf8EncodingSchemeExamples { + import opened DafnyStdLibs.Unicode.Base + import opened DafnyStdLibs.Unicode.Utf8EncodingForm + import EncodingScheme = DafnyStdLibs.Unicode.Utf8EncodingScheme + import opened DafnyStdLibs.Wrappers + import opened Helpers + + const currenciesUtf8: CodeUnitSeq := [0x24] + [0xC2, 0xA3] + [0xE2, 0x82, 0xAC] + [0xF0, 0x9F, 0x92, 0xB0] + + method {:test} TestSerializeDeserialize() { + expect EncodingScheme.Deserialize(EncodingScheme.Serialize(currenciesUtf8)) == currenciesUtf8; + } + } +} diff --git a/Source/DafnyStandardLibraries/src/DafnyStdLibs/Base64.dfy b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Base64.dfy index e463c71b82b..49f198ae0c1 100644 --- a/Source/DafnyStandardLibraries/src/DafnyStdLibs/Base64.dfy +++ b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Base64.dfy @@ -98,7 +98,7 @@ module DafnyStdLibs.Base64 { else (c - 65 as char) as index } - lemma {:vcs_split_on_every_assert} CharToIndexToChar(c: char) + lemma {:rlimit 2000} {:vcs_split_on_every_assert} CharToIndexToChar(c: char) requires IsBase64Char(c) ensures IndexToChar(CharToIndex(c)) == c { @@ -702,6 +702,9 @@ module DafnyStdLibs.Base64 { reveal SeqToBV24(); reveal IndexSeqToBV24(); reveal BV24ToIndexSeq(); + assert d'[0] == IndexToChar(CharToIndex(s[0])); + assert d'[1] == IndexToChar(CharToIndex(s[1])); + assert d'[2] == IndexToChar(CharToIndex(s[2])); } [IndexToChar(CharToIndex(s[0])), IndexToChar(CharToIndex(s[1])), IndexToChar(CharToIndex(s[2])), '=']; == { CharToIndexToChar(s[0]); CharToIndexToChar(s[1]); CharToIndexToChar(s[2]); } @@ -1210,7 +1213,7 @@ module DafnyStdLibs.Base64 { AboutDecodeValid(s, DecodeValid(s)); } - lemma DecodeValidEncode1Padding(s: seq) + lemma {:rlimit 12000} DecodeValidEncode1Padding(s: seq) requires IsBase64String(s) requires |s| >= 4 requires Is1Padding(s[(|s| - 4)..]) @@ -1446,7 +1449,7 @@ module DafnyStdLibs.Base64 { seq(|b|, i requires 0 <= i < |b| => b[i] as uint8) } - lemma UInt8sToBVsToUInt8s(u: seq) + lemma {:rlimit 8000} UInt8sToBVsToUInt8s(u: seq) ensures BVsToUInt8s(UInt8sToBVs(u)) == u { // TODO: reduce resource use diff --git a/Source/DafnyStandardLibraries/src/DafnyStdLibs/Collections/Seqs.dfy b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Collections/Seqs.dfy index b7e7d27e50c..f3b4b854a10 100644 --- a/Source/DafnyStandardLibraries/src/DafnyStdLibs/Collections/Seqs.dfy +++ b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Collections/Seqs.dfy @@ -624,7 +624,7 @@ module DafnyStdLibs.Collections.Seqs { /* Applying a function to a sequence is distributive over concatenation. That is, concatenating two sequences and then applying Map is the same as applying Map to each sequence separately, and then concatenating the two resulting sequences. */ - lemma {:opaque} LemmaMapDistributesOverConcat(f: (T ~> R), xs: seq, ys: seq) + lemma {:opaque} {:rlimit 2000} LemmaMapDistributesOverConcat(f: (T ~> R), xs: seq, ys: seq) requires forall i {:trigger xs[i]}:: 0 <= i < |xs| ==> f.requires(xs[i]) requires forall j {:trigger ys[j]}:: 0 <= j < |ys| ==> f.requires(ys[j]) ensures Map(f, xs + ys) == Map(f, xs) + Map(f, ys) diff --git a/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/Unicode.dfy b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/Unicode.dfy new file mode 100644 index 00000000000..cae119b91f1 --- /dev/null +++ b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/Unicode.dfy @@ -0,0 +1,17 @@ +/******************************************************************************* + * Copyright by the contributors to the Dafny Project + * SPDX-License-Identifier: MIT + *******************************************************************************/ + +/** + This module defines concepts from the core specification + of the [Unicode Standard 15.1.0](https://www.unicode.org/versions/Unicode15.1.0/): + scalar values, code points, important kinds of code point sequences, + Unicode encoding forms and encoding schemes (including UTF-8 and UTF-16), + and helpful properties and functions for manipulation and conversion of Unicode strings. + */ +module DafnyStdLibs.Unicode { + // This top-level module contains no definitions. + // It exists to enable import between child modules, + // and so that its doc comment appears in generated documentation. +} diff --git a/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/Unicode.md b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/Unicode.md new file mode 100644 index 00000000000..d0e951cde74 --- /dev/null +++ b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/Unicode.md @@ -0,0 +1,12 @@ +## The `Unicode` module + +This module defines concepts from the core specification +of the [Unicode Standard 15.1.0](https://www.unicode.org/versions/Unicode15.1.0/): +scalar values, code points, important kinds of code point sequences, +Unicode encoding forms and encoding schemes (including UTF-8 and UTF-16), +and helpful properties and functions for manipulation and conversion of Unicode strings. + +At this time, the module can only be used with `--unicode-char:true` +(the default value since Dafny 4.0.0). +In the future, support for `--unicode-char:false` (via module abstraction/refinement) +will be added. diff --git a/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/UnicodeBase.dfy b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/UnicodeBase.dfy new file mode 100644 index 00000000000..f3bb9fd02d6 --- /dev/null +++ b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/UnicodeBase.dfy @@ -0,0 +1,59 @@ +/******************************************************************************* + * Copyright by the contributors to the Dafny Project + * SPDX-License-Identifier: MIT + *******************************************************************************/ + +/** + * This module implements basic functionality of Unicode 15.1.0. + */ +module DafnyStdLibs.Unicode.Base { + /** + * Any value in the Unicode codespace (a range of integers from 0 to 10FFFF_16). (Section 3.4 D9-D10) + */ + type CodePoint = i: bv24 | 0 <= i <= 0x10FFFF + + /** + * A Unicode code point in the range U+D800 to U+DBFF. (Section 3.8 D71) + */ + type HighSurrogateCodePoint = p: CodePoint | HIGH_SURROGATE_MIN <= p <= HIGH_SURROGATE_MAX + witness HIGH_SURROGATE_MIN + const HIGH_SURROGATE_MIN: CodePoint := 0xD800 + const HIGH_SURROGATE_MAX: CodePoint := 0xDBFF + + /** + * A Unicode code point in the range U+DC00 to U+DFFF. (Section 3.8 D73) + */ + type LowSurrogateCodePoint = p: CodePoint | LOW_SURROGATE_MIN <= p <= LOW_SURROGATE_MAX + witness LOW_SURROGATE_MIN + const LOW_SURROGATE_MIN: CodePoint := 0xDC00 + const LOW_SURROGATE_MAX: CodePoint := 0xDFFF + + /** + * Any Unicode code point except high-surrogate and low-surrogate code points. (Section 3.9 D76) + */ + type ScalarValue = p: CodePoint | + && (p < HIGH_SURROGATE_MIN || p > HIGH_SURROGATE_MAX) + && (p < LOW_SURROGATE_MIN || p > LOW_SURROGATE_MAX) + + const ASSIGNED_PLANES: set := { + 0, // Basic Multilingual Plane + 1, // Supplementary Multilingual Plane + 2, // Supplementary Ideographic Plane + 3, // Tertiary Ideographic Plane + 14, // Supplementary Special Purpose Plane + 15, // Supplementary Private Use Area A + 16 // Supplementary Private Use Area B + } + + opaque predicate IsInAssignedPlane(i: CodePoint) { + var plane := (i >> 16) as bv8; + plane in ASSIGNED_PLANES + } + + // These are actually supersets of the Unicode planes, + // since not all code points in a plane are assigned. + // + // TODO: check against the list of assigned code points, instead of only checking their plane + // (https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt) + type AssignedCodePoint = p: CodePoint | IsInAssignedPlane(p) witness * +} diff --git a/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/UnicodeEncodingForm.dfy b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/UnicodeEncodingForm.dfy new file mode 100644 index 00000000000..20399c4d281 --- /dev/null +++ b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/UnicodeEncodingForm.dfy @@ -0,0 +1,337 @@ +/******************************************************************************* + * Copyright by the contributors to the Dafny Project + * SPDX-License-Identifier: MIT + *******************************************************************************/ + +/** + * A Unicode encoding form assigns each Unicode scalar value to a unique code unit sequence. + * + * A concrete `UnicodeEncodingForm` MUST define the following: + * - The type `CodeUnit`. + * - The predicate `IsMinimalWellFormedCodeUnitSubsequence`, which defines the set of encodings of scalar values, + * known as "minimal well-formed code unit subsequences". + * - The function `SplitPrefixMinimalWellFormedCodeUnitSubsequence`, which defines the algorithm by which to parse + * a minimal well-formed code unit subsequence from the beginning of a code unit sequence. + * - The function `EncodeScalarValue`, which defines the mapping from scalar values to minimal well-formed code unit + * subsequences. + * - The function `DecodeMinimalWellFormedCodeUnitSubsequence`, which defines the mapping from minimal well-formed + * code unit subsequences to scalar values. + */ +abstract module DafnyStdLibs.Unicode.UnicodeEncodingForm { + import opened Wrappers + + import Functions + import Collections.Seqs + import opened Base + + type CodeUnitSeq = seq + type WellFormedCodeUnitSeq = s: CodeUnitSeq + | IsWellFormedCodeUnitSequence(s) + witness [] + type MinimalWellFormedCodeUnitSeq = s: CodeUnitSeq + | IsMinimalWellFormedCodeUnitSubsequence(s) + witness * + + // + // Begin abstract items. + // + + /** + * A code unit is the minimal bit combination that can represent a unit of encoded text for processing or + * interchange. (Section 3.9 D77.) + */ + type CodeUnit + + /** + * A well-formed Unicode code unit sequence that maps to a single Unicode scalar value. + * (Section 3.9 D85a.) + */ + function IsMinimalWellFormedCodeUnitSubsequence(s: CodeUnitSeq): (b: bool) + ensures b ==> + && |s| > 0 + && forall i | 0 < i < |s| :: !IsMinimalWellFormedCodeUnitSubsequence(s[..i]) + decreases |s| + + /** + * Returns the shortest prefix of `s` that is a minimal well-formed code unit subsequence, + * or None if there is no such prefix. + */ + function SplitPrefixMinimalWellFormedCodeUnitSubsequence(s: CodeUnitSeq): (maybePrefix: Option) + ensures |s| == 0 ==> maybePrefix.None? + ensures (exists i | 0 < i <= |s| :: IsMinimalWellFormedCodeUnitSubsequence(s[..i])) <==> + && maybePrefix.Some? + ensures maybePrefix.Some? ==> + && var prefix := maybePrefix.Extract(); + && 0 < |prefix| <= |s| + && prefix == s[..|prefix|] + && forall i | 0 < i < |prefix| :: !IsMinimalWellFormedCodeUnitSubsequence(s[..i]) + + /** + * Returns the minimal well-formed code unit subsequence that this encoding form assigns to the given scalar value. + * The Unicode standard requires that this is injective. + * + * TODO: enforce that implementations satisfy Functions.Injective + */ + function EncodeScalarValue(v: ScalarValue): (m: MinimalWellFormedCodeUnitSeq) + + /** + * Returns the scalar value that this encoding form assigns to the given minimal well-formed code unit subsequence. + */ + function DecodeMinimalWellFormedCodeUnitSubsequence(m: MinimalWellFormedCodeUnitSeq): (v: ScalarValue) + ensures EncodeScalarValue(v) == m + + // + // End abstract items. + // + + /** + * If minimal well-formed code unique subsequences `m1` and `m2` are prefixes of `s`, then they are equal. + */ + lemma LemmaUniquePrefixMinimalWellFormedCodeUnitSeq( + s: CodeUnitSeq, m1: MinimalWellFormedCodeUnitSeq, m2: MinimalWellFormedCodeUnitSeq + ) + decreases |s|, |m1|, |m2| + requires m1 <= s + requires m2 <= s + ensures m1 == m2 + { + // Handle only the |m1| <= |m2| case explicitly + if |m1| > |m2| { + LemmaUniquePrefixMinimalWellFormedCodeUnitSeq(s, m2, m1); + } else { + assert m1 <= m2; + assert m1 == m2 by { + if m1 < m2 { + assert false by { assert m1 == m2[..|m1|]; } + } + } + } + } + + /** + * If `ms` is the concatenation of a minimal well-formed code unit subsequence `m` and a code unit sequence `s`, + * then the shortest minimal well-formed code unit subsequence prefix of `ms` is simply `m`. + */ + lemma LemmaSplitPrefixMinimalWellFormedCodeUnitSubsequenceInvertsPrepend(m: MinimalWellFormedCodeUnitSeq, s: CodeUnitSeq) + ensures SplitPrefixMinimalWellFormedCodeUnitSubsequence(m + s) == Some(m) + { + var ms := m + s; + var maybePrefix := SplitPrefixMinimalWellFormedCodeUnitSubsequence(ms); + assert maybePrefix.Some? by { assert IsMinimalWellFormedCodeUnitSubsequence(ms[..|m|]); } + var prefix := maybePrefix.Extract(); + + assert m <= ms; + assert prefix <= ms; + LemmaUniquePrefixMinimalWellFormedCodeUnitSeq(ms, m, prefix); + } + + /** + * Returns the unique partition of the given code unit sequence into minimal well-formed code unit subsequences, + * or None if no such partition exists. + */ + function PartitionCodeUnitSequenceChecked(s: CodeUnitSeq): (maybeParts: Option>) + ensures maybeParts.Some? ==> Seqs.Flatten(maybeParts.Extract()) == s + decreases |s| + { + if s == [] then Some([]) + else + var prefix :- SplitPrefixMinimalWellFormedCodeUnitSubsequence(s); + // Recursing/iterating on subsequences leads to quadratic running time in most/all Dafny runtimes as of this writing. + // This definition (and others in the Unicode modules) emphasizes clarity and correctness, + // and while the by-method implementation avoids stack overflows due to non-tail-recursive recursion, + // it doesn't yet avoid the subsequences. + // The best solution would be to ensure all Dafny runtimes implement subsequences as an O(1) + // operation, so this implementation would become linear. + var restParts :- PartitionCodeUnitSequenceChecked(s[|prefix|..]); + Some([prefix] + restParts) + } by method { + if s == [] { + return Some([]); + } + var result: seq := []; + var rest := s; + while |rest| > 0 + invariant PartitionCodeUnitSequenceChecked(s).Some? + <==> PartitionCodeUnitSequenceChecked(rest).Some? + invariant + PartitionCodeUnitSequenceChecked(s).Some? ==> + && PartitionCodeUnitSequenceChecked(s).value + == result + PartitionCodeUnitSequenceChecked(rest).value + { + var prefix :- SplitPrefixMinimalWellFormedCodeUnitSubsequence(rest); + result := result + [prefix]; + rest := rest[|prefix|..]; + } + assert result + [] == result; + return Some(result); + } + + /** + * Returns the unique partition of the given well-formed code unit sequence into minimal well-formed code unit + * subsequences. + */ + function PartitionCodeUnitSequence(s: WellFormedCodeUnitSeq): (parts: seq) + ensures Seqs.Flatten(parts) == s + { + PartitionCodeUnitSequenceChecked(s).Extract() + } + + /** + * The partitioning of a minimal well-formed code unit subsequence is the singleton sequence + * containing exactly the minimal well-formed code unit subsequence. + */ + lemma LemmaPartitionMinimalWellFormedCodeUnitSubsequence(m: MinimalWellFormedCodeUnitSeq) + ensures PartitionCodeUnitSequenceChecked(m) == Some([m]) + { + LemmaSplitPrefixMinimalWellFormedCodeUnitSubsequenceInvertsPrepend(m, []); + calc == { + Some(m); + SplitPrefixMinimalWellFormedCodeUnitSubsequence(m + []); + { assert m + [] == m; } + SplitPrefixMinimalWellFormedCodeUnitSubsequence(m); + } + calc == { + PartitionCodeUnitSequenceChecked(m); + Some([m] + []); + { assert [m] + [] == [m]; } + Some([m]); + } + } + + /** + * A code unit sequence is well-formed iff it can be partitioned into a sequence of minimal well-formed code unit subsequences. + */ + function IsWellFormedCodeUnitSequence(s: CodeUnitSeq): (b: bool) + { + PartitionCodeUnitSequenceChecked(s).Some? + } + + /** + * A minimal well-formed code unit subsequence is a well-formed code unit sequence. + */ + lemma LemmaMinimalWellFormedCodeUnitSubsequenceIsWellFormedSequence(m: MinimalWellFormedCodeUnitSeq) + ensures IsWellFormedCodeUnitSequence(m) + { + LemmaPartitionMinimalWellFormedCodeUnitSubsequence(m); + } + + /** + * The concatenation of a minimal well-formed code unit subsequence and a well-formed code unit sequence + * is itself a well-formed code unit sequence. + */ + lemma LemmaPrependMinimalWellFormedCodeUnitSubsequence(m: MinimalWellFormedCodeUnitSeq, s: WellFormedCodeUnitSeq) + ensures IsWellFormedCodeUnitSequence(m + s) + { + LemmaPartitionMinimalWellFormedCodeUnitSubsequence(m); + LemmaSplitPrefixMinimalWellFormedCodeUnitSubsequenceInvertsPrepend(m, s); + } + + /** + * The concatenation of minimal well-formed code unit subsequences is itself a well-formed code unit sequence. + */ + lemma LemmaFlattenMinimalWellFormedCodeUnitSubsequences(ms: seq) + ensures IsWellFormedCodeUnitSequence(Seqs.Flatten(ms)) + { + if |ms| == 0 { + // assert IsWellFormedCodeUnitSequence(Seqs.Flatten(ms)); + } + else { + var head := ms[0]; + var tail := ms[1..]; + LemmaFlattenMinimalWellFormedCodeUnitSubsequences(tail); + var flatTail := Seqs.Flatten(tail); + LemmaPrependMinimalWellFormedCodeUnitSubsequence(head, flatTail); + } + } + + /** + * The concatenation of well-formed code unit sequences is itself a well-formed code unit sequence. + */ + lemma LemmaConcatWellFormedCodeUnitSubsequences(s: WellFormedCodeUnitSeq, t: WellFormedCodeUnitSeq) + ensures IsWellFormedCodeUnitSequence(s + t) + { + var partsS := PartitionCodeUnitSequence(s); + var partsT := PartitionCodeUnitSequence(t); + var partsST := partsS + partsT; + Seqs.LemmaFlattenConcat(partsS, partsT); + + LemmaFlattenMinimalWellFormedCodeUnitSubsequences(partsST); + } + + /** + * Returns the well-formed Unicode string that is the encoding of the given scalar value sequence. + */ + function EncodeScalarSequence(vs: seq): (s: WellFormedCodeUnitSeq) + { + var ms := Seqs.Map(EncodeScalarValue, vs); + LemmaFlattenMinimalWellFormedCodeUnitSubsequences(ms); + Seqs.Flatten(ms) + } + by method { + // Optimize to to avoid allocating the intermediate unflattened sequence. + // We can't quite use Seqs.FlatMap easily because we need to prove the result + // is not just a seq but a WellFormedCodeUnitSeqs. + // TODO: We can be even more efficient by using a JSON.Utils.Vectors.Vector instead. + s := []; + ghost var unflattened: seq := []; + for i := |vs| downto 0 + invariant unflattened == Seqs.Map(EncodeScalarValue, vs[i..]) + invariant s == Seqs.Flatten(unflattened) + { + var next: MinimalWellFormedCodeUnitSeq := EncodeScalarValue(vs[i]); + unflattened := [next] + unflattened; + LemmaPrependMinimalWellFormedCodeUnitSubsequence(next, s); + s := next + s; + } + } + + /** + * Returns the scalar value sequence encoded by the given well-formed Unicode string. + */ + function DecodeCodeUnitSequence(s: WellFormedCodeUnitSeq): (vs: seq) + ensures EncodeScalarSequence(vs) == s + { + var parts := PartitionCodeUnitSequence(s); + var vs := Seqs.Map(DecodeMinimalWellFormedCodeUnitSubsequence, parts); + calc == { + s; + Seqs.Flatten(parts); + { assert parts == Seqs.Map(EncodeScalarValue, vs); } + Seqs.Flatten(Seqs.Map(EncodeScalarValue, vs)); + EncodeScalarSequence(vs); + } + vs + } + + /** + * Returns the scalar value sequence encoded by the given code unit sequence, or None if the given Unicode string + * is not well-formed. + */ + function DecodeCodeUnitSequenceChecked(s: CodeUnitSeq): (maybeVs: Option>) + ensures IsWellFormedCodeUnitSequence(s) ==> + && maybeVs.Some? + && maybeVs.Extract() == DecodeCodeUnitSequence(s) + ensures !IsWellFormedCodeUnitSequence(s) ==> && maybeVs.None? + { + // IsWellFormedCodeUnitSequence and DecodeCodeUnitSequence each call PartitionCodeUnitSequence, + // so for efficiency we avoid recomputing the partition in the by-method. + if IsWellFormedCodeUnitSequence(s) then Some(DecodeCodeUnitSequence(s)) + else None + } + by method { + var maybeParts := PartitionCodeUnitSequenceChecked(s); + if maybeParts.None? { + return None; + } + var parts := maybeParts.value; + var vs := Seqs.Map(DecodeMinimalWellFormedCodeUnitSubsequence, parts); + calc == { + s; + Seqs.Flatten(parts); + { assert parts == Seqs.Map(EncodeScalarValue, vs); } + Seqs.Flatten(Seqs.Map(EncodeScalarValue, vs)); + EncodeScalarSequence(vs); + } + return Some(vs); + } +} diff --git a/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/UnicodeStrings.dfy b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/UnicodeStrings.dfy new file mode 100644 index 00000000000..4af0f9b3b7f --- /dev/null +++ b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/UnicodeStrings.dfy @@ -0,0 +1,49 @@ +/******************************************************************************* + * Copyright by the contributors to the Dafny Project + * SPDX-License-Identifier: MIT + *******************************************************************************/ + +/** + * This abstract module is an interface for converting + * between the Dafny `string` type and sequences of UTF-8 and UTF-16 + * codepoints, where codepoints are represents as bounded ints + * (`uint8` and `uint16`). + * + * Because the `--unicode-char` option changes the meaning + * of the `char` type and hence the `string` type, + * this module must be implemented differently for each option value. + * Currently, the only available implementation is for `--unicode-char:true`, + * and the implementation for `--unicode-char:false` is upcoming. + * + * If you also want to maintain code that works for either `--unicode-char` value, + * implement your logic in an abstract module that imports this one. + * Then define a refining module for each `--unicode-char` value you wish to support, + * each of which imports the appropriate implementation of AbstractUnicodeStrings. + */ +abstract module DafnyStdLibs.Unicode.AbstractUnicodeStrings { + + import Collections.Seqs + + import opened Wrappers + import opened BoundedInts + + function ToUTF8Checked(s: string): Option> + + function ASCIIToUTF8(s: string): seq + requires forall i | 0 <= i < |s| :: 0 <= s[i] as int < 128 + { + Seqs.Map(c requires 0 <= c as int < 128 => c as uint8, s) + } + + function FromUTF8Checked(bs: seq): Option + + function ToUTF16Checked(s: string): Option> + + function ASCIIToUTF16(s: string): seq + requires forall i | 0 <= i < |s| :: 0 <= s[i] as int < 128 + { + Seqs.Map(c requires 0 <= c as int < 128 => c as uint16, s) + } + + function FromUTF16Checked(bs: seq): Option +} diff --git a/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/UnicodeStringsWithUnicodeChar.dfy b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/UnicodeStringsWithUnicodeChar.dfy new file mode 100644 index 00000000000..552373106b6 --- /dev/null +++ b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/UnicodeStringsWithUnicodeChar.dfy @@ -0,0 +1,84 @@ +/******************************************************************************* + * Copyright by the contributors to the Dafny Project + * SPDX-License-Identifier: MIT + *******************************************************************************/ + +/** + * A concrete implementation of `AbstractUnicodeStrings` for `--unicode-char:true`, + * defining functions for converting between strings and UTF-8/UTF-16. + * See the `AbstractUnicodeStrings` module for details. + */ +module DafnyStdLibs.Unicode.UnicodeStringsWithUnicodeChar refines AbstractUnicodeStrings { + + import Base + import Utf8EncodingForm + import Utf16EncodingForm + + lemma {:vcs_split_on_every_assert} CharIsUnicodeScalarValue(c: char) + ensures + && var asBits := c as bv24; + && asBits <= 0x10_FFFF + && (0 <= asBits < Base.HIGH_SURROGATE_MIN || Base.LOW_SURROGATE_MAX < asBits) + { + assert c as int < 0x11_0000; + // This seems to be just too expensive to verify for such a wide bit-vector type, + // but is clearly true given the above. + assume {:axiom} c as bv24 < 0x11_0000 as bv24; + var asBits := c as int as bv24; + assert (asBits < Base.HIGH_SURROGATE_MIN || asBits > Base.LOW_SURROGATE_MAX); + assert asBits <= 0x10_FFFF; + } + + lemma UnicodeScalarValueIsChar(sv: Base.ScalarValue) + ensures + && var asInt := sv as int; + && (0 <= asInt < 0xD800 || 0xE000 <= asInt < 0x11_0000) + { + var asInt := sv as int; + assert (asInt < 0xD800 || asInt > 0xDFFF); + assert (asInt < 0xDBFF || asInt > 0xDC00); + } + + function CharAsUnicodeScalarValue(c: char): Base.ScalarValue { + CharIsUnicodeScalarValue(c); + c as Base.ScalarValue + } + + function CharFromUnicodeScalarValue(sv: Base.ScalarValue): char { + UnicodeScalarValueIsChar(sv); + // TODO: Can we avoid the extra cast to int? + sv as int as char + } + + function ToUTF8Checked(s: string): Option> + ensures ToUTF8Checked(s).Some? + { + var asCodeUnits := Seqs.Map(CharAsUnicodeScalarValue, s); + var asUtf8CodeUnits := Utf8EncodingForm.EncodeScalarSequence(asCodeUnits); + var asBytes := Seqs.Map(cu => cu as uint8, asUtf8CodeUnits); + Some(asBytes) + } + + function FromUTF8Checked(bs: seq): Option { + var asCodeUnits := Seqs.Map(c => c as Utf8EncodingForm.CodeUnit, bs); + var utf32 :- Utf8EncodingForm.DecodeCodeUnitSequenceChecked(asCodeUnits); + var asChars := Seqs.Map(CharFromUnicodeScalarValue, utf32); + Some(asChars) + } + + function ToUTF16Checked(s: string): Option> + ensures ToUTF16Checked(s).Some? + { + var asCodeUnits := Seqs.Map(CharAsUnicodeScalarValue, s); + var asUtf16CodeUnits := Utf16EncodingForm.EncodeScalarSequence(asCodeUnits); + var asBytes := Seqs.Map(cu => cu as uint16, asUtf16CodeUnits); + Some(asBytes) + } + + function FromUTF16Checked(bs: seq): Option { + var asCodeUnits := Seqs.Map(c => c as Utf16EncodingForm.CodeUnit, bs); + var utf32 :- Utf16EncodingForm.DecodeCodeUnitSequenceChecked(asCodeUnits); + var asChars := Seqs.Map(CharFromUnicodeScalarValue, utf32); + Some(asChars) + } +} diff --git a/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/Utf16EncodingForm.dfy b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/Utf16EncodingForm.dfy new file mode 100644 index 00000000000..ed46cbd1fdb --- /dev/null +++ b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/Utf16EncodingForm.dfy @@ -0,0 +1,135 @@ +/******************************************************************************* + * Copyright by the contributors to the Dafny Project + * SPDX-License-Identifier: MIT + *******************************************************************************/ + +// Definition of the UTF-16 Unicode Encoding Form, as specified in Section 3.9 D91. +module DafnyStdLibs.Unicode.Utf16EncodingForm refines UnicodeEncodingForm { + type CodeUnit = bv16 + + // + // Definitions of well-formedness. + // + + function IsMinimalWellFormedCodeUnitSubsequence(s: CodeUnitSeq): (b: bool) + { + if |s| == 1 then IsWellFormedSingleCodeUnitSequence(s) + else if |s| == 2 then ( + var b := IsWellFormedDoubleCodeUnitSequence(s); + assert b ==> forall i | 0 < i < |s| :: !IsMinimalWellFormedCodeUnitSubsequence(s[..i]); + b + ) + else false + } + + function IsWellFormedSingleCodeUnitSequence(s: CodeUnitSeq): (b: bool) + requires |s| == 1 + { + var firstWord := s[0]; + || 0x0 <= firstWord <= 0xD7FF + || 0xE000 <= firstWord <= 0xFFFF + } + + function IsWellFormedDoubleCodeUnitSequence(s: CodeUnitSeq): (b: bool) + requires |s| == 2 + ensures b ==> !IsWellFormedSingleCodeUnitSequence(s[..1]) + { + var firstWord := s[0]; + var secondWord := s[1]; + && 0xD800 <= firstWord <= 0xDBFF + && 0xDC00 <= secondWord <= 0xDFFF + } + + function SplitPrefixMinimalWellFormedCodeUnitSubsequence(s: CodeUnitSeq): (maybePrefix: Option) + ensures |s| == 0 ==> maybePrefix.None? + ensures (exists i | 0 < i <= |s| :: IsMinimalWellFormedCodeUnitSubsequence(s[..i])) <==> + && maybePrefix.Some? + ensures maybePrefix.Some? ==> + && var prefix := maybePrefix.Extract(); + && 0 < |prefix| <= |s| + && prefix == s[..|prefix|] + && IsMinimalWellFormedCodeUnitSubsequence(prefix) + { + if |s| >= 1 && IsWellFormedSingleCodeUnitSequence(s[..1]) then Some(s[..1]) + else if |s| >= 2 && IsWellFormedDoubleCodeUnitSequence(s[..2]) then Some(s[..2]) + else None + } + + // + // Encoding and decoding. + // See Table 3-5. UTF-16 Bit Distribution. + // + + function EncodeScalarValue(v: ScalarValue): (m: MinimalWellFormedCodeUnitSeq) + { + if 0x0 <= v <= 0xD7FF || 0xE000 <= v <= 0xFFFF then EncodeScalarValueSingleWord(v) + else EncodeScalarValueDoubleWord(v) + } + + function EncodeScalarValueSingleWord(v: ScalarValue): (m: MinimalWellFormedCodeUnitSeq) + requires + || 0x0 <= v <= 0xD7FF + || 0xE000 <= v <= 0xFFFF + ensures |m| == 1 + ensures IsWellFormedSingleCodeUnitSequence(m) + { + var firstWord := v as CodeUnit; + [firstWord] + } + + function EncodeScalarValueDoubleWord(v: ScalarValue): (m: MinimalWellFormedCodeUnitSeq) + requires 0x10000 <= v <= 0x10FFFF + ensures |m| == 2 + ensures IsWellFormedDoubleCodeUnitSequence(m) + { + // v = 000u uuuu / xxxx xxxx / xxxx xxxx + // 1111 1122 2222 2222 + var x2 := (v & 0x3FF) as bv10; + var x1 := ((v & 0xFC00) >> 10) as bv6; + var u := ((v & 0x1F0000) >> 16) as bv5; + var w := (u - 1) as bv4; + // encoded = 1101 10ww / wwxx xxxx / 1101 11xx / xxxx xxxx + // 11 1111 22 2222 2222 + var firstWord := 0xD800 | ((w as CodeUnit) << 6) | x1 as CodeUnit; + var secondWord := 0xDC00 | x2 as CodeUnit; + [firstWord, secondWord] + } + + function DecodeMinimalWellFormedCodeUnitSubsequence(m: MinimalWellFormedCodeUnitSeq): (v: ScalarValue) + { + if |m| == 1 then DecodeMinimalWellFormedCodeUnitSubsequenceSingleWord(m) + else assert |m| == 2; DecodeMinimalWellFormedCodeUnitSubsequenceDoubleWord(m) + } + + function DecodeMinimalWellFormedCodeUnitSubsequenceSingleWord(m: MinimalWellFormedCodeUnitSeq): (v: ScalarValue) + requires |m| == 1 + ensures + || 0x0 <= v <= 0xD7FF + || 0xE000 <= v <= 0xFFFF + ensures EncodeScalarValueSingleWord(v) == m + { + var firstWord := m[0]; + var x := firstWord as bv16; + assert EncodeScalarValueSingleWord(x as ScalarValue) == m; + x as ScalarValue + } + + function + {:rlimit 1200} + DecodeMinimalWellFormedCodeUnitSubsequenceDoubleWord(m: MinimalWellFormedCodeUnitSeq): (v: ScalarValue) + requires |m| == 2 + ensures 0x10000 <= v <= 0x10FFFF + ensures EncodeScalarValueDoubleWord(v) == m + { + var firstWord := m[0]; + var secondWord := m[1]; + var x2 := (secondWord & 0x3FF) as bv24; + var x1 := (firstWord & 0x3F) as bv24; + var w := ((firstWord & 0x3C0) >> 6) as bv24; + var u := (w + 1) as bv24; + var v := (u << 16) | (x1 << 10) | x2 as ScalarValue; + assert {:split_here} true; + assert EncodeScalarValueDoubleWord(v) == m; + v + } +} diff --git a/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/Utf8EncodingForm.dfy b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/Utf8EncodingForm.dfy new file mode 100644 index 00000000000..7508abc395d --- /dev/null +++ b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/Utf8EncodingForm.dfy @@ -0,0 +1,252 @@ +/******************************************************************************* + * Copyright by the contributors to the Dafny Project + * SPDX-License-Identifier: MIT + *******************************************************************************/ + +/** + * The Unicode encoding form that assigns each Unicode scalar value to an unsigned byte sequence of one to four bytes + * in length, as specified in Table 3-6 and Table 3-7. + */ +module DafnyStdLibs.Unicode.Utf8EncodingForm refines UnicodeEncodingForm { + type CodeUnit = bv8 + + // + // Definitions of well-formedness. + // + + function IsMinimalWellFormedCodeUnitSubsequence(s: CodeUnitSeq): (b: bool) + { + if |s| == 1 then ( + var b := IsWellFormedSingleCodeUnitSequence(s); + assert b ==> forall i | 0 < i < |s| :: !IsMinimalWellFormedCodeUnitSubsequence(s[..i]); + b + ) + else if |s| == 2 then ( + var b := IsWellFormedDoubleCodeUnitSequence(s); + assert b ==> forall i | 0 < i < |s| :: !IsMinimalWellFormedCodeUnitSubsequence(s[..i]); + b + ) + else if |s| == 3 then ( + var b := IsWellFormedTripleCodeUnitSequence(s); + assert b ==> forall i | 0 < i < |s| :: !IsMinimalWellFormedCodeUnitSubsequence(s[..i]); + b + ) + else if |s| == 4 then ( + var b := IsWellFormedQuadrupleCodeUnitSequence(s); + assert b ==> forall i | 0 < i < |s| :: !IsMinimalWellFormedCodeUnitSubsequence(s[..i]); + b + ) + else false + } + + function IsWellFormedSingleCodeUnitSequence(s: CodeUnitSeq): (b: bool) + requires |s| == 1 + { + var firstByte := s[0]; + && 0x00 <= firstByte <= 0x7F + } + + function IsWellFormedDoubleCodeUnitSequence(s: CodeUnitSeq): (b: bool) + requires |s| == 2 + ensures b ==> + && !IsWellFormedSingleCodeUnitSequence(s[..1]) + { + var firstByte := s[0]; + var secondByte := s[1]; + && 0xC2 <= firstByte <= 0xDF + && 0x80 <= secondByte <= 0xBF + } + + function IsWellFormedTripleCodeUnitSequence(s: CodeUnitSeq): (b: bool) + requires |s| == 3 + ensures b ==> + && !IsWellFormedSingleCodeUnitSequence(s[..1]) + && !IsWellFormedDoubleCodeUnitSequence(s[..2]) + { + var firstByte := s[0]; + var secondByte := s[1]; + var thirdByte := s[2]; + && ( + || (firstByte == 0xE0 && 0xA0 <= secondByte <= 0xBF) + || (0xE1 <= firstByte <= 0xEC && 0x80 <= secondByte <= 0xBF) + || (firstByte == 0xED && 0x80 <= secondByte <= 0x9F) + || (0xEE <= firstByte <= 0xEF && 0x80 <= secondByte <= 0xBF) + ) + && 0x80 <= thirdByte <= 0xBF + } + + function IsWellFormedQuadrupleCodeUnitSequence(s: CodeUnitSeq): (b: bool) + requires |s| == 4 + ensures b ==> + && !IsWellFormedSingleCodeUnitSequence(s[..1]) + && !IsWellFormedDoubleCodeUnitSequence(s[..2]) + && !IsWellFormedTripleCodeUnitSequence(s[..3]) + { + var firstByte := s[0]; + var secondByte := s[1]; + var thirdByte := s[2]; + var fourthByte := s[3]; + && ( + || (firstByte == 0xF0 && 0x90 <= secondByte <= 0xBF) + || (0xF1 <= firstByte <= 0xF3 && 0x80 <= secondByte <= 0xBF) + || (firstByte == 0xF4 && 0x80 <= secondByte <= 0x8F) + ) + && 0x80 <= thirdByte <= 0xBF + && 0x80 <= fourthByte <= 0xBF + } + + function SplitPrefixMinimalWellFormedCodeUnitSubsequence(s: CodeUnitSeq): + (maybePrefix: Option) + { + if |s| >= 1 && IsWellFormedSingleCodeUnitSequence(s[..1]) then Some(s[..1]) + else if |s| >= 2 && IsWellFormedDoubleCodeUnitSequence(s[..2]) then Some(s[..2]) + else if |s| >= 3 && IsWellFormedTripleCodeUnitSequence(s[..3]) then Some(s[..3]) + else if |s| >= 4 && IsWellFormedQuadrupleCodeUnitSequence(s[..4]) then Some(s[..4]) + else None + } + + // + // Encoding and decoding. + // See Table 3-6. UTF-8 Bit Distribution of the Unicode Standard 14.0. + // + + function EncodeScalarValue(v: ScalarValue): (m: MinimalWellFormedCodeUnitSeq) + { + if v <= 0x7F then EncodeScalarValueSingleByte(v) + else if v <= 0x7FF then EncodeScalarValueDoubleByte(v) + else if v <= 0xFFFF then EncodeScalarValueTripleByte(v) + else EncodeScalarValueQuadrupleByte(v) + } + + function EncodeScalarValueSingleByte(v: ScalarValue): (m: MinimalWellFormedCodeUnitSeq) + requires 0 <= v <= 0x7F + ensures |m| == 1 + ensures IsWellFormedSingleCodeUnitSequence(m) + { + // v = 0000 0000 / 0xxx xxxx + var x := (v & 0x7F) as bv7; + // encoded = 0xxx xxxx + var firstByte := x as CodeUnit; + [firstByte] + } + + function EncodeScalarValueDoubleByte(v: ScalarValue): (s: CodeUnitSeq) + requires 0x80 <= v <= 0x7FF + ensures |s| == 2 + ensures IsWellFormedDoubleCodeUnitSequence(s) + { + // v = 0000 0yyy / yyxx xxxx + var x := (v & 0x3F) as bv6; + var y := ((v & 0x7C0) >> 6) as bv5; + // encoded = 110y yyyy / 10xx xxxx + var firstByte := 0xC0 | y as CodeUnit; + var secondByte := 0x80 | x as CodeUnit; + [firstByte, secondByte] + } + + function EncodeScalarValueTripleByte(v: ScalarValue): (s: CodeUnitSeq) + requires 0x800 <= v <= 0xFFFF + ensures |s| == 3 + ensures IsWellFormedTripleCodeUnitSequence(s) + { + // v = zzzz yyyy / yyxx xxxx + var x := (v & 0x3F) as bv6; + var y := ((v & 0xFC0) >> 6) as bv6; + var z := ((v & 0xF000) >> 12) as bv4; + // encoded = 1110 zzzz / 10yy yyyy / 10xx xxxx + var firstByte := 0xE0 | z as CodeUnit; + var secondByte := 0x80 | y as CodeUnit; + var thirdByte := 0x80 | x as CodeUnit; + [firstByte, secondByte, thirdByte] + } + + function EncodeScalarValueQuadrupleByte(v: ScalarValue): (s: CodeUnitSeq) + requires 0x10000 <= v <= 0x10FFFF + ensures |s| == 4 + ensures IsWellFormedQuadrupleCodeUnitSequence(s) + { + // v = 000u uuuu / zzzz yyyy / yyxx xxxx + // 1 1122 + assert v <= 0x1FFFFF; + var x := (v & 0x3F) as bv6; + var y := ((v & 0xFC0) >> 6) as bv6; + var z := ((v & 0xF000) >> 12) as bv4; + var u2 := ((v & 0x30000) >> 16) as bv2; + var u1 := ((v & 0x1C0000) >> 18) as bv3; + + // encoded = 1111 0uuu / 10uu zzzz / 10yy yyyy / 10xx xxxx + // 111 22 + var firstByte := 0xF0 | u1 as CodeUnit; + var secondByte := 0x80 | ((u2 as CodeUnit) << 4) | z as CodeUnit; + var thirdByte := 0x80 | y as CodeUnit; + var fourthByte := 0x80 | x as CodeUnit; + [firstByte, secondByte, thirdByte, fourthByte] + } + + function DecodeMinimalWellFormedCodeUnitSubsequence(m: MinimalWellFormedCodeUnitSeq): (v: ScalarValue) + { + if |m| == 1 then DecodeMinimalWellFormedCodeUnitSubsequenceSingleByte(m) + else if |m| == 2 then DecodeMinimalWellFormedCodeUnitSubsequenceDoubleByte(m) + else if |m| == 3 then DecodeMinimalWellFormedCodeUnitSubsequenceTripleByte(m) + else assert |m| == 4; DecodeMinimalWellFormedCodeUnitSubsequenceQuadrupleByte(m) + } + + function DecodeMinimalWellFormedCodeUnitSubsequenceSingleByte(m: MinimalWellFormedCodeUnitSeq): (v: ScalarValue) + requires |m| == 1 + ensures 0 <= v <= 0x7F + ensures EncodeScalarValueSingleByte(v) == m + { + var firstByte := m[0]; + var x := firstByte as bv7; + x as ScalarValue + } + + function DecodeMinimalWellFormedCodeUnitSubsequenceDoubleByte(m: MinimalWellFormedCodeUnitSeq): (v: ScalarValue) + requires |m| == 2 + ensures 0x80 <= v <= 0x7FF + ensures EncodeScalarValueDoubleByte(v) == m + { + var firstByte := m[0]; + var secondByte := m[1]; + var y := (firstByte & 0x1F) as bv24; + var x := (secondByte & 0x3F) as bv24; + (y << 6) | x as ScalarValue + } + + function + {:rlimit 115000} + DecodeMinimalWellFormedCodeUnitSubsequenceTripleByte(m: MinimalWellFormedCodeUnitSeq): (v: ScalarValue) + requires |m| == 3 + ensures 0x800 <= v <= 0xFFFF + ensures EncodeScalarValueTripleByte(v) == m + { + var firstByte := m[0]; + var secondByte := m[1]; + var thirdByte := m[2]; + var z := (firstByte & 0xF) as bv24; + var y := (secondByte & 0x3F) as bv24; + var x := (thirdByte & 0x3F) as bv24; + assert {:split_here} true; + (z << 12) | (y << 6) | x as ScalarValue + } + + function + {:rlimit 4000} + DecodeMinimalWellFormedCodeUnitSubsequenceQuadrupleByte(m: MinimalWellFormedCodeUnitSeq): (v: ScalarValue) + requires |m| == 4 + ensures 0x10000 <= v <= 0x10FFFF + ensures EncodeScalarValueQuadrupleByte(v) == m + { + var firstByte := m[0]; + var secondByte := m[1]; + var thirdByte := m[2]; + var fourthByte := m[3]; + var u1 := (firstByte & 0x7) as bv24; + var u2 := ((secondByte & 0x30) >> 4) as bv24; + var z := (secondByte & 0xF) as bv24; + var y := (thirdByte & 0x3F) as bv24; + var x := (fourthByte & 0x3F) as bv24; + assert {:split_here} true; + (u1 << 18) | (u2 << 16) | (z << 12) | (y << 6) | x as ScalarValue + } +} diff --git a/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/Utf8EncodingScheme.dfy b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/Utf8EncodingScheme.dfy new file mode 100644 index 00000000000..1d886dec31e --- /dev/null +++ b/Source/DafnyStandardLibraries/src/DafnyStdLibs/Unicode/Utf8EncodingScheme.dfy @@ -0,0 +1,73 @@ +/******************************************************************************* + * Copyright by the contributors to the Dafny Project + * SPDX-License-Identifier: MIT + *******************************************************************************/ + +/** + * The Unicode encoding scheme that serializes a UTF-8 code unit sequence in exactly the same order as the code unit + * sequence itself. + * + * Because the UTF-8 encoding form deals in ordered byte sequences, the UTF-8 encoding scheme is trivial. + * The byte ordering is completely defined by the UTF-8 code unit sequence itself. + * We implement the encoding scheme here for completeness of the Unicode character encoding model, + * and to perform the (trivial) conversion between `uint8`/`byte` and `bv8` values. + * + * (Section 3.10 D95) + * + * TODO: this should refine an abstract UnicodeEncodingScheme module + * that states lemmas/conditions about Serialize and Deserialize + * which refining modules would prove about their own implementations. + */ +module DafnyStdLibs.Unicode.Utf8EncodingScheme { + import opened Wrappers + + import BoundedInts + import Collections.Seqs + import Utf8EncodingForm + + type byte = BoundedInts.uint8 + + /** + * Returns the byte serialization of the given code unit sequence. + */ + function Serialize(s: Utf8EncodingForm.CodeUnitSeq): (b: seq) + { + Seqs.Map(c => c as byte, s) + } + + /** + * Returns the code unit sequence that serializes to the given byte sequence. + */ + function Deserialize(b: seq): (s: Utf8EncodingForm.CodeUnitSeq) + { + Seqs.Map(b => b as Utf8EncodingForm.CodeUnit, b) + } + + /** + * Serializing a code unit sequence and then deserializing the result, yields the original code unit sequence. + */ + lemma LemmaSerializeDeserialize(s: Utf8EncodingForm.CodeUnitSeq) + ensures Deserialize(Serialize(s)) == s + {} + + /** + * Deserializing a byte sequence and then serializing the result, yields the original byte sequence. + */ + lemma + {:rlimit 3000} + LemmaDeserializeSerialize(b: seq) + ensures Serialize(Deserialize(b)) == b + { + calc { + Serialize(Deserialize(b)); + == // Definitions of Serialize, Deserialize + Seqs.Map(c => c as byte, Seqs.Map(b => b as Utf8EncodingForm.CodeUnit, b)); + == // Compositionality of Map + Seqs.Map(b => (b as Utf8EncodingForm.CodeUnit) as byte, b); + == // Simplify map + Seqs.Map(b => b, b); + == // Identity function + b; + } + } +} diff --git a/Source/DafnyStandardLibraries/src/dfyconfig.toml b/Source/DafnyStandardLibraries/src/dfyconfig.toml index b530649891e..04205750646 100644 --- a/Source/DafnyStandardLibraries/src/dfyconfig.toml +++ b/Source/DafnyStandardLibraries/src/dfyconfig.toml @@ -13,5 +13,6 @@ reads-clauses-on-methods = true # of the libraries themselves. resource-limit = 1000 warn-as-errors = true -warn-redundant-assumptions = true -warn-contradictory-assumptions = true \ No newline at end of file +# These give too many false positives right now, but should be enabled in the future +warn-redundant-assumptions = false +warn-contradictory-assumptions = false