From 70405da6724bdbdab573972c2d873fa5f8b133d4 Mon Sep 17 00:00:00 2001 From: cheatfate Date: Wed, 30 Sep 2020 10:14:40 +0300 Subject: [PATCH 1/9] Initial commit. --- stew/conio.nim | 340 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 340 insertions(+) create mode 100644 stew/conio.nim diff --git a/stew/conio.nim b/stew/conio.nim new file mode 100644 index 00000000..cad1e4e7 --- /dev/null +++ b/stew/conio.nim @@ -0,0 +1,340 @@ +## Copyright (c) 2020 Status Research & Development GmbH +## Licensed under either of +## * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE)) +## * MIT license ([LICENSE-MIT](LICENSE-MIT)) +## at your option. +## This file may not be copied, modified, or distributed except according to +## those terms. + +## This module implements cross-platform console procedures. +import io2 +export io2 + +when defined(windows): + proc setConsoleOutputCP(wCodePageID: cuint): int32 {. + importc: "SetConsoleOutputCP", stdcall, dynlib: "kernel32", sideEffect.} + proc setConsoleCP(wCodePageID: cuint): int32 {. + importc: "SetConsoleCP", stdcall, dynlib: "kernel32", sideEffect.} + proc getConsoleCP(): cuint {. + importc: "GetConsoleCP", stdcall, dynlib: "kernel32", sideEffect.} + proc getConsoleOutputCP(): cuint {. + importc: "GetConsoleOutputCP", stdcall, dynlib: "kernel32", sideEffect.} + proc setConsoleMode(hConsoleHandle: uint, dwMode: uint32): int32 {. + importc: "SetConsoleMode", stdcall, dynlib: "kernel32", sideEffect.} + proc getConsoleMode(hConsoleHandle: uint, dwMode: var uint32): int32 {. + importc: "GetConsoleMode", stdcall, dynlib: "kernel32", sideEffect.} + proc readConsole(hConsoleInput: uint, lpBuffer: pointer, + nNumberOfCharsToRead: uint32, + lpNumberOfCharsRead: var uint32, + pInputControl: pointer): int32 {. 
+ importc: "ReadConsoleW", stdcall, dynlib: "kernel32", sideEffect.} + proc readFile(hFile: uint, lpBuffer: pointer, + nNumberOfBytesToRead: uint32, + lpNumberOfBytesRead: var uint32, + lpOverlapped: pointer): int32 {. + importc: "ReadFile", dynlib: "kernel32", stdcall, sideEffect.} + proc writeConsole(hConsoleOutput: uint, lpBuffer: pointer, + nNumberOfCharsToWrite: uint32, + lpNumberOfCharsWritten: var uint32, + lpReserved: pointer): int32 {. + importc: "WriteConsoleW", stdcall, dynlib: "kernel32", sideEffect.} + proc writeFile(hFile: uint, lpBuffer: pointer, + nNumberOfBytesToWrite: uint32, + lpNumberOfBytesWritten: var uint32, + lpOverlapped: pointer): int32 {. + importc: "WriteFile", dynlib: "kernel32", stdcall, sideEffect.} + proc getStdHandle(nStdHandle: uint32): uint {. + importc: "GetStdHandle", stdcall, dynlib: "kernel32", sideEffect.} + proc wideCharToMultiByte(codePage: cuint, dwFlags: uint32, + lpWideCharStr: ptr Utf16Char, cchWideChar: cint, + lpMultiByteStr: ptr char, cbMultiByte: cint, + lpDefaultChar: pointer, + lpUsedDefaultChar: pointer): cint {. + importc: "WideCharToMultiByte", stdcall, dynlib: "kernel32", sideEffect.} + + const + CP_UTF8 = 65001'u32 + STD_INPUT_HANDLE = cast[uint32](-10) + STD_OUTPUT_HANDLE = cast[uint32](-11) + INVALID_HANDLE_VALUE = cast[uint](-1) + ENABLE_PROCESSED_INPUT = 0x0001'u32 + ENABLE_ECHO_INPUT = 0x0004'u32 + ERROR_INVALID_HANDLE = 0x0006'u32 + + proc isConsoleRedirected(hConsole: uint): bool = + ## Returns ``true`` if console handle was redirected. 
+ var mode: uint32 + let res = getConsoleMode(hConsole, mode) + if res == 0: + let errCode = ioLastError() + if errCode == ERROR_INVALID_HANDLE: + true + else: + false + else: + false + + proc readConsoleInput(maxBytes: int): IoResult[string] = + let hConsoleInput = + block: + let res = getStdHandle(STD_INPUT_HANDLE) + if res == INVALID_HANDLE_VALUE: + return err(ioLastError()) + res + + let prevInputCP = + block: + let res = getConsoleCP() + if res == cuint(0): + return err(ioLastError()) + res + + if isConsoleRedirected(hConsoleInput): + # Console STDIN is redirected, we should use ReadFile(), because + # ReadConsole() is not working for such types of STDIN. + if setConsoleCP(CP_UTF8) == 0'i32: + return err(ioLastError()) + + # Allocating buffer with size equal to `maxBytes` + len(CRLF) + var buffer = newString(maxBytes + 2) + let bytesToRead = uint32(len(buffer)) + var bytesRead: uint32 + let rres = readFile(hConsoleInput, cast[pointer](addr buffer[0]), + bytesToRead, bytesRead, nil) + if rres == 0: + let errCode = ioLastError() + discard setConsoleCP(prevInputCP) + return err(errCode) + + if setConsoleCP(prevInputCP) == 0'i32: + return err(ioLastError()) + + # Truncate additional bytes from buffer. + buffer.setLen(int(min(bytesRead, uint32(maxBytes)))) + + # Trim CR/CRLF from buffer. + if len(buffer) > 0: + if buffer[^1] == char(0x0A): + if len(buffer) > 1: + if buffer[^2] == char(0x0D): + buffer.setLen(len(buffer) - 2) + else: + buffer.setLen(len(buffer) - 1) + else: + buffer.setLen(len(buffer) - 1) + elif buffer[^1] == char(0x0D): + buffer.setLen(len(buffer) - 1) + ok(buffer) + else: + let prevMode = + block: + var mode: uint32 + let res = getConsoleMode(hConsoleInput, mode) + if res == 0: + return err(ioLastError()) + mode + + var newMode = prevMode or ENABLE_PROCESSED_INPUT + newMode = newMode and not(ENABLE_ECHO_INPUT) + + # Change console CodePage to allow UTF-8 strings input. 
+ if setConsoleCP(CP_UTF8) == 0'i32: + return err(ioLastError()) + + # Disable local echo output. + let mres = setConsoleMode(hConsoleInput, newMode) + if mres == 0: + let errCode = ioLastError() + discard setConsoleCP(prevInputCP) + return err(errCode) + + # Allocating buffer with size equal to `maxBytes` + len(CRLF) + var buffer = newSeq[Utf16Char](maxBytes + 2) + let charsToRead = uint32(len(buffer)) + var charsRead: uint32 + let rres = readConsole(hConsoleInput, cast[pointer](addr buffer[0]), + charsToRead, charsRead, nil) + if rres == 0'i32: + let errCode = ioLastError() + discard setConsoleMode(hConsoleInput, prevMode) + discard setConsoleCP(prevInputCP) + return err(errCode) + + # Restore local echo output. + if setConsoleMode(hConsoleInput, prevMode) == 0'i32: + let errCode = ioLastError() + discard setConsoleCP(prevInputCP) + return err(errCode) + + # Restore previous console CodePage. + if setConsoleCP(prevInputCP) == 0'i32: + return err(ioLastError()) + + # Truncate additional bytes from buffer. + buffer.setLen(int(min(charsRead, uint32(maxBytes)))) + # Truncate CRLF in result wide string. + if len(buffer) > 0: + if int16(buffer[^1]) == int16(0x0A): + if len(buffer) > 1: + if int16(buffer[^2]) == int16(0x0D): + buffer.setLen(len(buffer) - 2) + else: + buffer.setLen(len(buffer) - 1) + else: + buffer.setLen(len(buffer) - 1) + elif int16(buffer[^1]) == int16(0x0D): + buffer.setLen(len(buffer) - 1) + + # Convert Windows UTF-16 encoded string to UTF-8 encoded string. 
+ if len(buffer) > 0: + var pwd = "" + let bytesNeeded = wideCharToMultiByte(CP_UTF8, 0'u32, addr buffer[0], + cint(len(buffer)), nil, + cint(0), nil, nil) + if bytesNeeded <= cint(0): + return err(ioLastError()) + pwd.setLen(bytesNeeded) + let cres = wideCharToMultiByte(CP_UTF8, 0'u32, addr buffer[0], + cint(len(buffer)), addr pwd[0], + cint(len(pwd)), nil, nil) + if cres == cint(0): + return err(ioLastError()) + ok(pwd) + else: + ok("") + + proc writeConsoleOutput(data: string): IoResult[void] = + if len(data) == 0: + return ok() + + let hConsoleOutput = + block: + let res = getStdHandle(STD_OUTPUT_HANDLE) + if res == INVALID_HANDLE_VALUE: + return err(ioLastError()) + res + + let prevOutputCP = + block: + let res = getConsoleOutputCP() + if res == cuint(0): + return err(ioLastError()) + res + + if isConsoleRedirected(hConsoleOutput): + # If STDOUT is redirected we should use WriteFile() because WriteConsole() + # is not working for such types of STDOUT. + if setConsoleOutputCP(CP_UTF8) == 0'i32: + return err(ioLastError()) + + let bytesToWrite = uint32(len(data)) + var bytesWritten: uint32 + let wres = writeFile(hConsoleOutput, cast[pointer](unsafeAddr data[0]), + bytesToWrite, bytesWritten, nil) + if wres == 0'i32: + let errCode = ioLastError() + discard setConsoleOutputCP(prevOutputCP) + return err(errCode) + + if setConsoleOutputCP(prevOutputCP) == 0'i32: + return err(ioLastError()) + else: + if setConsoleOutputCP(CP_UTF8) == 0'i32: + return err(ioLastError()) + + let widePrompt = newWideCString(data) + var charsWritten: uint32 + let wres = writeConsole(hConsoleOutput, cast[pointer](widePrompt), + uint32(len(widePrompt)), charsWritten, nil) + if wres == 0'i32: + let errCode = ioLastError() + discard setConsoleOutputCP(prevOutputCP) + return err(errCode) + + if setConsoleOutputCP(prevOutputCP) == 0'i32: + return err(ioLastError()) + ok() + +elif defined(posix): + import posix, termios + + proc isConsoleRedirected(consoleFd: cint): bool = + ## Returns ``true`` 
if console handle was redirected. + var mode: Termios + if tcGetAttr(consoleFd, addr mode) != cint(0): + let errCode = ioLastError() + if errCode == ENOTTY: + true + else: + false + else: + false + + proc writeConsoleOutput(prompt: string): IoResult[void] = + if len(prompt) == 0: + ok() + else: + let res = posix.write(STDOUT_FILENO, cast[pointer](unsafeAddr prompt[0]), + len(prompt)) + if res != len(prompt): + err(ioLastError()) + else: + ok() + + proc readConsoleInput(maxBytes: int): IoResult[string] = + # Allocating buffer with size equal to `maxBytes` + len(LF) + var buffer = newString(maxBytes + 1) + let bytesRead = + if isConsoleRedirected(STDIN_FILENO): + let res = posix.read(STDIN_FILENO, cast[pointer](addr buffer[0]), + len(buffer)) + if res < 0: + return err(ioLastError()) + res + else: + var cur, old: Termios + if tcGetAttr(STDIN_FILENO, addr cur) != cint(0): + return err(ioLastError()) + + old = cur + cur.c_lflag = cur.c_lflag and not(Cflag(ECHO)) + + if tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(cur)) != cint(0): + return err(ioLastError()) + + let res = read(STDIN_FILENO, cast[pointer](addr buffer[0]), + len(buffer)) + if res < 0: + discard tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(old)) + return err(ioLastError()) + + if tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(old)) != cint(0): + return err(ioLastError()) + res + + # Truncate additional bytes from buffer. + buffer.setLen(min(maxBytes, bytesRead)) + # Trim LF in result string + if len(buffer) > 0: + if buffer[^1] == char(0x0A): + buffer.setLen(len(buffer) - 1) + ok(buffer) + +proc readConsolePassword*(prompt: string, + maxBytes = 32768): IoResult[string] = + ## Reads a password from stdin without printing it with length in bytes up to + ## ``maxBytes``. + ## + ## This procedure supports reading of UTF-8 encoded passwords from console or + ## redirected pipe. But ``maxBytes`` will limit + ## + ## Before reading password ``prompt`` will be printed. 
+ ## + ## Please note that ``maxBytes`` should be in range (0, 32768]. + doAssert(maxBytes > 0 and maxBytes <= 32768, + "maxBytes should be integer in (0, 32768]") + ? writeConsoleOutput(prompt) + let res = ? readConsoleInput(maxBytes) + # `\p` is platform specific newline: CRLF on Windows, LF on Unix + ? writeConsoleOutput("\p") + ok(res) From cb44a0db40bcde9a5bfc24626f3cef7f5e3c33bd Mon Sep 17 00:00:00 2001 From: cheatfate Date: Wed, 30 Sep 2020 11:21:31 +0300 Subject: [PATCH 2/9] Fix error handler. --- stew/conio.nim | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stew/conio.nim b/stew/conio.nim index cad1e4e7..c8d9cf91 100644 --- a/stew/conio.nim +++ b/stew/conio.nim @@ -305,8 +305,9 @@ elif defined(posix): let res = read(STDIN_FILENO, cast[pointer](addr buffer[0]), len(buffer)) if res < 0: + let errCode = ioLastError() discard tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(old)) - return err(ioLastError()) + return err(errCode) if tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(old)) != cint(0): return err(ioLastError()) From 4363df120fb572877512786bdb2f28e841a8886f Mon Sep 17 00:00:00 2001 From: cheatfate Date: Wed, 30 Sep 2020 16:19:51 +0300 Subject: [PATCH 3/9] Use proper method of console identification (msvcrt). --- stew/conio.nim | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/stew/conio.nim b/stew/conio.nim index c8d9cf91..7f7c3bab 100644 --- a/stew/conio.nim +++ b/stew/conio.nim @@ -51,6 +51,8 @@ when defined(windows): lpDefaultChar: pointer, lpUsedDefaultChar: pointer): cint {. importc: "WideCharToMultiByte", stdcall, dynlib: "kernel32", sideEffect.} + proc getFileType(hFile: uint): uint32 {.
+ importc: "GetFileType", stdcall, dynlib: "kernel32", sideEffect.} const CP_UTF8 = 65001'u32 @@ -59,20 +61,17 @@ when defined(windows): INVALID_HANDLE_VALUE = cast[uint](-1) ENABLE_PROCESSED_INPUT = 0x0001'u32 ENABLE_ECHO_INPUT = 0x0004'u32 - ERROR_INVALID_HANDLE = 0x0006'u32 + FILE_TYPE_CHAR = 0x0002'u32 proc isConsoleRedirected(hConsole: uint): bool = ## Returns ``true`` if console handle was redirected. - var mode: uint32 - let res = getConsoleMode(hConsole, mode) - if res == 0: - let errCode = ioLastError() - if errCode == ERROR_INVALID_HANDLE: - true - else: - false - else: + let res = getFileType(hConsole) + if res == FILE_TYPE_CHAR: + # The specified handle is a character device, typically an LPT device or a + # console. false + else: + true proc readConsoleInput(maxBytes: int): IoResult[string] = let hConsoleInput = From 6ee4974c7f884fefd61f8c61355a4fe84de7602d Mon Sep 17 00:00:00 2001 From: cheatfate Date: Wed, 30 Sep 2020 16:34:10 +0300 Subject: [PATCH 4/9] Use proper method of console identification (glibc). --- stew/conio.nim | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/stew/conio.nim b/stew/conio.nim index 7f7c3bab..6ed3418c 100644 --- a/stew/conio.nim +++ b/stew/conio.nim @@ -260,12 +260,9 @@ elif defined(posix): proc isConsoleRedirected(consoleFd: cint): bool = ## Returns ``true`` if console handle was redirected. var mode: Termios + # This is how `isatty()` checks for TTY. if tcGetAttr(consoleFd, addr mode) != cint(0): - let errCode = ioLastError() - if errCode == ENOTTY: - true - else: - false + true else: false From cc001fa88f23133f2b279fb138caa0580c77fe0d Mon Sep 17 00:00:00 2001 From: cheatfate Date: Wed, 30 Sep 2020 18:15:56 +0300 Subject: [PATCH 5/9] Make isConsoleRedirected() public API. 
--- stew/conio.nim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stew/conio.nim b/stew/conio.nim index 6ed3418c..8e78dc39 100644 --- a/stew/conio.nim +++ b/stew/conio.nim @@ -63,7 +63,7 @@ when defined(windows): ENABLE_ECHO_INPUT = 0x0004'u32 FILE_TYPE_CHAR = 0x0002'u32 - proc isConsoleRedirected(hConsole: uint): bool = + proc isConsoleRedirected*(hConsole: uint): bool = ## Returns ``true`` if console handle was redirected. let res = getFileType(hConsole) if res == FILE_TYPE_CHAR: @@ -257,7 +257,7 @@ when defined(windows): elif defined(posix): import posix, termios - proc isConsoleRedirected(consoleFd: cint): bool = + proc isConsoleRedirected*(consoleFd: cint): bool = ## Returns ``true`` if console handle was redirected. var mode: Termios # This is how `isatty()` checks for TTY. From b0bbeb49d2380023f4da7da8f2b61fa6c13ae10b Mon Sep 17 00:00:00 2001 From: cheatfate Date: Sun, 4 Oct 2020 19:33:13 +0300 Subject: [PATCH 6/9] Add UTF-8 validation procedure. --- stew/utf8.nim | 91 +++++++++++++++++++++ tests/all_tests.nim | 3 +- tests/test_utf8.nim | 193 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 286 insertions(+), 1 deletion(-) create mode 100644 stew/utf8.nim create mode 100644 tests/test_utf8.nim diff --git a/stew/utf8.nim b/stew/utf8.nim new file mode 100644 index 00000000..e2b8599a --- /dev/null +++ b/stew/utf8.nim @@ -0,0 +1,91 @@ +## Copyright (c) 2020 Status Research & Development GmbH +## Licensed under either of +## * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE)) +## * MIT license ([LICENSE-MIT](LICENSE-MIT)) +## at your option. +## This file may not be copied, modified, or distributed except according to +## those terms. + +## This module implements UTF-8 related procedures. + +proc validateUtf8*[T: byte|char](data: openarray[T]): bool = + ## Returns ``true`` if ``data`` is correctly UTF-8 encoded string. 
+ var index = 0 + + while true: + let byte1 = + block: + var b: byte + while true: + if index >= len(data): + return true + b = when T is byte: data[index] else: byte(data[index]) + inc(index) + if b >= 0x80'u8: + break + b + + if (byte1 and 0xE0'u8) == 0xC0'u8: + # Two-byte form (110xxxxx 10xxxxxx) + if index >= len(data): + return false + # overlong sequence test + if (byte1 and 0xFE'u8) == 0xC0'u8: + return false + + let byte2 = when T is byte: data[index] else: byte(data[index]) + if (byte2 and 0xC0'u8) != 0x80'u8: + return false + inc(index) + + elif (byte1 and 0xF0'u8) == 0xE0'u8: + # Three-byte form (1110xxxx 10xxxxxx 10xxxxxx) + if (index + 1) >= len(data): + return false + + let byte2 = when T is byte: data[index] else: byte(data[index]) + if (byte2 and 0xC0'u8) != 0x80'u8: + return false + # overlong sequence test + if (byte1 == 0xE0'u8) and ((byte2 and 0xE0'u8) == 0x80'u8): + return false + # 0xD800–0xDFFF (UTF-16 surrogates) test + if (byte1 == 0xED'u8) and ((byte2 and 0xE0'u8) == 0xA0'u8): + return false + + let byte3 = when T is byte: data[index + 1] else: byte(data[index + 1]) + if (byte3 and 0xC0'u8) != 0x80'u8: + return false + # U+FFFE or U+FFFF test + if (byte1 == 0xEF'u8) and (byte2 == 0xBF'u8) and + ((byte3 and 0xFE'u8) == 0xBE'u8): + return false + inc(index, 2) + + elif (byte1 and 0xF8'u8) == 0xF0'u8: + # Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) + if (index + 2) >= len(data): + return false + + let byte2 = when T is byte: data[index] else: byte(data[index]) + if (byte2 and 0xC0'u8) != 0x80'u8: + return false + # overlong sequence test + if (byte1 == 0xF0'u8) and ((byte2 and 0xF0'u8) == 0x80'u8): + return false + # According to RFC 3629 no point above U+10FFFF should be used, which + # limits characters to four bytes. 
+ if ((byte1 == 0xF4'u8) and (byte2 > 0x8F'u8)) or (byte1 > 0xF4'u8): + return false + + let byte3 = when T is byte: data[index + 1] else: byte(data[index + 1]) + if (byte3 and 0xC0'u8) != 0x80'u8: + return false + + let byte4 = when T is byte: data[index + 2] else: byte(data[index + 2]) + if (byte4 and 0xC0'u8) != 0x80'u8: + return false + inc(index, 3) + + else: + return false diff --git a/tests/all_tests.nim b/tests/all_tests.nim index 7b4f42c1..eeba4089 100644 --- a/tests/all_tests.nim +++ b/tests/all_tests.nim @@ -23,4 +23,5 @@ import test_varints, test_ctops, test_io2, - test_winacl \ No newline at end of file + test_winacl, + test_utf8 diff --git a/tests/test_utf8.nim b/tests/test_utf8.nim new file mode 100644 index 00000000..da4ed533 --- /dev/null +++ b/tests/test_utf8.nim @@ -0,0 +1,193 @@ +import std/unittest +import ../stew/utf8 + +proc toUTF4(value: uint32): array[4, byte] = + doAssert(value >= 0x10000'u32 and value < 0x200000'u32) + [ + 0xF0'u8 or byte((value shr 18) and 0x07), + 0x80'u8 or byte((value shr 12) and 0x3F), + 0x80'u8 or byte((value shr 6) and 0x3F), + 0x80'u8 or byte(value and 0x3F) + ] + +proc toUTF3(value: uint32): array[3, byte] = + doAssert(value >= 0x800'u32 and value < 0x10000'u32) + [ + 0xE0'u8 or byte((value shr 12) and 0x0F), + 0x80'u8 or byte((value shr 6) and 0x3F), + 0x80'u8 or byte(value and 0x3F) + ] + +proc toUTF2(value: uint32): array[2, byte] = + doAssert(value >= 0x80'u32 and value < 0x800'u32) + [ + 0xC0'u8 or byte((value shr 6) and 0x1F), + 0x80'u8 or byte(value and 0x3F) + ] + +proc toUTF1(value: uint32): array[1, byte] = + doAssert(value < 0x80'u32) + [ byte(value and 0x7F) ] + +suite "UTF-8 validation test suite": + test "Values [U+0000, U+007F] are allowed": + for i in 0x00'u32 .. 0x7F'u32: + check validateUtf8(toUTF1(i)) == true + test "Values [U+0080, U+07FF] are allowed": + for i in 0x80'u32 .. 
0x7FF'u32: + check validateUtf8(toUTF2(i)) == true + test "Values [U+0800, U+D7FF] are allowed": + for i in 0x800'u32 .. 0xD7FF'u32: + check validateUtf8(toUTF3(i)) == true + test "Values [U+D800, U+DFFF] (UTF-16 surrogates) are not allowed": + for i in 0xD800'u32 .. 0xDFFF'u32: + check validateUtf8(toUTF3(i)) == false + test "Values [U+E000, U+FFFD] are allowed": + for i in 0xE000'u32 .. 0xFFFD'u32: + check validateUtf8(toUTF3(i)) == true + test "Values U+FFFE and U+FFFF are not allowed": + check: + validateUtf8(toUTF3(0xFFFE'u32)) == false + validateUtf8(toUTF3(0xFFFF'u32)) == false + test "Values [U+10000, U10FFFF] are allowed": + for i in 0x10000'u32 .. 0x10FFFF'u32: + check validateUtf8(toUTF4(i)) == true + test "Values bigger U+10FFFF are not allowed": + for i in 0x11_0000'u32 .. 0x1F_FFFF'u32: + check validateUtf8(toUTF4(i)) == false + test "fastvalidate-utf-8 bad sequences": + # https://github.com/lemire/fastvalidate-utf-8 test vectors + const + GoodSequences = [ + "a", + "\xc3\xb1", + "\xe2\x82\xa1", + "\xf0\x90\x8c\xbc", + "안녕하세요, 세상", + "\xc2\x80", + "\xf0\x90\x80\x80", + "\xee\x80\x80" + ] + + BadSequences = [ + "\xc3\x28", + "\xa0\xa1", + "\xe2\x28\xa1", + "\xe2\x82\x28", + "\xf0\x28\x8c\xbc", + "\xf0\x90\x28\xbc", + "\xf0\x28\x8c\x28", + "\xc0\x9f", + "\xf5\xff\xff\xff", + "\xed\xa0\x81", + "\xf8\x90\x80\x80\x80", + "123456789012345\xed", + "123456789012345\xf1", + "123456789012345\xc2", + "\xC2\x7F", + "\xce", + "\xce\xba\xe1", + "\xce\xba\xe1\xbd", + "\xce\xba\xe1\xbd\xb9\xcf", + "\xce\xba\xe1\xbd\xb9\xcf\x83\xce", + "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce", + "\xdf", + "\xef\xbf" + ] + for item in BadSequences: + check validateUtf8(item) == false + for item in GoodSequences: + check validateUtf8(item) == true + test "UTF-8 decoder capability and stress test": + # https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt + const Tests2 = [ + # Boundary condition test cases + ("\x00", true), + ("\xc2\x80", true), + ("\xe0\xa0\x80", true), + 
("\xf0\x90\x80\x80", true), + ("\xf8\x88\x80\x80\x80", false), + ("\xfc\x84\x80\x80\x80\x80", false), + ("\x7f", true), + ("\xdf\xbf", true), + ("\xef\xbf\xbf", false), + ("\xf7\xbf\xbf\xbf", false), + ("\xfb\xbf\xbf\xbf\xbf", false), + ("\xfd\xbf\xbf\xbf\xbf\xbf", false), + ("\xed\x9f\xbf", true), + ("\xee\x80\x80", true), + ("\xef\xbf\xbd", true), + ("\xf4\x8f\xbf\xbf", true), + ] + + const Tests3 = [ + # Malformed sequences + ("\x80", false), + ("\xbf", false), + ("\x80\xbf", false), + ("\x80\xbf\x80", false), + ("\x80\xbf\x80\xbf", false), + ("\x80\xbf\x80\xbf\x80", false), + ("\x80\xbf\x80\xbf\x80\xbf", false), + ("\x80\xbf\x80\xbf\x80\xbf\x80", false), + ("\xc0", false), + ("\xe0\x80", false), + ("\xf0\x80\x80", false), + ("\xf8\x80\x80\x80", false), + ("\xfc\x80\x80\x80\x80", false), + ("\xdf", false), + ("\xef\xbf", false), + ("\xf7\xbf\xbf", false), + ("\xfb\xbf\xbf\xbf", false), + ("\xfd\xbf\xbf\xbf\xbf", false), + ("\xfe", false), + ("\xff", false), + ("\xfe\xfe\xff\xff", false) + ] + + const Tests4 = [ + # Overlong sequences + ("\xc0\xaf", false), + ("\xe0\x80\xaf", false), + ("\xf0\x80\x80\xaf", false), + ("\xf8\x80\x80\x80\xaf", false), + ("\xfc\x80\x80\x80\x80\xaf", false), + ("\xc1\xbf", false), + ("\xe0\x9f\xbf", false), + ("\xf0\x8f\xbf\xbf", false), + ("\xf8\x87\xbf\xbf\xbf", false), + ("\xfc\x83\xbf\xbf\xbf\xbf", false), + ("\xc0\x80", false), + ("\xe0\x80\x80", false), + ("\xf0\x80\x80\x80", false), + ("\xf8\x80\x80\x80\x80", false), + ("\xfc\x80\x80\x80\x80\x80", false) + ] + + const Tests5 = [ + # Illegal code positions + ("\xed\xa0\x80", false), + ("\xed\xad\xbf", false), + ("\xed\xae\x80", false), + ("\xed\xaf\xbf", false), + ("\xed\xb0\x80", false), + ("\xed\xbe\x80", false), + ("\xed\xbf\xbf", false), + ("\xed\xa0\x80\xed\xb0\x80", false), + ("\xed\xa0\x80\xed\xbf\xbf", false), + ("\xed\xad\xbf\xed\xb0\x80", false), + ("\xed\xad\xbf\xed\xbf\xbf", false), + ("\xed\xae\x80\xed\xb0\x80", false), + ("\xed\xae\x80\xed\xbf\xbf", false), + 
("\xed\xaf\xbf\xed\xb0\x80", false), + ("\xed\xaf\xbf\xed\xbf\xbf", false) + ] + + for item in Tests2: + check validateUtf8(item[0]) == item[1] + for item in Tests3: + check validateUtf8(item[0]) == item[1] + for item in Tests4: + check validateUtf8(item[0]) == item[1] + for item in Tests5: + check validateUtf8(item[0]) == item[1] From 39fb71bceca73eb2209021638273c820ab7f3f07 Mon Sep 17 00:00:00 2001 From: cheatfate Date: Wed, 7 Oct 2020 23:50:38 +0300 Subject: [PATCH 7/9] Add UTF-8 length procedure. Add UTF-8 offset procedure. Add UTF-8 substr procedure. Add wchar_t to UTF-8 conversion procedure. Add multibyte to wchar_t conversion procedure (posix). Add UTF-8 tests. Fix password reader to validate utf-8 encoding when reading from pipe. Fix password reader to read utf-8 encoded strings from *nix console. --- stew/conio.nim | 151 ++++++++++++++++++--------- stew/utf8.nim | 248 +++++++++++++++++++++++++++++++++++++++++++- tests/test_utf8.nim | 156 +++++++++++++++++++++++++--- 3 files changed, 488 insertions(+), 67 deletions(-) diff --git a/stew/conio.nim b/stew/conio.nim index 8e78dc39..e1115885 100644 --- a/stew/conio.nim +++ b/stew/conio.nim @@ -7,7 +7,7 @@ ## those terms. ## This module implements cross-platform console procedures. -import io2 +import io2, utf8 export io2 when defined(windows): @@ -62,6 +62,7 @@ when defined(windows): ENABLE_PROCESSED_INPUT = 0x0001'u32 ENABLE_ECHO_INPUT = 0x0004'u32 FILE_TYPE_CHAR = 0x0002'u32 + ERROR_NO_UNICODE_TRANSLATION = 1113'u32 proc isConsoleRedirected*(hConsole: uint): bool = ## Returns ``true`` if console handle was redirected. 
@@ -73,7 +74,7 @@ when defined(windows): else: true - proc readConsoleInput(maxBytes: int): IoResult[string] = + proc readConsoleInput(maxChars: int): IoResult[string] = let hConsoleInput = block: let res = getStdHandle(STD_INPUT_HANDLE) @@ -94,8 +95,9 @@ when defined(windows): if setConsoleCP(CP_UTF8) == 0'i32: return err(ioLastError()) - # Allocating buffer with size equal to `maxBytes` + len(CRLF) - var buffer = newString(maxBytes + 2) + # Allocating buffer with size equal to `(maxChars + len(CRLF)) * 4`, + # where 4 is maximum expected size of one character (UTF8 encoding). + var buffer = newString((maxChars + 2) * 4) let bytesToRead = uint32(len(buffer)) var bytesRead: uint32 let rres = readFile(hConsoleInput, cast[pointer](addr buffer[0]), @@ -109,7 +111,7 @@ when defined(windows): return err(ioLastError()) # Truncate additional bytes from buffer. - buffer.setLen(int(min(bytesRead, uint32(maxBytes)))) + buffer.setLen(int(bytesRead)) # Trim CR/CRLF from buffer. if len(buffer) > 0: @@ -123,7 +125,13 @@ when defined(windows): buffer.setLen(len(buffer) - 1) elif buffer[^1] == char(0x0D): buffer.setLen(len(buffer) - 1) - ok(buffer) + + # Check if buffer is valid UTF-8 encoded string. + if utf8Validate(buffer): + # Cut result buffer to `maxChars` characters. + ok(utf8Substr(buffer, 0, maxChars - 1).get()) + else: + err(IoErrorCode(ERROR_NO_UNICODE_TRANSLATION)) else: let prevMode = block: @@ -147,8 +155,8 @@ when defined(windows): discard setConsoleCP(prevInputCP) return err(errCode) - # Allocating buffer with size equal to `maxBytes` + len(CRLF) - var buffer = newSeq[Utf16Char](maxBytes + 2) + # Allocating buffer with size equal to `maxChars` + len(CRLF). + var buffer = newSeq[Utf16Char](maxChars + 2) let charsToRead = uint32(len(buffer)) var charsRead: uint32 let rres = readConsole(hConsoleInput, cast[pointer](addr buffer[0]), @@ -170,7 +178,8 @@ when defined(windows): return err(ioLastError()) # Truncate additional bytes from buffer. 
- buffer.setLen(int(min(charsRead, uint32(maxBytes)))) + buffer.setLen(int(min(charsRead, uint32(maxChars)))) + # Truncate CRLF in result wide string. if len(buffer) > 0: if int16(buffer[^1]) == int16(0x0A): @@ -184,7 +193,7 @@ when defined(windows): elif int16(buffer[^1]) == int16(0x0D): buffer.setLen(len(buffer) - 1) - # Convert Windows UTF-16 encoded string to UTF-8 encoded string. + # Convert Windows UCS-2 encoded string to UTF-8 encoded string. if len(buffer) > 0: var pwd = "" let bytesNeeded = wideCharToMultiByte(CP_UTF8, 0'u32, addr buffer[0], @@ -277,61 +286,101 @@ elif defined(posix): else: ok() - proc readConsoleInput(maxBytes: int): IoResult[string] = - # Allocating buffer with size equal to `maxBytes` + len(LF) - var buffer = newString(maxBytes + 1) - let bytesRead = - if isConsoleRedirected(STDIN_FILENO): - let res = posix.read(STDIN_FILENO, cast[pointer](addr buffer[0]), - len(buffer)) - if res < 0: - return err(ioLastError()) - res + proc readConsoleInput(maxChars: int): IoResult[string] = + # Allocating buffer with size equal to `(maxChars + len(LF)) * 4`, where + # 4 is maximum expected size of one character (UTF8 encoding). + var buffer = newString((maxChars + 1) * 4) + + if isConsoleRedirected(STDIN_FILENO): + let bytesRead = + block: + let res = posix.read(STDIN_FILENO, cast[pointer](addr buffer[0]), + len(buffer)) + if res < 0: + return err(ioLastError()) + res + + # Truncate additional bytes from buffer. + buffer.setLen(bytesRead) + + # Trim LF in result string + if len(buffer) > 0: + if buffer[^1] == char(0x0A): + buffer.setLen(len(buffer) - 1) + + # Check if buffer is valid UTF-8 encoded string. + if utf8Validate(buffer): + # Cut result buffer to `maxChars` characters. 
+ ok(utf8Substr(buffer, 0, maxChars - 1).get()) else: - var cur, old: Termios - if tcGetAttr(STDIN_FILENO, addr cur) != cint(0): - return err(ioLastError()) + err(IoErrorCode(EILSEQ)) + else: + let bytesRead = + block: + var cur, old: Termios + if tcGetAttr(STDIN_FILENO, addr cur) != cint(0): + return err(ioLastError()) - old = cur - cur.c_lflag = cur.c_lflag and not(Cflag(ECHO)) + old = cur + cur.c_lflag = cur.c_lflag and not(Cflag(ECHO)) - if tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(cur)) != cint(0): - return err(ioLastError()) + if tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(cur)) != cint(0): + return err(ioLastError()) - let res = read(STDIN_FILENO, cast[pointer](addr buffer[0]), - len(buffer)) - if res < 0: - let errCode = ioLastError() - discard tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(old)) - return err(errCode) + let res = read(STDIN_FILENO, cast[pointer](addr buffer[0]), + len(buffer)) + if res < 0: + let errCode = ioLastError() + discard tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(old)) + return err(errCode) - if tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(old)) != cint(0): - return err(ioLastError()) - res + if tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(old)) != cint(0): + return err(ioLastError()) + res - # Truncate additional bytes from buffer. - buffer.setLen(min(maxBytes, bytesRead)) - # Trim LF in result string - if len(buffer) > 0: - if buffer[^1] == char(0x0A): - buffer.setLen(len(buffer) - 1) - ok(buffer) + # Truncate additional bytes from buffer. + buffer.setLen(bytesRead) + + # Trim LF in result string + if len(buffer) > 0: + if buffer[^1] == char(0x0A): + buffer.setLen(len(buffer) - 1) + buffer.add(char(0x00)) + + # Conversion of console input into wide characters sequence. + let wres = mbstowcs(uint32, buffer) + if wres.isOk(): + # Trim wide character sequence to `maxChars` number of characters. + var wbuffer = wres.get() + if maxChars < len(wbuffer): + wbuffer.setLen(maxChars) + # Conversion of wide characters sequence to UTF-8 encoded string. 
+ let ures = wbuffer.wcharToUtf8() + if ures.isOk(): + ok(ures.get()) + else: + err(IoErrorCode(EILSEQ)) + else: + err(IoErrorCode(EILSEQ)) proc readConsolePassword*(prompt: string, - maxBytes = 32768): IoResult[string] = - ## Reads a password from stdin without printing it with length in bytes up to - ## ``maxBytes``. + maxChars = 32768): IoResult[string] = + ## Reads a password from stdin without printing it with length in characters + ## up to ``maxChars``. ## ## This procedure supports reading of UTF-8 encoded passwords from console or - ## redirected pipe. But ``maxBytes`` will limit + ## redirected pipe. ## ## Before reading password ``prompt`` will be printed. ## - ## Please note that ``maxBytes`` should be in range (0, 32768]. - doAssert(maxBytes > 0 and maxBytes <= 32768, - "maxBytes should be integer in (0, 32768]") + ## Please note that ``maxChars`` should be in range (0, 32768]. + doAssert(maxChars > 0 and maxChars <= 32768, + "maxChars should be integer in (0, 32768]") ? writeConsoleOutput(prompt) - let res = ? readConsoleInput(maxBytes) + let res = ? readConsoleInput(maxChars) # `\p` is platform specific newline: CRLF on Windows, LF on Unix ? writeConsoleOutput("\p") ok(res) + +when isMainModule: + echo readConsolePassword("Enter password: ", 4) diff --git a/stew/utf8.nim b/stew/utf8.nim index e2b8599a..d2d43034 100644 --- a/stew/utf8.nim +++ b/stew/utf8.nim @@ -7,8 +7,21 @@ ## those terms. ## This module implements UTF-8 related procedures. 
+import results, io2 +export results -proc validateUtf8*[T: byte|char](data: openarray[T]): bool = +type + UResult*[T] = Result[T, cstring] + Wides* = int16 | uint16 | int32 | uint32 + Bytes* = int8 | char | uint8 | byte + +const + ErrorBufferOverflow* = cstring"Buffer is not large enough" + ErrorInvalidSequence* = cstring"Invalid Unicode sequence found" + ErrorInvalidLocale* = cstring"Could not obtain system locale" + ErrorNotEnoughCharacters* = cstring"Not enough characters in string" + +proc utf8Validate*[T: Bytes](data: openarray[T]): bool = ## Returns ``true`` if ``data`` is correctly UTF-8 encoded string. var index = 0 @@ -89,3 +102,236 @@ proc validateUtf8*[T: byte|char](data: openarray[T]): bool = else: return false + +proc utf8Length*[T: Bytes](data: openarray[T]): UResult[int] = + ## Returns number of UTF-8 encoded characters in array ``data``. + ## + ## NOTE: Validate data with `utf8Validate()` before using this procedure, + ## otherwise length returned by this procedure could be incorrect. + var index = 0 + var size = 0 + while index < len(data): + let ch = uint(data[index]) + if ch < 0x80: + inc(index, 1) + elif (ch and 0xE0'u8) == 0xC0'u8: + inc(index, 2) + elif (ch and 0xF0'u8) == 0xE0'u8: + inc(index, 3) + elif (ch and 0xF8'u8) == 0xF0'u8: + inc(index, 4) + else: + return err(ErrorInvalidSequence) + inc(size) + if index == len(data): + ok(size) + else: + err(ErrorInvalidSequence) + +proc utf8Offset*[T: Bytes](data: openarray[T], index: int): UResult[int] = + ## Return offset in UTF-8 encoded string ``data`` for character position + ## ``index``. 
+ if index <= 0: + return ok(0) + + var byteIndex = 0 + var charIndex = 0 + + while (byteIndex < len(data)) and (charIndex < index): + let ch = uint(data[byteIndex]) + if ch < 0x80: + inc(byteIndex, 1) + elif (ch and 0xE0'u8) == 0xC0'u8: + inc(byteIndex, 2) + elif (ch and 0xF0'u8) == 0xE0'u8: + inc(byteIndex, 3) + elif (ch and 0xF8'u8) == 0xF0'u8: + inc(byteIndex, 4) + else: + return err(ErrorInvalidSequence) + inc(charIndex) + + if charIndex == index: + ok(byteIndex) + else: + err(ErrorNotEnoughCharacters) + +proc utf8Substr*[T: Bytes](data: openarray[T], + start, finish: int): UResult[string] = + ## Substring string ``data`` using starting character (not byte) index + ## ``start`` and terminating character (not byte) index ``finish`` and return + ## result string. + ## + ## ``data`` should be correct UTF-8 encoded string, because only initial + ## octets got validated. + ## + ## ``start`` - The starting index of the substring, any value BELOW or EQUAL + ## to zero will be considered as zero. If ``start`` index is not present in + ## string ``data`` empty string will be returned as result. + ## + ## ``finish`` - The terminating index of the substring, any value BELOW + ## zero will be considered as `len(data)`. 
+ let soffset = + if start <= 0: + 0 + elif start >= len(data): + return ok("") + else: + let res = utf8Offset(data, start) + if res.isErr(): + if res.error != ErrorNotEnoughCharacters: + return err(res.error) + return ok("") + else: + res.get() + + let eoffset = + if finish < 0: + len(data) + elif finish >= len(data): + len(data) + else: + let res = utf8Offset(data, finish + 1) + if res.isErr(): + if res.error != ErrorNotEnoughCharacters: + return err(res.error) + len(data) + else: + res.get() + + var res = newString(eoffset - soffset) + var k = 0 + for i in soffset ..< eoffset: + res[k] = cast[char](data[i]) + inc(k) + ok(res) + +proc wcharToUtf8*[A: Wides, B: Bytes](input: openarray[A], + output: var openarray[B]): UResult[int] = + ## Converts WCHAR sequence ``input`` to UTF-8 array of octets ``output``. + ## + ## Procedure supports 4-byte (Linux) and 2-byte sequences (Windows) as input. + var offset = 0 + for item in input: + let uitem = uint(item) + let codepoint = + if uitem >= 0xD800'u and uitem <= 0xDBFF'u: + 0x10000'u + ((uitem - 0xD800'u) shl 10) + else: + if uitem >= 0xDC00'u and uitem <= 0xDFFF'u: + uitem - 0xDC00'u + else: + uitem + if codepoint <= 0x7F'u: + if len(output) > 0: + if offset < len(output): + output[offset] = cast[B](codepoint and 0x7F'u) + else: + return err(ErrorBufferOverflow) + inc(offset, 1) + elif codepoint <= 0x7FF'u: + if len(output) > 0: + if offset + 1 < len(output): + output[offset + 0] = cast[B](0xC0'u8 or + byte((codepoint shr 6) and 0x1F'u)) + output[offset + 1] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u)) + else: + return err(ErrorBufferOverflow) + inc(offset, 2) + elif codepoint <= 0xFFFF'u: + if len(output) > 0: + if offset + 2 < len(output): + output[offset + 0] = cast[B](0xE0'u8 or + byte((codepoint shr 12) and 0x0F'u)) + output[offset + 1] = cast[B](0x80'u8 or + byte((codepoint shr 6) and 0x3F'u)) + output[offset + 2] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u)) + else: + return err(ErrorBufferOverflow) + 
inc(offset, 3) + elif codepoint <= 0x10FFFF'u: + if len(output) > 0: + if offset + 3 < len(output): + output[offset + 0] = cast[B](0xF0'u8 or + byte((codepoint shr 18) and 0x07'u)) + output[offset + 1] = cast[B](0x80'u8 or + byte((codepoint shr 12) and 0x3F'u)) + output[offset + 2] = cast[B](0x80'u8 or + byte((codepoint shr 6) and 0x3F'u)) + output[offset + 3] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u)) + else: + return err("") + inc(offset, 4) + else: + return err(ErrorInvalidSequence) + ok(offset) + +proc wcharToUtf8*[T: Wides](input: openarray[T]): UResult[string] {.inline.} = + ## Converts wide character + var empty: array[0, char] + let size = ? wcharToUtf8(input, empty) + var output = newString(size) + let res {.used.} = ? wcharToUtf8(input, output) + ok(output) + +when defined(posix): + import posix + + type + Mbstate {.importc: "mbstate_t", + header: "", pure, final.} = object + + proc mbsrtowcs(dest: pointer, src: pointer, n: csize_t, + ps: ptr Mbstate): csize_t {. + importc, header: "".} + + proc mbstowcs*[A: Bytes, B: Wides](t: typedesc[B], + input: openarray[A]): UResult[seq[B]] = + ## Converts multibyte encoded string to OS specific wide char string. + ## + ## Note, that `input` should be `0` terminated. + ## + ## Encoding is made using `mbsrtowcs`, so procedure supports invalid + ## sequences and able to decoded all the characters before first invalid + ## character encountered. + + # Without explicitely setting locale because `mbsrtowcs` will fail with + # EILSEQ. + # If locale is an empty string, "", each part of the locale that should + # be modified is set according to the environment variables. 
+ let sres = setlocale(LC_ALL, cstring"") + if isNil(sres): + return err(ErrorInvalidLocale) + + var buffer = newSeq[B](len(input)) + if len(input) == 0: + return ok(buffer) + + doAssert(input[^1] == A(0), "Input array should be zero-terminated") + var data = @input + var ostr = addr data[0] + var pstr = ostr + var mstate = Mbstate() + + while true: + let res = mbsrtowcs(addr buffer[0], addr pstr, csize_t(len(buffer)), + addr mstate) + if res == cast[csize_t](-1): + # If invalid multibyte sequence has been encountered, ``pstr`` is left + ## pointing to the invalid multibyte sequence, ``-1`` is returned, and + ## errno is set to EILSEQ. + let diff = cast[uint](pstr) - cast[uint](ostr) + if diff == 0: + return err(ErrorInvalidSequence) + else: + # We have partially decoded sequence, `diff` is position of first + # invalid character in sequence. + data[diff] = A(0x00) + ostr = addr data[0] + pstr = ostr + mstate = Mbstate() + else: + # Its safe to convert `csize_t` to `int` here because `len(input)` + # is also `int`. + buffer.setLen(res) + return ok(buffer) diff --git a/tests/test_utf8.nim b/tests/test_utf8.nim index da4ed533..05bee447 100644 --- a/tests/test_utf8.nim +++ b/tests/test_utf8.nim @@ -32,29 +32,29 @@ proc toUTF1(value: uint32): array[1, byte] = suite "UTF-8 validation test suite": test "Values [U+0000, U+007F] are allowed": for i in 0x00'u32 .. 0x7F'u32: - check validateUtf8(toUTF1(i)) == true + check utf8Validate(toUTF1(i)) == true test "Values [U+0080, U+07FF] are allowed": for i in 0x80'u32 .. 0x7FF'u32: - check validateUtf8(toUTF2(i)) == true + check utf8Validate(toUTF2(i)) == true test "Values [U+0800, U+D7FF] are allowed": for i in 0x800'u32 .. 0xD7FF'u32: - check validateUtf8(toUTF3(i)) == true + check utf8Validate(toUTF3(i)) == true test "Values [U+D800, U+DFFF] (UTF-16 surrogates) are not allowed": for i in 0xD800'u32 .. 
0xDFFF'u32: - check validateUtf8(toUTF3(i)) == false + check utf8Validate(toUTF3(i)) == false test "Values [U+E000, U+FFFD] are allowed": for i in 0xE000'u32 .. 0xFFFD'u32: - check validateUtf8(toUTF3(i)) == true + check utf8Validate(toUTF3(i)) == true test "Values U+FFFE and U+FFFF are not allowed": check: - validateUtf8(toUTF3(0xFFFE'u32)) == false - validateUtf8(toUTF3(0xFFFF'u32)) == false + utf8Validate(toUTF3(0xFFFE'u32)) == false + utf8Validate(toUTF3(0xFFFF'u32)) == false test "Values [U+10000, U10FFFF] are allowed": for i in 0x10000'u32 .. 0x10FFFF'u32: - check validateUtf8(toUTF4(i)) == true + check utf8Validate(toUTF4(i)) == true test "Values bigger U+10FFFF are not allowed": for i in 0x11_0000'u32 .. 0x1F_FFFF'u32: - check validateUtf8(toUTF4(i)) == false + check utf8Validate(toUTF4(i)) == false test "fastvalidate-utf-8 bad sequences": # https://github.com/lemire/fastvalidate-utf-8 test vectors const @@ -95,9 +95,9 @@ suite "UTF-8 validation test suite": "\xef\xbf" ] for item in BadSequences: - check validateUtf8(item) == false + check utf8Validate(item) == false for item in GoodSequences: - check validateUtf8(item) == true + check utf8Validate(item) == true test "UTF-8 decoder capability and stress test": # https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt const Tests2 = [ @@ -184,10 +184,136 @@ suite "UTF-8 validation test suite": ] for item in Tests2: - check validateUtf8(item[0]) == item[1] + check utf8Validate(item[0]) == item[1] for item in Tests3: - check validateUtf8(item[0]) == item[1] + check utf8Validate(item[0]) == item[1] for item in Tests4: - check validateUtf8(item[0]) == item[1] + check utf8Validate(item[0]) == item[1] for item in Tests5: - check validateUtf8(item[0]) == item[1] + check utf8Validate(item[0]) == item[1] + + test "UTF-8 length() test": + const + Cyrillic = "\xd0\x9f\xd1\x80\xd0\xbe\xd0\xb3" & + "\xd1\x80\xd0\xb0\xd0\xbc\xd0\xbc\xd0\xb0" + check: + utf8Length("Программа").tryGet() == 9 + 
utf8Length("Программ").tryGet() == 8 + utf8Length("Програм").tryGet() == 7 + utf8Length("Програ").tryGet() == 6 + utf8Length("Прогр").tryGet() == 5 + utf8Length("Прог").tryGet() == 4 + utf8Length("Про").tryGet() == 3 + utf8Length("Пр").tryGet() == 2 + utf8Length("П").tryGet() == 1 + utf8Length("").tryGet() == 0 + utf8Length("П⠯🤗").tryGet() == 3 + utf8Length("⠯🤗").tryGet() == 2 + utf8Length("🤗").tryGet() == 1 + + check: + utf8Length(Cyrillic).tryGet() == 9 + utf8Length(Cyrillic.toOpenArray(0, len(Cyrillic) - 2)).isErr() == true + + test "UTF-8 substr() test": + check: + utf8Substr("Программа", -1, -1).tryGet() == "Программа" + utf8Substr("Программа", 0, 0).tryGet() == "П" + utf8Substr("Программа", 0, 1).tryGet() == "Пр" + utf8Substr("Программа", 0, 2).tryGet() == "Про" + utf8Substr("Программа", 0, 3).tryGet() == "Прог" + utf8Substr("Программа", 0, 4).tryGet() == "Прогр" + utf8Substr("Программа", 0, 5).tryGet() == "Програ" + utf8Substr("Программа", 0, 6).tryGet() == "Програм" + utf8Substr("Программа", 0, 7).tryGet() == "Программ" + utf8Substr("Программа", 0, 8).tryGet() == "Программа" + utf8Substr("Программа", 0, 9).tryGet() == "Программа" + utf8Substr("Программа", 0, 10).tryGet() == "Программа" + utf8Substr("Программа", 0, 18).tryGet() == "Программа" + utf8Substr("Программа", 0, 19).tryGet() == "Программа" + utf8Substr("Программа", 0, 100).tryGet() == "Программа" + utf8Substr("Программа", 100, 0).tryGet() == "" + utf8Substr("Программа", 100, 100).tryGet() == "" + utf8Substr("Программа", 1, 1).tryGet() == "р" + utf8Substr("Программа", 2, 2).tryGet() == "о" + utf8Substr("Программа", 3, 3).tryGet() == "г" + utf8Substr("Программа", 4, 4).tryGet() == "р" + utf8Substr("Программа", 5, 5).tryGet() == "а" + utf8Substr("Программа", 6, 6).tryGet() == "м" + utf8Substr("Программа", 7, 7).tryGet() == "м" + utf8Substr("Программа", 8, 8).tryGet() == "а" + utf8Substr("Программа", 9, 9).tryGet() == "" + utf8Substr("Программа", 0, -1).tryGet() == "Программа" + utf8Substr("Программа", 
1, -1).tryGet() == "рограмма" + utf8Substr("Программа", 2, -1).tryGet() == "ограмма" + utf8Substr("Программа", 3, -1).tryGet() == "грамма" + utf8Substr("Программа", 4, -1).tryGet() == "рамма" + utf8Substr("Программа", 5, -1).tryGet() == "амма" + utf8Substr("Программа", 6, -1).tryGet() == "мма" + utf8Substr("Программа", 7, -1).tryGet() == "ма" + utf8Substr("Программа", 8, -1).tryGet() == "а" + utf8Substr("Программа", 9, -1).tryGet() == "" + + utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", -1, -1).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶" + utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 0).tryGet() == "⠯" + utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 1).tryGet() == "⠯⠰" + utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 2).tryGet() == "⠯⠰⠱" + utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 3).tryGet() == "⠯⠰⠱⠲" + utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 4).tryGet() == "⠯⠰⠱⠲⠳" + utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 5).tryGet() == "⠯⠰⠱⠲⠳⠴" + utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 6).tryGet() == "⠯⠰⠱⠲⠳⠴⠵" + utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 7).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶" + utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 8).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶" + utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 9).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶" + utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 23).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶" + utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 24).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶" + utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 100).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶" + utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 100, 0).tryGet() == "" + utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 100, 100).tryGet() == "" + + utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", -1, -1).tryGet() == + "🤗🤘🤙🤚🤛🤜🤝🤞🤟" + utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 0).tryGet() == + "🤗" + utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 1).tryGet() == + "🤗🤘" + utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 2).tryGet() == + "🤗🤘🤙" + utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 3).tryGet() == + "🤗🤘🤙🤚" + utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 4).tryGet() == + "🤗🤘🤙🤚🤛" + utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 5).tryGet() == + "🤗🤘🤙🤚🤛🤜" + utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 6).tryGet() == + "🤗🤘🤙🤚🤛🤜🤝" + utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 7).tryGet() == + "🤗🤘🤙🤚🤛🤜🤝🤞" + utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 8).tryGet() == + "🤗🤘🤙🤚🤛🤜🤝🤞🤟" + utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 9).tryGet() == + "🤗🤘🤙🤚🤛🤜🤝🤞🤟" + utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 31).tryGet() == + "🤗🤘🤙🤚🤛🤜🤝🤞🤟" + utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 32).tryGet() == + "🤗🤘🤙🤚🤛🤜🤝🤞🤟" 
+ utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 100).tryGet() == + "🤗🤘🤙🤚🤛🤜🤝🤞🤟" + utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 100, 0).tryGet() == "" + utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 100, 100).tryGet() == "" + + test "wcharToUtf8() tests": + for i in 0 ..< 0x11_0000: + if i != 0xFFFE and i != 0xFFFF: + if i < 0x10000: + var data16 = [uint16(i)] + let res = wcharToUtf8(data16) + check: + res.isOk() == true + utf8Validate(res.get()) == true + + var data32 = [uint32(i)] + let res = wcharToUtf8(data32) + check: + res.isOk() == true + utf8Validate(res.get()) == true From 1746bc0095257845807d9a9e7f05dac61796f744 Mon Sep 17 00:00:00 2001 From: cheatfate Date: Wed, 14 Oct 2020 12:25:33 +0300 Subject: [PATCH 8/9] Fix UTF-32 encoder/decoder. Add tests for UTF-8 to UTF-32 and UTF-32 to UTF-8 encoders. --- stew/conio.nim | 2 +- stew/utf8.nim | 193 ++++++++++++++++++++++++++++++++++++-------- tests/test_utf8.nim | 42 +++++++--- 3 files changed, 194 insertions(+), 43 deletions(-) diff --git a/stew/conio.nim b/stew/conio.nim index e1115885..e9fba96d 100644 --- a/stew/conio.nim +++ b/stew/conio.nim @@ -355,7 +355,7 @@ elif defined(posix): if maxChars < len(wbuffer): wbuffer.setLen(maxChars) # Conversion of wide characters sequence to UTF-8 encoded string. - let ures = wbuffer.wcharToUtf8() + let ures = wbuffer.utf32toUtf8() if ures.isOk(): ok(ures.get()) else: diff --git a/stew/utf8.nim b/stew/utf8.nim index d2d43034..afac4d5f 100644 --- a/stew/utf8.nim +++ b/stew/utf8.nim @@ -12,7 +12,8 @@ export results type UResult*[T] = Result[T, cstring] - Wides* = int16 | uint16 | int32 | uint32 + Wides32* = int32 | uint32 + Wides16* = int16 | uint16 Bytes* = int8 | char | uint8 | byte const @@ -206,72 +207,200 @@ proc utf8Substr*[T: Bytes](data: openarray[T], inc(k) ok(res) -proc wcharToUtf8*[A: Wides, B: Bytes](input: openarray[A], +proc utf32toUtf8*[A: Wides32, B: Bytes](input: openarray[A], output: var openarray[B]): UResult[int] = - ## Converts WCHAR sequence ``input`` to UTF-8 array of octets ``output``. 
- ## - ## Procedure supports 4-byte (Linux) and 2-byte sequences (Windows) as input. + ## Converts UTF-32 sequence ``input`` to UTF-8 array ``output``. var offset = 0 for item in input: - let uitem = uint(item) let codepoint = - if uitem >= 0xD800'u and uitem <= 0xDBFF'u: - 0x10000'u + ((uitem - 0xD800'u) shl 10) - else: - if uitem >= 0xDC00'u and uitem <= 0xDFFF'u: - uitem - 0xDC00'u - else: - uitem - if codepoint <= 0x7F'u: + block: + if (uint32(item) >= 0xD800'u32) and (uint32(item) <= 0xDFFF'u32): + # high and low surrogates U+D800 through U+DFFF prohibited in UTF-32. + return err(ErrorInvalidSequence) + elif (uint32(item) == 0xFFFE'u32) or (uint32(item) == 0xFFFF'u32): + # these codes are intended for process-internal uses, and not a + # unicode characters. + return err(ErrorInvalidSequence) + uint32(item) + if codepoint <= 0x7F'u32: if len(output) > 0: if offset < len(output): - output[offset] = cast[B](codepoint and 0x7F'u) + output[offset] = cast[B](codepoint and 0x7F'u32) else: return err(ErrorBufferOverflow) inc(offset, 1) - elif codepoint <= 0x7FF'u: + elif codepoint <= 0x7FF'u32: if len(output) > 0: if offset + 1 < len(output): output[offset + 0] = cast[B](0xC0'u8 or - byte((codepoint shr 6) and 0x1F'u)) - output[offset + 1] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u)) + byte((codepoint shr 6) and 0x1F'u32)) + output[offset + 1] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u32)) else: return err(ErrorBufferOverflow) inc(offset, 2) - elif codepoint <= 0xFFFF'u: + elif codepoint <= 0xFFFF'u32: if len(output) > 0: if offset + 2 < len(output): output[offset + 0] = cast[B](0xE0'u8 or - byte((codepoint shr 12) and 0x0F'u)) + byte((codepoint shr 12) and 0x0F'u32)) output[offset + 1] = cast[B](0x80'u8 or - byte((codepoint shr 6) and 0x3F'u)) - output[offset + 2] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u)) + byte((codepoint shr 6) and 0x3F'u32)) + output[offset + 2] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u32)) else: return err(ErrorBufferOverflow) 
inc(offset, 3) - elif codepoint <= 0x10FFFF'u: + elif codepoint <= 0x10FFFF'u32: if len(output) > 0: if offset + 3 < len(output): output[offset + 0] = cast[B](0xF0'u8 or - byte((codepoint shr 18) and 0x07'u)) + byte((codepoint shr 18) and 0x07'u32)) output[offset + 1] = cast[B](0x80'u8 or - byte((codepoint shr 12) and 0x3F'u)) + byte((codepoint shr 12) and 0x3F'u32)) output[offset + 2] = cast[B](0x80'u8 or - byte((codepoint shr 6) and 0x3F'u)) - output[offset + 3] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u)) + byte((codepoint shr 6) and 0x3F'u32)) + output[offset + 3] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u32)) else: - return err("") + return err(ErrorBufferOverflow) inc(offset, 4) else: return err(ErrorInvalidSequence) ok(offset) -proc wcharToUtf8*[T: Wides](input: openarray[T]): UResult[string] {.inline.} = - ## Converts wide character +proc utf32toUtf8*[T: Wides32](input: openarray[T]): UResult[string] {.inline.} = + ## Converts wide character sequence ``input`` to UTF-8 encoded string. var empty: array[0, char] - let size = ? wcharToUtf8(input, empty) + let size = ? utf32ToUtf8(input, empty) var output = newString(size) - let res {.used.} = ? wcharToUtf8(input, output) + let res {.used.} = ? utf32ToUtf8(input, output) + ok(output) + +proc utf8toUtf32*[A: Bytes, B: Wides32](input: openarray[A], + output: var openarray[B]): UResult[int] = + ## Convert UTF-8 encoded array of characters ``input`` to UTF-32 encoded + ## sequences of 32bit limbs. + ## + ## To obtain required size of ``output`` you need to pass ``output`` as + ## zero-length array, in such way required size will be returned as result of + ## procedure. + ## + ## If size of ``output`` is not zero, and there not enough space in ``output`` + ## array to store whole ``input`` array, error ``ErrorBufferOverflow`` will + ## be returned. 
+ var index = 0 + var dindex = 0 + if len(output) == 0: + return utf8Length(input) + else: + while true: + if index >= len(input): + break + let byte1 = uint32(input[index]) + inc(index) + + if (byte1 and 0x80) == 0x00: + if dindex < len(output): + output[dindex] = B(byte1) + inc(dindex) + else: + return err(ErrorBufferOverflow) + elif (byte1 and 0xE0'u32) == 0xC0'u32: + # Two-byte form (110xxxxx 10xxxxxx) + if index >= len(input): + return err(ErrorInvalidSequence) + # overlong sequence test + if (byte1 and 0xFE'u32) == 0xC0'u32: + return err(ErrorInvalidSequence) + + let byte2 = uint32(input[index]) + if (byte2 and 0xC0'u32) != 0x80'u32: + return err(ErrorInvalidSequence) + + if dindex < len(output): + output[dindex] = B(((byte1 and 0x1F'u32) shl 6) or + (byte2 and 0x3F'u32)) + inc(dindex) + else: + return err(ErrorBufferOverflow) + inc(index) + elif (byte1 and 0xF0'u32) == 0xE0'u32: + # Three-byte form (1110xxxx 10xxxxxx 10xxxxxx) + if (index + 1) >= len(input): + return err(ErrorInvalidSequence) + + let byte2 = uint32(input[index]) + if (byte2 and 0xC0'u32) != 0x80'u32: + return err(ErrorInvalidSequence) + # overlong sequence test + if (byte1 == 0xE0'u32) and ((byte2 and 0xE0'u32) == 0x80'u32): + return err(ErrorInvalidSequence) + # 0xD800–0xDFFF (UTF-16 surrogates) test + if (byte1 == 0xED'u32) and ((byte2 and 0xE0'u32) == 0xA0'u32): + return err(ErrorInvalidSequence) + + let byte3 = uint32(input[index + 1]) + if (byte3 and 0xC0'u32) != 0x80'u32: + return err(ErrorInvalidSequence) + # U+FFFE or U+FFFF test + if (byte1 == 0xEF'u32) and (byte2 == 0xBF'u32) and + ((byte3 and 0xFE'u32) == 0xBE'u32): + return err(ErrorInvalidSequence) + + if dindex < len(output): + output[dindex] = B(((byte1 and 0x0F'u32) shl 12) or + ((byte2 and 0x3F'u32) shl 6) or + (byte3 and 0x3F'u32)) + inc(dindex) + else: + return err(ErrorBufferOverflow) + inc(index, 2) + + elif (byte1 and 0xF8'u8) == 0xF0'u8: + # Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) + if (index + 2) >= 
len(input): + return err(ErrorInvalidSequence) + + let byte2 = uint32(input[index]) + if (byte2 and 0xC0'u32) != 0x80'u32: + return err(ErrorInvalidSequence) + # overlong sequence test + if (byte1 == 0xF0'u32) and ((byte2 and 0xF0'u32) == 0x80'u32): + return err(ErrorInvalidSequence) + # According to RFC 3629 no point above U+10FFFF should be used, which + # limits characters to four bytes. + if ((byte1 == 0xF4'u32) and (byte2 > 0x8F'u32)) or (byte1 > 0xF4'u32): + return err(ErrorInvalidSequence) + + let byte3 = uint32(input[index + 1]) + if (byte3 and 0xC0'u32) != 0x80'u32: + return err(ErrorInvalidSequence) + + let byte4 = uint32(input[index + 2]) + if (byte4 and 0xC0'u32) != 0x80'u32: + return err(ErrorInvalidSequence) + + if dindex < len(output): + output[dindex] = B(((byte1 and 0x07'u32) shl 18) or + ((byte2 and 0x3F'u32) shl 12) or + ((byte3 and 0x3F'u32) shl 6) or + (byte4 and 0x3F'u32)) + inc(dindex) + else: + return err(ErrorBufferOverflow) + inc(index, 3) + + else: + return err(ErrorInvalidSequence) + + ok(dindex) + +proc utf8toUtf32*[A: Bytes, B: Wides32](et: typedesc[B], + input: openarray[A]): UResult[seq[B]] = + ## Convert UTF-8 encoded array of characters ``input`` to UTF-32 encoded + ## sequence of 32bit limbs and return it. + var empty: array[0, B] + let size = ? utf8toUtf32(input, empty) + var output = newSeq[B](size) + let res {.used.} = ? 
utf8toUtf32(input, output) ok(output) when defined(posix): diff --git a/tests/test_utf8.nim b/tests/test_utf8.nim index 05bee447..06b9bb31 100644 --- a/tests/test_utf8.nim +++ b/tests/test_utf8.nim @@ -302,18 +302,40 @@ suite "UTF-8 validation test suite": utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 100, 0).tryGet() == "" utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 100, 100).tryGet() == "" - test "wcharToUtf8() tests": + test "UTF-32 -> UTF-8 conversion test": for i in 0 ..< 0x11_0000: - if i != 0xFFFE and i != 0xFFFF: - if i < 0x10000: - var data16 = [uint16(i)] - let res = wcharToUtf8(data16) - check: - res.isOk() == true - utf8Validate(res.get()) == true - + var data32 = [uint32(i)] + if i >= 0xD800 and i <= 0xDFFF: + check utf32toUtf8(data32).isErr() + elif i == 0xFFFE: + check utf32toUtf8(data32).isErr() + elif i == 0xFFFF: + check utf32toUtf8(data32).isErr() + elif i == 0x11_0000: + check utf32toUtf8(data32).isErr() + else: var data32 = [uint32(i)] - let res = wcharToUtf8(data32) + let res = utf32toUtf8(data32) check: res.isOk() == true utf8Validate(res.get()) == true + + test "UTF-8 -> UTF-32 conversion test": + for i in 0 ..< 0x11_0001: + var data32 = [uint32(i)] + if i >= 0xD800 and i <= 0xDFFF: + check utf32toUtf8(data32).isErr() + elif i == 0xFFFE: + check utf32toUtf8(data32).isErr() + elif i == 0xFFFF: + check utf32toUtf8(data32).isErr() + elif i == 0x11_0000: + check utf32toUtf8(data32).isErr() + else: + var data32 = [uint32(i)] + let res8 = utf32toUtf8(data32) + check res8.isOk() + let res32 = utf8toUtf32(uint32, res8.get()) + check: + res32.isOk() + res32.get() == data32 From 8d87ba3e05bb09c6e1383f289c533dfafa5ec4d0 Mon Sep 17 00:00:00 2001 From: cheatfate Date: Thu, 15 Oct 2020 00:10:19 +0300 Subject: [PATCH 9/9] Fix *nix compilation problem. 
--- stew/utf8.nim | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stew/utf8.nim b/stew/utf8.nim index afac4d5f..312f4d6e 100644 --- a/stew/utf8.nim +++ b/stew/utf8.nim @@ -414,8 +414,8 @@ when defined(posix): ps: ptr Mbstate): csize_t {. importc, header: "".} - proc mbstowcs*[A: Bytes, B: Wides](t: typedesc[B], - input: openarray[A]): UResult[seq[B]] = + proc mbstowcs*[A: Bytes, B: Wides32](t: typedesc[B], + input: openarray[A]): UResult[seq[B]] = ## Converts multibyte encoded string to OS specific wide char string. ## ## Note, that `input` should be `0` terminated.