From 70405da6724bdbdab573972c2d873fa5f8b133d4 Mon Sep 17 00:00:00 2001
From: cheatfate <eugene.kabanov@status.im>
Date: Wed, 30 Sep 2020 10:14:40 +0300
Subject: [PATCH 1/9] Initial commit.

---
 stew/conio.nim | 340 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 340 insertions(+)
 create mode 100644 stew/conio.nim

diff --git a/stew/conio.nim b/stew/conio.nim
new file mode 100644
index 00000000..cad1e4e7
--- /dev/null
+++ b/stew/conio.nim
@@ -0,0 +1,340 @@
+## Copyright (c) 2020 Status Research & Development GmbH
+## Licensed under either of
+##  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE))
+##  * MIT license ([LICENSE-MIT](LICENSE-MIT))
+## at your option.
+## This file may not be copied, modified, or distributed except according to
+## those terms.
+
+## This module implements cross-platform console procedures.
+import io2
+export io2
+
+when defined(windows):
+  proc setConsoleOutputCP(wCodePageID: cuint): int32 {.
+       importc: "SetConsoleOutputCP", stdcall, dynlib: "kernel32", sideEffect.}
+  proc setConsoleCP(wCodePageID: cuint): int32 {.
+       importc: "SetConsoleCP", stdcall, dynlib: "kernel32", sideEffect.}
+  proc getConsoleCP(): cuint {.
+       importc: "GetConsoleCP", stdcall, dynlib: "kernel32", sideEffect.}
+  proc getConsoleOutputCP(): cuint {.
+       importc: "GetConsoleOutputCP", stdcall, dynlib: "kernel32", sideEffect.}
+  proc setConsoleMode(hConsoleHandle: uint, dwMode: uint32): int32 {.
+       importc: "SetConsoleMode", stdcall, dynlib: "kernel32", sideEffect.}
+  proc getConsoleMode(hConsoleHandle: uint, dwMode: var uint32): int32 {.
+       importc: "GetConsoleMode", stdcall, dynlib: "kernel32", sideEffect.}
+  proc readConsole(hConsoleInput: uint, lpBuffer: pointer,
+                   nNumberOfCharsToRead: uint32,
+                   lpNumberOfCharsRead: var uint32,
+                   pInputControl: pointer): int32 {.
+       importc: "ReadConsoleW", stdcall, dynlib: "kernel32", sideEffect.}
+  proc readFile(hFile: uint, lpBuffer: pointer,
+                nNumberOfBytesToRead: uint32,
+                lpNumberOfBytesRead: var uint32,
+                lpOverlapped: pointer): int32 {.
+       importc: "ReadFile", dynlib: "kernel32", stdcall, sideEffect.}
+  proc writeConsole(hConsoleOutput: uint, lpBuffer: pointer,
+                    nNumberOfCharsToWrite: uint32,
+                    lpNumberOfCharsWritten: var uint32,
+                    lpReserved: pointer): int32 {.
+       importc: "WriteConsoleW", stdcall, dynlib: "kernel32", sideEffect.}
+  proc writeFile(hFile: uint, lpBuffer: pointer,
+                 nNumberOfBytesToWrite: uint32,
+                 lpNumberOfBytesWritten: var uint32,
+                 lpOverlapped: pointer): int32 {.
+       importc: "WriteFile", dynlib: "kernel32", stdcall, sideEffect.}
+  proc getStdHandle(nStdHandle: uint32): uint {.
+       importc: "GetStdHandle", stdcall, dynlib: "kernel32", sideEffect.}
+  proc wideCharToMultiByte(codePage: cuint, dwFlags: uint32,
+                           lpWideCharStr: ptr Utf16Char, cchWideChar: cint,
+                           lpMultiByteStr: ptr char, cbMultiByte: cint,
+                           lpDefaultChar: pointer,
+                           lpUsedDefaultChar: pointer): cint {.
+       importc: "WideCharToMultiByte", stdcall, dynlib: "kernel32", sideEffect.}
+
+  const
+    CP_UTF8 = 65001'u32
+    STD_INPUT_HANDLE = cast[uint32](-10)
+    STD_OUTPUT_HANDLE = cast[uint32](-11)
+    INVALID_HANDLE_VALUE = cast[uint](-1)
+    ENABLE_PROCESSED_INPUT = 0x0001'u32
+    ENABLE_ECHO_INPUT = 0x0004'u32
+    ERROR_INVALID_HANDLE = 0x0006'u32
+
+  proc isConsoleRedirected(hConsole: uint): bool =
+    ## Returns ``true`` if console handle was redirected.
+    var mode: uint32
+    let res = getConsoleMode(hConsole, mode)
+    if res == 0:
+      let errCode = ioLastError()
+      if errCode == ERROR_INVALID_HANDLE:
+        true
+      else:
+        false
+    else:
+      false
+
+  proc readConsoleInput(maxBytes: int): IoResult[string] =
+    let hConsoleInput =
+      block:
+        let res = getStdHandle(STD_INPUT_HANDLE)
+        if res == INVALID_HANDLE_VALUE:
+          return err(ioLastError())
+        res
+
+    let prevInputCP =
+      block:
+        let res = getConsoleCP()
+        if res == cuint(0):
+          return err(ioLastError())
+        res
+
+    if isConsoleRedirected(hConsoleInput):
+      # Console STDIN is redirected, we should use ReadFile(), because
+      # ReadConsole() is not working for such types of STDIN.
+      if setConsoleCP(CP_UTF8) == 0'i32:
+        return err(ioLastError())
+
+      # Allocating buffer with size equal to `maxBytes` + len(CRLF)
+      var buffer = newString(maxBytes + 2)
+      let bytesToRead = uint32(len(buffer))
+      var bytesRead: uint32
+      let rres = readFile(hConsoleInput, cast[pointer](addr buffer[0]),
+                          bytesToRead, bytesRead, nil)
+      if rres == 0:
+        let errCode = ioLastError()
+        discard setConsoleCP(prevInputCP)
+        return err(errCode)
+
+      if setConsoleCP(prevInputCP) == 0'i32:
+        return err(ioLastError())
+
+      # Truncate additional bytes from buffer.
+      buffer.setLen(int(min(bytesRead, uint32(maxBytes))))
+
+      # Trim CR/CRLF from buffer.
+      if len(buffer) > 0:
+        if buffer[^1] == char(0x0A):
+          if len(buffer) > 1:
+            if buffer[^2] == char(0x0D):
+              buffer.setLen(len(buffer) - 2)
+            else:
+              buffer.setLen(len(buffer) - 1)
+          else:
+            buffer.setLen(len(buffer) - 1)
+        elif buffer[^1] == char(0x0D):
+          buffer.setLen(len(buffer) - 1)
+      ok(buffer)
+    else:
+      let prevMode =
+        block:
+          var mode: uint32
+          let res = getConsoleMode(hConsoleInput, mode)
+          if res == 0:
+            return err(ioLastError())
+          mode
+
+      var newMode = prevMode or ENABLE_PROCESSED_INPUT
+      newMode = newMode and not(ENABLE_ECHO_INPUT)
+
+      # Change console CodePage to allow UTF-8 strings input.
+      if setConsoleCP(CP_UTF8) == 0'i32:
+        return err(ioLastError())
+
+      # Disable local echo output.
+      let mres = setConsoleMode(hConsoleInput, newMode)
+      if mres == 0:
+        let errCode = ioLastError()
+        discard setConsoleCP(prevInputCP)
+        return err(errCode)
+
+      # Allocating buffer with size equal to `maxBytes` + len(CRLF)
+      var buffer = newSeq[Utf16Char](maxBytes + 2)
+      let charsToRead = uint32(len(buffer))
+      var charsRead: uint32
+      let rres = readConsole(hConsoleInput, cast[pointer](addr buffer[0]),
+                             charsToRead, charsRead, nil)
+      if rres == 0'i32:
+        let errCode = ioLastError()
+        discard setConsoleMode(hConsoleInput, prevMode)
+        discard setConsoleCP(prevInputCP)
+        return err(errCode)
+
+      # Restore local echo output.
+      if setConsoleMode(hConsoleInput, prevMode) == 0'i32:
+        let errCode = ioLastError()
+        discard setConsoleCP(prevInputCP)
+        return err(errCode)
+
+      # Restore previous console CodePage.
+      if setConsoleCP(prevInputCP) == 0'i32:
+        return err(ioLastError())
+
+      # Truncate additional bytes from buffer.
+      buffer.setLen(int(min(charsRead, uint32(maxBytes))))
+      # Truncate CRLF in result wide string.
+      if len(buffer) > 0:
+        if int16(buffer[^1]) == int16(0x0A):
+          if len(buffer) > 1:
+            if int16(buffer[^2]) == int16(0x0D):
+              buffer.setLen(len(buffer) - 2)
+            else:
+              buffer.setLen(len(buffer) - 1)
+          else:
+            buffer.setLen(len(buffer) - 1)
+        elif int16(buffer[^1]) == int16(0x0D):
+          buffer.setLen(len(buffer) - 1)
+
+      # Convert Windows UTF-16 encoded string to UTF-8 encoded string.
+      if len(buffer) > 0:
+        var pwd = ""
+        let bytesNeeded = wideCharToMultiByte(CP_UTF8, 0'u32, addr buffer[0],
+                                              cint(len(buffer)), nil,
+                                              cint(0), nil, nil)
+        if bytesNeeded <= cint(0):
+          return err(ioLastError())
+        pwd.setLen(bytesNeeded)
+        let cres = wideCharToMultiByte(CP_UTF8, 0'u32, addr buffer[0],
+                                       cint(len(buffer)), addr pwd[0],
+                                       cint(len(pwd)), nil, nil)
+        if cres == cint(0):
+          return err(ioLastError())
+        ok(pwd)
+      else:
+        ok("")
+
+  proc writeConsoleOutput(data: string): IoResult[void] =
+    if len(data) == 0:
+      return ok()
+
+    let hConsoleOutput =
+      block:
+        let res = getStdHandle(STD_OUTPUT_HANDLE)
+        if res == INVALID_HANDLE_VALUE:
+          return err(ioLastError())
+        res
+
+    let prevOutputCP =
+      block:
+        let res = getConsoleOutputCP()
+        if res == cuint(0):
+          return err(ioLastError())
+        res
+
+    if isConsoleRedirected(hConsoleOutput):
+      # If STDOUT is redirected we should use WriteFile() because WriteConsole()
+      # is not working for such types of STDOUT.
+      if setConsoleOutputCP(CP_UTF8) == 0'i32:
+        return err(ioLastError())
+
+      let bytesToWrite = uint32(len(data))
+      var bytesWritten: uint32
+      let wres = writeFile(hConsoleOutput, cast[pointer](unsafeAddr data[0]),
+                           bytesToWrite, bytesWritten, nil)
+      if wres == 0'i32:
+        let errCode = ioLastError()
+        discard setConsoleOutputCP(prevOutputCP)
+        return err(errCode)
+
+      if setConsoleOutputCP(prevOutputCP) == 0'i32:
+        return err(ioLastError())
+    else:
+      if setConsoleOutputCP(CP_UTF8) == 0'i32:
+        return err(ioLastError())
+
+      let widePrompt = newWideCString(data)
+      var charsWritten: uint32
+      let wres = writeConsole(hConsoleOutput, cast[pointer](widePrompt),
+                              uint32(len(widePrompt)), charsWritten, nil)
+      if wres == 0'i32:
+        let errCode = ioLastError()
+        discard setConsoleOutputCP(prevOutputCP)
+        return err(errCode)
+
+      if setConsoleOutputCP(prevOutputCP) == 0'i32:
+        return err(ioLastError())
+    ok()
+
+elif defined(posix):
+  import posix, termios
+
+  proc isConsoleRedirected(consoleFd: cint): bool =
+    ## Returns ``true`` if console handle was redirected.
+    var mode: Termios
+    if tcGetAttr(consoleFd, addr mode) != cint(0):
+      let errCode = ioLastError()
+      if errCode == ENOTTY:
+        true
+      else:
+        false
+    else:
+      false
+
+  proc writeConsoleOutput(prompt: string): IoResult[void] =
+    if len(prompt) == 0:
+      ok()
+    else:
+      let res = posix.write(STDOUT_FILENO, cast[pointer](unsafeAddr prompt[0]),
+                            len(prompt))
+      if res != len(prompt):
+        err(ioLastError())
+      else:
+        ok()
+
+  proc readConsoleInput(maxBytes: int): IoResult[string] =
+    # Allocating buffer with size equal to `maxBytes` + len(LF)
+    var buffer = newString(maxBytes + 1)
+    let bytesRead =
+      if isConsoleRedirected(STDIN_FILENO):
+        let res = posix.read(STDIN_FILENO, cast[pointer](addr buffer[0]),
+                             len(buffer))
+        if res < 0:
+          return err(ioLastError())
+        res
+      else:
+        var cur, old: Termios
+        if tcGetAttr(STDIN_FILENO, addr cur) != cint(0):
+          return err(ioLastError())
+
+        old = cur
+        cur.c_lflag = cur.c_lflag and not(Cflag(ECHO))
+
+        if tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(cur)) != cint(0):
+          return err(ioLastError())
+
+        let res = read(STDIN_FILENO, cast[pointer](addr buffer[0]),
+                       len(buffer))
+        if res < 0:
+          discard tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(old))
+          return err(ioLastError())
+
+        if tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(old)) != cint(0):
+          return err(ioLastError())
+        res
+
+    # Truncate additional bytes from buffer.
+    buffer.setLen(min(maxBytes, bytesRead))
+    # Trim LF in result string
+    if len(buffer) > 0:
+      if buffer[^1] == char(0x0A):
+        buffer.setLen(len(buffer) - 1)
+    ok(buffer)
+
+proc readConsolePassword*(prompt: string,
+                          maxBytes = 32768): IoResult[string] =
+  ## Reads a password from stdin without printing it with length in bytes up to
+  ## ``maxBytes``.
+  ##
+  ## This procedure supports reading of UTF-8 encoded passwords from console or
+  ## redirected pipe. But ``maxBytes`` will limit
+  ##
+  ## Before reading password ``prompt`` will be printed.
+  ##
+  ## Please note that ``maxBytes`` should be in range (0, 32768].
+  doAssert(maxBytes > 0 and maxBytes <= 32768,
+           "maxBytes should be integer in (0, 32768]")
+  ? writeConsoleOutput(prompt)
+  let res = ? readConsoleInput(maxBytes)
+  # `\p` is platform specific newline: CRLF on Windows, LF on Unix
+  ? writeConsoleOutput("\p")
+  ok(res)

From cb44a0db40bcde9a5bfc24626f3cef7f5e3c33bd Mon Sep 17 00:00:00 2001
From: cheatfate <eugene.kabanov@status.im>
Date: Wed, 30 Sep 2020 11:21:31 +0300
Subject: [PATCH 2/9] Fix error handler.

---
 stew/conio.nim | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/stew/conio.nim b/stew/conio.nim
index cad1e4e7..c8d9cf91 100644
--- a/stew/conio.nim
+++ b/stew/conio.nim
@@ -305,8 +305,9 @@ elif defined(posix):
         let res = read(STDIN_FILENO, cast[pointer](addr buffer[0]),
                        len(buffer))
         if res < 0:
+          let errCode = ioLastError()
           discard tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(old))
-          return err(ioLastError())
+          return err(errCode)
 
         if tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(old)) != cint(0):
           return err(ioLastError())

From 4363df120fb572877512786bdb2f28e841a8886f Mon Sep 17 00:00:00 2001
From: cheatfate <eugene.kabanov@status.im>
Date: Wed, 30 Sep 2020 16:19:51 +0300
Subject: [PATCH 3/9] Use proper method of console identification (msvcrt).

---
 stew/conio.nim | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/stew/conio.nim b/stew/conio.nim
index c8d9cf91..7f7c3bab 100644
--- a/stew/conio.nim
+++ b/stew/conio.nim
@@ -51,6 +51,8 @@ when defined(windows):
                            lpDefaultChar: pointer,
                            lpUsedDefaultChar: pointer): cint {.
        importc: "WideCharToMultiByte", stdcall, dynlib: "kernel32", sideEffect.}
+  proc getFileType(hFile: uint): uint32 {.
+       importc: "GetFileType", stdcall, dynlib: "kernel32", sideEffect.}
 
   const
     CP_UTF8 = 65001'u32
@@ -59,20 +61,17 @@ when defined(windows):
     INVALID_HANDLE_VALUE = cast[uint](-1)
     ENABLE_PROCESSED_INPUT = 0x0001'u32
     ENABLE_ECHO_INPUT = 0x0004'u32
-    ERROR_INVALID_HANDLE = 0x0006'u32
+    FILE_TYPE_CHAR = 0x0002'u32
 
   proc isConsoleRedirected(hConsole: uint): bool =
     ## Returns ``true`` if console handle was redirected.
-    var mode: uint32
-    let res = getConsoleMode(hConsole, mode)
-    if res == 0:
-      let errCode = ioLastError()
-      if errCode == ERROR_INVALID_HANDLE:
-        true
-      else:
-        false
-    else:
+    let res = getFileType(hConsole)
+    if res == FILE_TYPE_CHAR:
+      # The specified handle is a character device, typically an LPT device or a
+      # console.
       false
+    else:
+      true
 
   proc readConsoleInput(maxBytes: int): IoResult[string] =
     let hConsoleInput =

From 6ee4974c7f884fefd61f8c61355a4fe84de7602d Mon Sep 17 00:00:00 2001
From: cheatfate <eugene.kabanov@status.im>
Date: Wed, 30 Sep 2020 16:34:10 +0300
Subject: [PATCH 4/9] Use proper method of console identification (glibc).

---
 stew/conio.nim | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/stew/conio.nim b/stew/conio.nim
index 7f7c3bab..6ed3418c 100644
--- a/stew/conio.nim
+++ b/stew/conio.nim
@@ -260,12 +260,9 @@ elif defined(posix):
   proc isConsoleRedirected(consoleFd: cint): bool =
     ## Returns ``true`` if console handle was redirected.
     var mode: Termios
+    # This is how `isatty()` checks for TTY.
     if tcGetAttr(consoleFd, addr mode) != cint(0):
-      let errCode = ioLastError()
-      if errCode == ENOTTY:
-        true
-      else:
-        false
+      true
     else:
       false
 

From cc001fa88f23133f2b279fb138caa0580c77fe0d Mon Sep 17 00:00:00 2001
From: cheatfate <eugene.kabanov@status.im>
Date: Wed, 30 Sep 2020 18:15:56 +0300
Subject: [PATCH 5/9] Make isConsoleRedirected() public API.

---
 stew/conio.nim | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stew/conio.nim b/stew/conio.nim
index 6ed3418c..8e78dc39 100644
--- a/stew/conio.nim
+++ b/stew/conio.nim
@@ -63,7 +63,7 @@ when defined(windows):
     ENABLE_ECHO_INPUT = 0x0004'u32
     FILE_TYPE_CHAR = 0x0002'u32
 
-  proc isConsoleRedirected(hConsole: uint): bool =
+  proc isConsoleRedirected*(hConsole: uint): bool =
     ## Returns ``true`` if console handle was redirected.
     let res = getFileType(hConsole)
     if res == FILE_TYPE_CHAR:
@@ -257,7 +257,7 @@ when defined(windows):
 elif defined(posix):
   import posix, termios
 
-  proc isConsoleRedirected(consoleFd: cint): bool =
+  proc isConsoleRedirected*(consoleFd: cint): bool =
     ## Returns ``true`` if console handle was redirected.
     var mode: Termios
     # This is how `isatty()` checks for TTY.

From b0bbeb49d2380023f4da7da8f2b61fa6c13ae10b Mon Sep 17 00:00:00 2001
From: cheatfate <eugene.kabanov@status.im>
Date: Sun, 4 Oct 2020 19:33:13 +0300
Subject: [PATCH 6/9] Add UTF-8 validation procedure.

---
 stew/utf8.nim       |  91 +++++++++++++++++++++
 tests/all_tests.nim |   3 +-
 tests/test_utf8.nim | 193 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 286 insertions(+), 1 deletion(-)
 create mode 100644 stew/utf8.nim
 create mode 100644 tests/test_utf8.nim

diff --git a/stew/utf8.nim b/stew/utf8.nim
new file mode 100644
index 00000000..e2b8599a
--- /dev/null
+++ b/stew/utf8.nim
@@ -0,0 +1,91 @@
+## Copyright (c) 2020 Status Research & Development GmbH
+## Licensed under either of
+##  * Apache License, version 2.0, ([LICENSE-APACHE](LICENSE-APACHE))
+##  * MIT license ([LICENSE-MIT](LICENSE-MIT))
+## at your option.
+## This file may not be copied, modified, or distributed except according to
+## those terms.
+
+## This module implements UTF-8 related procedures.
+
+proc validateUtf8*[T: byte|char](data: openarray[T]): bool =
+  ## Returns ``true`` if ``data`` is correctly UTF-8 encoded string.
+  var index = 0
+
+  while true:
+    let byte1 =
+      block:
+        var b: byte
+        while true:
+          if index >= len(data):
+            return true
+          b = when T is byte: data[index] else: byte(data[index])
+          inc(index)
+          if b >= 0x80'u8:
+            break
+        b
+
+    if (byte1 and 0xE0'u8) == 0xC0'u8:
+      # Two-byte form (110xxxxx 10xxxxxx)
+      if index >= len(data):
+        return false
+      # overlong sequence test
+      if (byte1 and 0xFE'u8) == 0xC0'u8:
+        return false
+
+      let byte2 = when T is byte: data[index] else: byte(data[index])
+      if (byte2 and 0xC0'u8) != 0x80'u8:
+        return false
+      inc(index)
+
+    elif (byte1 and 0xF0'u8) == 0xE0'u8:
+      # Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
+      if (index + 1) >= len(data):
+        return false
+
+      let byte2 = when T is byte: data[index] else: byte(data[index])
+      if (byte2 and 0xC0'u8) != 0x80'u8:
+        return false
+      # overlong sequence test
+      if (byte1 == 0xE0'u8) and ((byte2 and 0xE0'u8) == 0x80'u8):
+        return false
+      #  0xD800–0xDFFF (UTF-16 surrogates) test
+      if (byte1 == 0xED'u8) and ((byte2 and 0xE0'u8) == 0xA0'u8):
+        return false
+
+      let byte3 = when T is byte: data[index + 1] else: byte(data[index + 1])
+      if (byte3 and 0xC0'u8) != 0x80'u8:
+        return false
+      # U+FFFE or U+FFFF test
+      if (byte1 == 0xEF'u8) and (byte2 == 0xBF'u8) and
+         ((byte3 and 0xFE'u8) == 0xBE'u8):
+        return false
+      inc(index, 2)
+
+    elif (byte1 and 0xF8'u8) == 0xF0'u8:
+      # Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+      if (index + 2) >= len(data):
+        return false
+
+      let byte2 = when T is byte: data[index] else: byte(data[index])
+      if (byte2 and 0xC0'u8) != 0x80'u8:
+        return false
+      # overlong sequence test
+      if (byte1 == 0xF0'u8) and ((byte2 and 0xF0'u8) == 0x80'u8):
+        return false
+      # According to RFC 3629 no point above U+10FFFF should be used, which
+      # limits characters to four bytes.
+      if ((byte1 == 0xF4'u8) and (byte2 > 0x8F'u8)) or (byte1 > 0xF4'u8):
+        return false
+
+      let byte3 = when T is byte: data[index + 1] else: byte(data[index + 1])
+      if (byte3 and 0xC0'u8) != 0x80'u8:
+        return false
+
+      let byte4 = when T is byte: data[index + 2] else: byte(data[index + 2])
+      if (byte4 and 0xC0'u8) != 0x80'u8:
+        return false
+      inc(index, 3)
+
+    else:
+      return false
diff --git a/tests/all_tests.nim b/tests/all_tests.nim
index 7b4f42c1..eeba4089 100644
--- a/tests/all_tests.nim
+++ b/tests/all_tests.nim
@@ -23,4 +23,5 @@ import
   test_varints,
   test_ctops,
   test_io2,
-  test_winacl
\ No newline at end of file
+  test_winacl,
+  test_utf8
diff --git a/tests/test_utf8.nim b/tests/test_utf8.nim
new file mode 100644
index 00000000..da4ed533
--- /dev/null
+++ b/tests/test_utf8.nim
@@ -0,0 +1,193 @@
+import std/unittest
+import ../stew/utf8
+
+proc toUTF4(value: uint32): array[4, byte] =
+  doAssert(value >= 0x10000'u32 and value < 0x200000'u32)
+  [
+    0xF0'u8 or byte((value shr 18) and 0x07),
+    0x80'u8 or byte((value shr 12) and 0x3F),
+    0x80'u8 or byte((value shr 6) and 0x3F),
+    0x80'u8 or byte(value and 0x3F)
+  ]
+
+proc toUTF3(value: uint32): array[3, byte] =
+  doAssert(value >= 0x800'u32 and value < 0x10000'u32)
+  [
+    0xE0'u8 or byte((value shr 12) and 0x0F),
+    0x80'u8 or byte((value shr 6) and 0x3F),
+    0x80'u8 or byte(value and 0x3F)
+  ]
+
+proc toUTF2(value: uint32): array[2, byte] =
+  doAssert(value >= 0x80'u32 and value < 0x800'u32)
+  [
+    0xC0'u8 or byte((value shr 6) and 0x1F),
+    0x80'u8 or byte(value and 0x3F)
+  ]
+
+proc toUTF1(value: uint32): array[1, byte] =
+  doAssert(value < 0x80'u32)
+  [ byte(value and 0x7F) ]
+
+suite "UTF-8 validation test suite":
+  test "Values [U+0000, U+007F] are allowed":
+    for i in 0x00'u32 .. 0x7F'u32:
+      check validateUtf8(toUTF1(i)) == true
+  test "Values [U+0080, U+07FF] are allowed":
+    for i in 0x80'u32 .. 0x7FF'u32:
+      check validateUtf8(toUTF2(i)) == true
+  test "Values [U+0800, U+D7FF] are allowed":
+    for i in 0x800'u32 .. 0xD7FF'u32:
+      check validateUtf8(toUTF3(i)) == true
+  test "Values [U+D800, U+DFFF] (UTF-16 surrogates) are not allowed":
+    for i in 0xD800'u32 .. 0xDFFF'u32:
+      check validateUtf8(toUTF3(i)) == false
+  test "Values [U+E000, U+FFFD] are allowed":
+    for i in 0xE000'u32 .. 0xFFFD'u32:
+      check validateUtf8(toUTF3(i)) == true
+  test "Values U+FFFE and U+FFFF are not allowed":
+    check:
+      validateUtf8(toUTF3(0xFFFE'u32)) == false
+      validateUtf8(toUTF3(0xFFFF'u32)) == false
+  test "Values [U+10000, U10FFFF] are allowed":
+    for i in 0x10000'u32 .. 0x10FFFF'u32:
+      check validateUtf8(toUTF4(i)) == true
+  test "Values bigger U+10FFFF are not allowed":
+    for i in 0x11_0000'u32 .. 0x1F_FFFF'u32:
+      check validateUtf8(toUTF4(i)) == false
+  test "fastvalidate-utf-8 bad sequences":
+    # https://github.com/lemire/fastvalidate-utf-8 test vectors
+    const
+      GoodSequences = [
+        "a",
+        "\xc3\xb1",
+        "\xe2\x82\xa1",
+        "\xf0\x90\x8c\xbc",
+        "안녕하세요, 세상",
+        "\xc2\x80",
+        "\xf0\x90\x80\x80",
+        "\xee\x80\x80"
+      ]
+
+      BadSequences = [
+        "\xc3\x28",
+        "\xa0\xa1",
+        "\xe2\x28\xa1",
+        "\xe2\x82\x28",
+        "\xf0\x28\x8c\xbc",
+        "\xf0\x90\x28\xbc",
+        "\xf0\x28\x8c\x28",
+        "\xc0\x9f",
+        "\xf5\xff\xff\xff",
+        "\xed\xa0\x81",
+        "\xf8\x90\x80\x80\x80",
+        "123456789012345\xed",
+        "123456789012345\xf1",
+        "123456789012345\xc2",
+        "\xC2\x7F",
+        "\xce",
+        "\xce\xba\xe1",
+        "\xce\xba\xe1\xbd",
+        "\xce\xba\xe1\xbd\xb9\xcf",
+        "\xce\xba\xe1\xbd\xb9\xcf\x83\xce",
+        "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce",
+        "\xdf",
+        "\xef\xbf"
+      ]
+    for item in BadSequences:
+      check validateUtf8(item) == false
+    for item in GoodSequences:
+      check validateUtf8(item) == true
+  test "UTF-8 decoder capability and stress test":
+    # https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+    const Tests2 = [
+      # Boundary condition test cases
+      ("\x00", true),
+      ("\xc2\x80", true),
+      ("\xe0\xa0\x80", true),
+      ("\xf0\x90\x80\x80", true),
+      ("\xf8\x88\x80\x80\x80", false),
+      ("\xfc\x84\x80\x80\x80\x80", false),
+      ("\x7f", true),
+      ("\xdf\xbf", true),
+      ("\xef\xbf\xbf", false),
+      ("\xf7\xbf\xbf\xbf", false),
+      ("\xfb\xbf\xbf\xbf\xbf", false),
+      ("\xfd\xbf\xbf\xbf\xbf\xbf", false),
+      ("\xed\x9f\xbf", true),
+      ("\xee\x80\x80", true),
+      ("\xef\xbf\xbd", true),
+      ("\xf4\x8f\xbf\xbf", true),
+    ]
+
+    const Tests3 = [
+      # Malformed sequences
+      ("\x80", false),
+      ("\xbf", false),
+      ("\x80\xbf", false),
+      ("\x80\xbf\x80", false),
+      ("\x80\xbf\x80\xbf", false),
+      ("\x80\xbf\x80\xbf\x80", false),
+      ("\x80\xbf\x80\xbf\x80\xbf", false),
+      ("\x80\xbf\x80\xbf\x80\xbf\x80", false),
+      ("\xc0", false),
+      ("\xe0\x80", false),
+      ("\xf0\x80\x80", false),
+      ("\xf8\x80\x80\x80", false),
+      ("\xfc\x80\x80\x80\x80", false),
+      ("\xdf", false),
+      ("\xef\xbf", false),
+      ("\xf7\xbf\xbf", false),
+      ("\xfb\xbf\xbf\xbf", false),
+      ("\xfd\xbf\xbf\xbf\xbf", false),
+      ("\xfe", false),
+      ("\xff", false),
+      ("\xfe\xfe\xff\xff", false)
+    ]
+
+    const Tests4 = [
+      # Overlong sequences
+      ("\xc0\xaf", false),
+      ("\xe0\x80\xaf", false),
+      ("\xf0\x80\x80\xaf", false),
+      ("\xf8\x80\x80\x80\xaf", false),
+      ("\xfc\x80\x80\x80\x80\xaf", false),
+      ("\xc1\xbf", false),
+      ("\xe0\x9f\xbf", false),
+      ("\xf0\x8f\xbf\xbf", false),
+      ("\xf8\x87\xbf\xbf\xbf", false),
+      ("\xfc\x83\xbf\xbf\xbf\xbf", false),
+      ("\xc0\x80", false),
+      ("\xe0\x80\x80", false),
+      ("\xf0\x80\x80\x80", false),
+      ("\xf8\x80\x80\x80\x80", false),
+      ("\xfc\x80\x80\x80\x80\x80", false)
+    ]
+
+    const Tests5 = [
+      # Illegal code positions
+      ("\xed\xa0\x80", false),
+      ("\xed\xad\xbf", false),
+      ("\xed\xae\x80", false),
+      ("\xed\xaf\xbf", false),
+      ("\xed\xb0\x80", false),
+      ("\xed\xbe\x80", false),
+      ("\xed\xbf\xbf", false),
+      ("\xed\xa0\x80\xed\xb0\x80", false),
+      ("\xed\xa0\x80\xed\xbf\xbf", false),
+      ("\xed\xad\xbf\xed\xb0\x80", false),
+      ("\xed\xad\xbf\xed\xbf\xbf", false),
+      ("\xed\xae\x80\xed\xb0\x80", false),
+      ("\xed\xae\x80\xed\xbf\xbf", false),
+      ("\xed\xaf\xbf\xed\xb0\x80", false),
+      ("\xed\xaf\xbf\xed\xbf\xbf", false)
+    ]
+
+    for item in Tests2:
+      check validateUtf8(item[0]) == item[1]
+    for item in Tests3:
+      check validateUtf8(item[0]) == item[1]
+    for item in Tests4:
+      check validateUtf8(item[0]) == item[1]
+    for item in Tests5:
+      check validateUtf8(item[0]) == item[1]

From 39fb71bceca73eb2209021638273c820ab7f3f07 Mon Sep 17 00:00:00 2001
From: cheatfate <eugene.kabanov@status.im>
Date: Wed, 7 Oct 2020 23:50:38 +0300
Subject: [PATCH 7/9] Add UTF-8 length procedure. Add UTF-8 offset procedure.
 Add UTF-8 substr procedure. Add wchar_t to UTF-8 conversion procedure. Add
 multibyte to wchar_t conversion procedure (posix). Add UTF-8 tests. Fix
 password reader to validate utf-8 encoding when reading from pipe. Fix
 password reader to read utf-8 encoded strings from *nix console.

---
 stew/conio.nim      | 151 ++++++++++++++++++---------
 stew/utf8.nim       | 248 +++++++++++++++++++++++++++++++++++++++++++-
 tests/test_utf8.nim | 156 +++++++++++++++++++++++++---
 3 files changed, 488 insertions(+), 67 deletions(-)

diff --git a/stew/conio.nim b/stew/conio.nim
index 8e78dc39..e1115885 100644
--- a/stew/conio.nim
+++ b/stew/conio.nim
@@ -7,7 +7,7 @@
 ## those terms.
 
 ## This module implements cross-platform console procedures.
-import io2
+import io2, utf8
 export io2
 
 when defined(windows):
@@ -62,6 +62,7 @@ when defined(windows):
     ENABLE_PROCESSED_INPUT = 0x0001'u32
     ENABLE_ECHO_INPUT = 0x0004'u32
     FILE_TYPE_CHAR = 0x0002'u32
+    ERROR_NO_UNICODE_TRANSLATION = 1113'u32
 
   proc isConsoleRedirected*(hConsole: uint): bool =
     ## Returns ``true`` if console handle was redirected.
@@ -73,7 +74,7 @@ when defined(windows):
     else:
       true
 
-  proc readConsoleInput(maxBytes: int): IoResult[string] =
+  proc readConsoleInput(maxChars: int): IoResult[string] =
     let hConsoleInput =
       block:
         let res = getStdHandle(STD_INPUT_HANDLE)
@@ -94,8 +95,9 @@ when defined(windows):
       if setConsoleCP(CP_UTF8) == 0'i32:
         return err(ioLastError())
 
-      # Allocating buffer with size equal to `maxBytes` + len(CRLF)
-      var buffer = newString(maxBytes + 2)
+      # Allocating buffer with size equal to `(maxChars + len(CRLF)) * 4`,
+      # where 4 is maximum expected size of one character (UTF8 encoding).
+      var buffer = newString((maxChars + 2) * 4)
       let bytesToRead = uint32(len(buffer))
       var bytesRead: uint32
       let rres = readFile(hConsoleInput, cast[pointer](addr buffer[0]),
@@ -109,7 +111,7 @@ when defined(windows):
         return err(ioLastError())
 
       # Truncate additional bytes from buffer.
-      buffer.setLen(int(min(bytesRead, uint32(maxBytes))))
+      buffer.setLen(int(bytesRead))
 
       # Trim CR/CRLF from buffer.
       if len(buffer) > 0:
@@ -123,7 +125,13 @@ when defined(windows):
             buffer.setLen(len(buffer) - 1)
         elif buffer[^1] == char(0x0D):
           buffer.setLen(len(buffer) - 1)
-      ok(buffer)
+
+      # Check if buffer is valid UTF-8 encoded string.
+      if utf8Validate(buffer):
+        # Cut result buffer to `maxChars` characters.
+        ok(utf8Substr(buffer, 0, maxChars - 1).get())
+      else:
+        err(IoErrorCode(ERROR_NO_UNICODE_TRANSLATION))
     else:
       let prevMode =
         block:
@@ -147,8 +155,8 @@ when defined(windows):
         discard setConsoleCP(prevInputCP)
         return err(errCode)
 
-      # Allocating buffer with size equal to `maxBytes` + len(CRLF)
-      var buffer = newSeq[Utf16Char](maxBytes + 2)
+      # Allocating buffer with size equal to `maxChars` + len(CRLF).
+      var buffer = newSeq[Utf16Char](maxChars + 2)
       let charsToRead = uint32(len(buffer))
       var charsRead: uint32
       let rres = readConsole(hConsoleInput, cast[pointer](addr buffer[0]),
@@ -170,7 +178,8 @@ when defined(windows):
         return err(ioLastError())
 
       # Truncate additional bytes from buffer.
-      buffer.setLen(int(min(charsRead, uint32(maxBytes))))
+      buffer.setLen(int(min(charsRead, uint32(maxChars))))
+
       # Truncate CRLF in result wide string.
       if len(buffer) > 0:
         if int16(buffer[^1]) == int16(0x0A):
@@ -184,7 +193,7 @@ when defined(windows):
         elif int16(buffer[^1]) == int16(0x0D):
           buffer.setLen(len(buffer) - 1)
 
-      # Convert Windows UTF-16 encoded string to UTF-8 encoded string.
+      # Convert Windows UCS-2 encoded string to UTF-8 encoded string.
       if len(buffer) > 0:
         var pwd = ""
         let bytesNeeded = wideCharToMultiByte(CP_UTF8, 0'u32, addr buffer[0],
@@ -277,61 +286,101 @@ elif defined(posix):
       else:
         ok()
 
-  proc readConsoleInput(maxBytes: int): IoResult[string] =
-    # Allocating buffer with size equal to `maxBytes` + len(LF)
-    var buffer = newString(maxBytes + 1)
-    let bytesRead =
-      if isConsoleRedirected(STDIN_FILENO):
-        let res = posix.read(STDIN_FILENO, cast[pointer](addr buffer[0]),
-                             len(buffer))
-        if res < 0:
-          return err(ioLastError())
-        res
+  proc readConsoleInput(maxChars: int): IoResult[string] =
+    # Allocating buffer with size equal to `(maxChars + len(LF)) * 4`, where
+    # 4 is maximum expected size of one character (UTF8 encoding).
+    var buffer = newString((maxChars + 1) * 4)
+
+    if isConsoleRedirected(STDIN_FILENO):
+      let bytesRead =
+        block:
+          let res = posix.read(STDIN_FILENO, cast[pointer](addr buffer[0]),
+                               len(buffer))
+          if res < 0:
+            return err(ioLastError())
+          res
+
+      # Truncate additional bytes from buffer.
+      buffer.setLen(bytesRead)
+
+      # Trim LF in result string
+      if len(buffer) > 0:
+        if buffer[^1] == char(0x0A):
+          buffer.setLen(len(buffer) - 1)
+
+      # Check if buffer is valid UTF-8 encoded string.
+      if utf8Validate(buffer):
+        # Cut result buffer to `maxChars` characters.
+        ok(utf8Substr(buffer, 0, maxChars - 1).get())
       else:
-        var cur, old: Termios
-        if tcGetAttr(STDIN_FILENO, addr cur) != cint(0):
-          return err(ioLastError())
+        err(IoErrorCode(EILSEQ))
+    else:
+      let bytesRead =
+        block:
+          var cur, old: Termios
+          if tcGetAttr(STDIN_FILENO, addr cur) != cint(0):
+            return err(ioLastError())
 
-        old = cur
-        cur.c_lflag = cur.c_lflag and not(Cflag(ECHO))
+          old = cur
+          cur.c_lflag = cur.c_lflag and not(Cflag(ECHO))
 
-        if tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(cur)) != cint(0):
-          return err(ioLastError())
+          if tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(cur)) != cint(0):
+            return err(ioLastError())
 
-        let res = read(STDIN_FILENO, cast[pointer](addr buffer[0]),
-                       len(buffer))
-        if res < 0:
-          let errCode = ioLastError()
-          discard tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(old))
-          return err(errCode)
+          let res = read(STDIN_FILENO, cast[pointer](addr buffer[0]),
+                         len(buffer))
+          if res < 0:
+            let errCode = ioLastError()
+            discard tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(old))
+            return err(errCode)
 
-        if tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(old)) != cint(0):
-          return err(ioLastError())
-        res
+          if tcSetAttr(STDIN_FILENO, TCSADRAIN, addr(old)) != cint(0):
+            return err(ioLastError())
+          res
 
-    # Truncate additional bytes from buffer.
-    buffer.setLen(min(maxBytes, bytesRead))
-    # Trim LF in result string
-    if len(buffer) > 0:
-      if buffer[^1] == char(0x0A):
-        buffer.setLen(len(buffer) - 1)
-    ok(buffer)
+      # Truncate additional bytes from buffer.
+      buffer.setLen(bytesRead)
+
+      # Trim LF in result string
+      if len(buffer) > 0:
+        if buffer[^1] == char(0x0A):
+          buffer.setLen(len(buffer) - 1)
+      buffer.add(char(0x00))
+
+      # Conversion of console input into wide characters sequence.
+      let wres = mbstowcs(uint32, buffer)
+      if wres.isOk():
+        # Trim wide character sequence to `maxChars` number of characters.
+        var wbuffer = wres.get()
+        if maxChars < len(wbuffer):
+          wbuffer.setLen(maxChars)
+        # Conversion of wide characters sequence to UTF-8 encoded string.
+        let ures = wbuffer.wcharToUtf8()
+        if ures.isOk():
+          ok(ures.get())
+        else:
+          err(IoErrorCode(EILSEQ))
+      else:
+        err(IoErrorCode(EILSEQ))
 
 proc readConsolePassword*(prompt: string,
-                          maxBytes = 32768): IoResult[string] =
-  ## Reads a password from stdin without printing it with length in bytes up to
-  ## ``maxBytes``.
+                          maxChars = 32768): IoResult[string] =
+  ## Reads a password from stdin without printing it with length in characters
+  ## up to ``maxChars``.
   ##
   ## This procedure supports reading of UTF-8 encoded passwords from console or
-  ## redirected pipe. But ``maxBytes`` will limit
+  ## redirected pipe.
   ##
   ## Before reading password ``prompt`` will be printed.
   ##
-  ## Please note that ``maxBytes`` should be in range (0, 32768].
-  doAssert(maxBytes > 0 and maxBytes <= 32768,
-           "maxBytes should be integer in (0, 32768]")
+  ## Please note that ``maxChars`` should be in range (0, 32768].
+  doAssert(maxChars > 0 and maxChars <= 32768,
+           "maxChars should be integer in (0, 32768]")
   ? writeConsoleOutput(prompt)
-  let res = ? readConsoleInput(maxBytes)
+  let res = ? readConsoleInput(maxChars)
   # `\p` is platform specific newline: CRLF on Windows, LF on Unix
   ? writeConsoleOutput("\p")
   ok(res)
+
+when isMainModule:
+  echo readConsolePassword("Enter password: ", 4)
diff --git a/stew/utf8.nim b/stew/utf8.nim
index e2b8599a..d2d43034 100644
--- a/stew/utf8.nim
+++ b/stew/utf8.nim
@@ -7,8 +7,21 @@
 ## those terms.
 
 ## This module implements UTF-8 related procedures.
+import results, io2
+export results
 
-proc validateUtf8*[T: byte|char](data: openarray[T]): bool =
+type
+  UResult*[T] = Result[T, cstring]
+  Wides* = int16 | uint16 | int32 | uint32
+  Bytes* = int8 | char | uint8 | byte
+
+const
+  ErrorBufferOverflow* = cstring"Buffer is not large enough"
+  ErrorInvalidSequence* = cstring"Invalid Unicode sequence found"
+  ErrorInvalidLocale* = cstring"Could not obtain system locale"
+  ErrorNotEnoughCharacters* = cstring"Not enough characters in string"
+
+proc utf8Validate*[T: Bytes](data: openarray[T]): bool =
   ## Returns ``true`` if ``data`` is correctly UTF-8 encoded string.
   var index = 0
 
@@ -89,3 +102,236 @@ proc validateUtf8*[T: byte|char](data: openarray[T]): bool =
 
     else:
       return false
+
+proc utf8Length*[T: Bytes](data: openarray[T]): UResult[int] =
+  ## Returns number of UTF-8 encoded characters in array ``data``.
+  ##
+  ## NOTE: Validate data with `utf8Validate()` before using this procedure,
+  ## otherwise length returned by this procedure could be incorrect.
+  var index = 0
+  var size = 0
+  while index < len(data):
+    let ch = uint(data[index])
+    if ch < 0x80:
+      inc(index, 1)
+    elif (ch and 0xE0'u8) == 0xC0'u8:
+      inc(index, 2)
+    elif (ch and 0xF0'u8) == 0xE0'u8:
+      inc(index, 3)
+    elif (ch and 0xF8'u8) == 0xF0'u8:
+      inc(index, 4)
+    else:
+      return err(ErrorInvalidSequence)
+    inc(size)
+  if index == len(data):
+    ok(size)
+  else:
+    err(ErrorInvalidSequence)
+
+proc utf8Offset*[T: Bytes](data: openarray[T], index: int): UResult[int] =
+  ## Return offset in UTF-8 encoded string ``data`` for character position
+  ## ``index``.
+  if index <= 0:
+    return ok(0)
+
+  var byteIndex = 0
+  var charIndex = 0
+
+  while (byteIndex < len(data)) and (charIndex < index):
+    let ch = uint(data[byteIndex])
+    if ch < 0x80:
+      inc(byteIndex, 1)
+    elif (ch and 0xE0'u8) == 0xC0'u8:
+      inc(byteIndex, 2)
+    elif (ch and 0xF0'u8) == 0xE0'u8:
+      inc(byteIndex, 3)
+    elif (ch and 0xF8'u8) == 0xF0'u8:
+      inc(byteIndex, 4)
+    else:
+      return err(ErrorInvalidSequence)
+    inc(charIndex)
+
+  if charIndex == index:
+    ok(byteIndex)
+  else:
+    err(ErrorNotEnoughCharacters)
+
+proc utf8Substr*[T: Bytes](data: openarray[T],
+                           start, finish: int): UResult[string] =
+  ## Returns the substring of ``data`` between starting character (not byte)
+  ## index ``start`` and terminating character (not byte) index ``finish``,
+  ## both inclusive.
+  ##
+  ## ``data`` should be a correctly UTF-8 encoded string, because only the
+  ## initial octet of each character is validated.
+  ##
+  ## ``start`` - The starting index of the substring, any value BELOW or EQUAL
+  ## to zero will be considered as zero. If ``start`` index is not present in
+  ## string ``data`` empty string will be returned as result.
+  ##
+  ## ``finish`` - The terminating index of the substring, any value BELOW
+  ## zero will be considered as `len(data)`.
+  let soffset =
+    if start <= 0:
+      0
+    elif start >= len(data):
+      return ok("")
+    else:
+      let res = utf8Offset(data, start)
+      if res.isErr():
+        if res.error != ErrorNotEnoughCharacters:
+          return err(res.error)
+        return ok("")
+      else:
+        res.get()
+
+  let eoffset =
+    if finish < 0:
+      len(data)
+    elif finish >= len(data):
+      len(data)
+    else:
+      let res = utf8Offset(data, finish + 1)
+      if res.isErr():
+        if res.error != ErrorNotEnoughCharacters:
+          return err(res.error)
+        len(data)
+      else:
+        res.get()
+
+  var res = newString(eoffset - soffset)
+  var k = 0
+  for i in soffset ..< eoffset:
+    res[k] = cast[char](data[i])
+    inc(k)
+  ok(res)
+
+proc wcharToUtf8*[A: Wides, B: Bytes](input: openarray[A],
+                                      output: var openarray[B]): UResult[int] =
+  ## Converts WCHAR sequence ``input`` to UTF-8 array of octets ``output``.
+  ##
+  ## Procedure supports 4-byte (Linux) and 2-byte sequences (Windows) as input.
+  var offset = 0
+  for item in input:
+    let uitem = uint(item)
+    let codepoint =
+      if uitem >= 0xD800'u and uitem <= 0xDBFF'u:
+        0x10000'u + ((uitem - 0xD800'u) shl 10)
+      else:
+        if uitem >= 0xDC00'u and uitem <= 0xDFFF'u:
+          uitem - 0xDC00'u
+        else:
+          uitem
+    if codepoint <= 0x7F'u:
+      if len(output) > 0:
+        if offset < len(output):
+          output[offset] = cast[B](codepoint and 0x7F'u)
+        else:
+          return err(ErrorBufferOverflow)
+      inc(offset, 1)
+    elif codepoint <= 0x7FF'u:
+      if len(output) > 0:
+        if offset + 1 < len(output):
+          output[offset + 0] = cast[B](0xC0'u8 or
+                                       byte((codepoint shr 6) and 0x1F'u))
+          output[offset + 1] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u))
+        else:
+          return err(ErrorBufferOverflow)
+      inc(offset, 2)
+    elif codepoint <= 0xFFFF'u:
+      if len(output) > 0:
+        if offset + 2 < len(output):
+          output[offset + 0] = cast[B](0xE0'u8 or
+                                       byte((codepoint shr 12) and 0x0F'u))
+          output[offset + 1] = cast[B](0x80'u8 or
+                                       byte((codepoint shr 6) and 0x3F'u))
+          output[offset + 2] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u))
+        else:
+          return err(ErrorBufferOverflow)
+      inc(offset, 3)
+    elif codepoint <= 0x10FFFF'u:
+      if len(output) > 0:
+        if offset + 3 < len(output):
+          output[offset + 0] = cast[B](0xF0'u8 or
+                                       byte((codepoint shr 18) and 0x07'u))
+          output[offset + 1] = cast[B](0x80'u8 or
+                                       byte((codepoint shr 12) and 0x3F'u))
+          output[offset + 2] = cast[B](0x80'u8 or
+                                       byte((codepoint shr 6) and 0x3F'u))
+          output[offset + 3] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u))
+        else:
+          return err("")
+      inc(offset, 4)
+    else:
+      return err(ErrorInvalidSequence)
+  ok(offset)
+
+proc wcharToUtf8*[T: Wides](input: openarray[T]): UResult[string] {.inline.} =
+  ## Converts wide character
+  var empty: array[0, char]
+  let size = ? wcharToUtf8(input, empty)
+  var output = newString(size)
+  let res {.used.} = ? wcharToUtf8(input, output)
+  ok(output)
+
+when defined(posix):
+  import posix
+
+  type
+    Mbstate {.importc: "mbstate_t",
+              header: "<wchar.h>", pure, final.} = object
+
+  proc mbsrtowcs(dest: pointer, src: pointer, n: csize_t,
+                 ps: ptr Mbstate): csize_t {.
+       importc, header: "<wchar.h>".}
+
+  proc mbstowcs*[A: Bytes, B: Wides](t: typedesc[B],
+                                     input: openarray[A]): UResult[seq[B]] =
+    ## Converts multibyte encoded string to OS specific wide char string.
+    ##
+    ## Note, that `input` should be `0` terminated.
+    ##
+    ## Conversion is done using `mbsrtowcs`, so the procedure tolerates invalid
+    ## sequences and is able to decode all the characters before the first
+    ## invalid character encountered.
+
+    # Explicitly set the locale, because otherwise `mbsrtowcs` will fail with
+    # EILSEQ.
+    # If locale is an empty string, "", each part of the locale that should
+    # be modified is set according to the environment variables.
+    let sres = setlocale(LC_ALL, cstring"")
+    if isNil(sres):
+      return err(ErrorInvalidLocale)
+
+    var buffer = newSeq[B](len(input))
+    if len(input) == 0:
+      return ok(buffer)
+
+    doAssert(input[^1] == A(0), "Input array should be zero-terminated")
+    var data = @input
+    var ostr = addr data[0]
+    var pstr = ostr
+    var mstate = Mbstate()
+
+    while true:
+      let res = mbsrtowcs(addr buffer[0], addr pstr, csize_t(len(buffer)),
+                          addr mstate)
+      if res == cast[csize_t](-1):
+        # If an invalid multibyte sequence has been encountered, ``pstr`` is
+        # left pointing to the invalid multibyte sequence, ``-1`` is returned,
+        # and errno is set to EILSEQ.
+        let diff = cast[uint](pstr) - cast[uint](ostr)
+        if diff == 0:
+          return err(ErrorInvalidSequence)
+        else:
+          # We have partially decoded sequence, `diff` is position of first
+          # invalid character in sequence.
+          data[diff] = A(0x00)
+          ostr = addr data[0]
+          pstr = ostr
+          mstate = Mbstate()
+      else:
+        # It's safe to convert `csize_t` to `int` here because `len(input)`
+        # is also `int`.
+        buffer.setLen(res)
+        return ok(buffer)
diff --git a/tests/test_utf8.nim b/tests/test_utf8.nim
index da4ed533..05bee447 100644
--- a/tests/test_utf8.nim
+++ b/tests/test_utf8.nim
@@ -32,29 +32,29 @@ proc toUTF1(value: uint32): array[1, byte] =
 suite "UTF-8 validation test suite":
   test "Values [U+0000, U+007F] are allowed":
     for i in 0x00'u32 .. 0x7F'u32:
-      check validateUtf8(toUTF1(i)) == true
+      check utf8Validate(toUTF1(i)) == true
   test "Values [U+0080, U+07FF] are allowed":
     for i in 0x80'u32 .. 0x7FF'u32:
-      check validateUtf8(toUTF2(i)) == true
+      check utf8Validate(toUTF2(i)) == true
   test "Values [U+0800, U+D7FF] are allowed":
     for i in 0x800'u32 .. 0xD7FF'u32:
-      check validateUtf8(toUTF3(i)) == true
+      check utf8Validate(toUTF3(i)) == true
   test "Values [U+D800, U+DFFF] (UTF-16 surrogates) are not allowed":
     for i in 0xD800'u32 .. 0xDFFF'u32:
-      check validateUtf8(toUTF3(i)) == false
+      check utf8Validate(toUTF3(i)) == false
   test "Values [U+E000, U+FFFD] are allowed":
     for i in 0xE000'u32 .. 0xFFFD'u32:
-      check validateUtf8(toUTF3(i)) == true
+      check utf8Validate(toUTF3(i)) == true
   test "Values U+FFFE and U+FFFF are not allowed":
     check:
-      validateUtf8(toUTF3(0xFFFE'u32)) == false
-      validateUtf8(toUTF3(0xFFFF'u32)) == false
+      utf8Validate(toUTF3(0xFFFE'u32)) == false
+      utf8Validate(toUTF3(0xFFFF'u32)) == false
   test "Values [U+10000, U10FFFF] are allowed":
     for i in 0x10000'u32 .. 0x10FFFF'u32:
-      check validateUtf8(toUTF4(i)) == true
+      check utf8Validate(toUTF4(i)) == true
   test "Values bigger U+10FFFF are not allowed":
     for i in 0x11_0000'u32 .. 0x1F_FFFF'u32:
-      check validateUtf8(toUTF4(i)) == false
+      check utf8Validate(toUTF4(i)) == false
   test "fastvalidate-utf-8 bad sequences":
     # https://github.com/lemire/fastvalidate-utf-8 test vectors
     const
@@ -95,9 +95,9 @@ suite "UTF-8 validation test suite":
         "\xef\xbf"
       ]
     for item in BadSequences:
-      check validateUtf8(item) == false
+      check utf8Validate(item) == false
     for item in GoodSequences:
-      check validateUtf8(item) == true
+      check utf8Validate(item) == true
   test "UTF-8 decoder capability and stress test":
     # https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
     const Tests2 = [
@@ -184,10 +184,136 @@ suite "UTF-8 validation test suite":
     ]
 
     for item in Tests2:
-      check validateUtf8(item[0]) == item[1]
+      check utf8Validate(item[0]) == item[1]
     for item in Tests3:
-      check validateUtf8(item[0]) == item[1]
+      check utf8Validate(item[0]) == item[1]
     for item in Tests4:
-      check validateUtf8(item[0]) == item[1]
+      check utf8Validate(item[0]) == item[1]
     for item in Tests5:
-      check validateUtf8(item[0]) == item[1]
+      check utf8Validate(item[0]) == item[1]
+
+  test "UTF-8 length() test":
+    const
+      Cyrillic = "\xd0\x9f\xd1\x80\xd0\xbe\xd0\xb3" &
+                 "\xd1\x80\xd0\xb0\xd0\xbc\xd0\xbc\xd0\xb0"
+    check:
+      utf8Length("Программа").tryGet() == 9
+      utf8Length("Программ").tryGet() == 8
+      utf8Length("Програм").tryGet() == 7
+      utf8Length("Програ").tryGet() == 6
+      utf8Length("Прогр").tryGet() == 5
+      utf8Length("Прог").tryGet() == 4
+      utf8Length("Про").tryGet() == 3
+      utf8Length("Пр").tryGet() == 2
+      utf8Length("П").tryGet() == 1
+      utf8Length("").tryGet() == 0
+      utf8Length("П⠯🤗").tryGet() == 3
+      utf8Length("⠯🤗").tryGet() == 2
+      utf8Length("🤗").tryGet() == 1
+
+    check:
+      utf8Length(Cyrillic).tryGet() == 9
+      utf8Length(Cyrillic.toOpenArray(0, len(Cyrillic) - 2)).isErr() == true
+
+  test "UTF-8 substr() test":
+    check:
+      utf8Substr("Программа", -1, -1).tryGet() == "Программа"
+      utf8Substr("Программа", 0, 0).tryGet() == "П"
+      utf8Substr("Программа", 0, 1).tryGet() == "Пр"
+      utf8Substr("Программа", 0, 2).tryGet() == "Про"
+      utf8Substr("Программа", 0, 3).tryGet() == "Прог"
+      utf8Substr("Программа", 0, 4).tryGet() == "Прогр"
+      utf8Substr("Программа", 0, 5).tryGet() == "Програ"
+      utf8Substr("Программа", 0, 6).tryGet() == "Програм"
+      utf8Substr("Программа", 0, 7).tryGet() == "Программ"
+      utf8Substr("Программа", 0, 8).tryGet() == "Программа"
+      utf8Substr("Программа", 0, 9).tryGet() == "Программа"
+      utf8Substr("Программа", 0, 10).tryGet() == "Программа"
+      utf8Substr("Программа", 0, 18).tryGet() == "Программа"
+      utf8Substr("Программа", 0, 19).tryGet() == "Программа"
+      utf8Substr("Программа", 0, 100).tryGet() == "Программа"
+      utf8Substr("Программа", 100, 0).tryGet() == ""
+      utf8Substr("Программа", 100, 100).tryGet() == ""
+      utf8Substr("Программа", 1, 1).tryGet() == "р"
+      utf8Substr("Программа", 2, 2).tryGet() == "о"
+      utf8Substr("Программа", 3, 3).tryGet() == "г"
+      utf8Substr("Программа", 4, 4).tryGet() == "р"
+      utf8Substr("Программа", 5, 5).tryGet() == "а"
+      utf8Substr("Программа", 6, 6).tryGet() == "м"
+      utf8Substr("Программа", 7, 7).tryGet() == "м"
+      utf8Substr("Программа", 8, 8).tryGet() == "а"
+      utf8Substr("Программа", 9, 9).tryGet() == ""
+      utf8Substr("Программа", 0, -1).tryGet() == "Программа"
+      utf8Substr("Программа", 1, -1).tryGet() == "рограмма"
+      utf8Substr("Программа", 2, -1).tryGet() == "ограмма"
+      utf8Substr("Программа", 3, -1).tryGet() == "грамма"
+      utf8Substr("Программа", 4, -1).tryGet() == "рамма"
+      utf8Substr("Программа", 5, -1).tryGet() == "амма"
+      utf8Substr("Программа", 6, -1).tryGet() == "мма"
+      utf8Substr("Программа", 7, -1).tryGet() == "ма"
+      utf8Substr("Программа", 8, -1).tryGet() == "а"
+      utf8Substr("Программа", 9, -1).tryGet() == ""
+
+      utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", -1, -1).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶"
+      utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 0).tryGet() == "⠯"
+      utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 1).tryGet() == "⠯⠰"
+      utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 2).tryGet() == "⠯⠰⠱"
+      utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 3).tryGet() == "⠯⠰⠱⠲"
+      utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 4).tryGet() == "⠯⠰⠱⠲⠳"
+      utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 5).tryGet() == "⠯⠰⠱⠲⠳⠴"
+      utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 6).tryGet() == "⠯⠰⠱⠲⠳⠴⠵"
+      utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 7).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶"
+      utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 8).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶"
+      utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 9).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶"
+      utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 23).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶"
+      utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 24).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶"
+      utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 0, 100).tryGet() == "⠯⠰⠱⠲⠳⠴⠵⠶"
+      utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 100, 0).tryGet() == ""
+      utf8Substr("⠯⠰⠱⠲⠳⠴⠵⠶", 100, 100).tryGet() == ""
+
+      utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", -1, -1).tryGet() ==
+        "🤗🤘🤙🤚🤛🤜🤝🤞🤟"
+      utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 0).tryGet() ==
+        "🤗"
+      utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 1).tryGet() ==
+        "🤗🤘"
+      utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 2).tryGet() ==
+        "🤗🤘🤙"
+      utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 3).tryGet() ==
+        "🤗🤘🤙🤚"
+      utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 4).tryGet() ==
+        "🤗🤘🤙🤚🤛"
+      utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 5).tryGet() ==
+        "🤗🤘🤙🤚🤛🤜"
+      utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 6).tryGet() ==
+        "🤗🤘🤙🤚🤛🤜🤝"
+      utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 7).tryGet() ==
+        "🤗🤘🤙🤚🤛🤜🤝🤞"
+      utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 8).tryGet() ==
+        "🤗🤘🤙🤚🤛🤜🤝🤞🤟"
+      utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 9).tryGet() ==
+        "🤗🤘🤙🤚🤛🤜🤝🤞🤟"
+      utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 31).tryGet() ==
+        "🤗🤘🤙🤚🤛🤜🤝🤞🤟"
+      utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 32).tryGet() ==
+        "🤗🤘🤙🤚🤛🤜🤝🤞🤟"
+      utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 0, 100).tryGet() ==
+        "🤗🤘🤙🤚🤛🤜🤝🤞🤟"
+      utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 100, 0).tryGet() == ""
+      utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 100, 100).tryGet() == ""
+
+  test "wcharToUtf8() tests":
+    for i in 0 ..< 0x11_0000:
+      if i != 0xFFFE and i != 0xFFFF:
+        if i < 0x10000:
+          var data16 = [uint16(i)]
+          let res = wcharToUtf8(data16)
+          check:
+            res.isOk() == true
+            utf8Validate(res.get()) == true
+
+        var data32 = [uint32(i)]
+        let res = wcharToUtf8(data32)
+        check:
+          res.isOk() == true
+          utf8Validate(res.get()) == true

From 1746bc0095257845807d9a9e7f05dac61796f744 Mon Sep 17 00:00:00 2001
From: cheatfate <eugene.kabanov@status.im>
Date: Wed, 14 Oct 2020 12:25:33 +0300
Subject: [PATCH 8/9] Fix UTF-32 encoder/decoder. Add tests for UTF-8 to UTF-32
 and UTF-32 to UTF-8 encoders.

---
 stew/conio.nim      |   2 +-
 stew/utf8.nim       | 193 ++++++++++++++++++++++++++++++++++++--------
 tests/test_utf8.nim |  42 +++++++---
 3 files changed, 194 insertions(+), 43 deletions(-)

diff --git a/stew/conio.nim b/stew/conio.nim
index e1115885..e9fba96d 100644
--- a/stew/conio.nim
+++ b/stew/conio.nim
@@ -355,7 +355,7 @@ elif defined(posix):
         if maxChars < len(wbuffer):
           wbuffer.setLen(maxChars)
         # Conversion of wide characters sequence to UTF-8 encoded string.
-        let ures = wbuffer.wcharToUtf8()
+        let ures = wbuffer.utf32toUtf8()
         if ures.isOk():
           ok(ures.get())
         else:
diff --git a/stew/utf8.nim b/stew/utf8.nim
index d2d43034..afac4d5f 100644
--- a/stew/utf8.nim
+++ b/stew/utf8.nim
@@ -12,7 +12,8 @@ export results
 
 type
   UResult*[T] = Result[T, cstring]
-  Wides* = int16 | uint16 | int32 | uint32
+  Wides32* = int32 | uint32
+  Wides16* = int16 | uint16
   Bytes* = int8 | char | uint8 | byte
 
 const
@@ -206,72 +207,200 @@ proc utf8Substr*[T: Bytes](data: openarray[T],
     inc(k)
   ok(res)
 
-proc wcharToUtf8*[A: Wides, B: Bytes](input: openarray[A],
+proc utf32toUtf8*[A: Wides32, B: Bytes](input: openarray[A],
                                       output: var openarray[B]): UResult[int] =
-  ## Converts WCHAR sequence ``input`` to UTF-8 array of octets ``output``.
-  ##
-  ## Procedure supports 4-byte (Linux) and 2-byte sequences (Windows) as input.
+  ## Converts UTF-32 sequence ``input`` to UTF-8 array ``output``.
   var offset = 0
   for item in input:
-    let uitem = uint(item)
     let codepoint =
-      if uitem >= 0xD800'u and uitem <= 0xDBFF'u:
-        0x10000'u + ((uitem - 0xD800'u) shl 10)
-      else:
-        if uitem >= 0xDC00'u and uitem <= 0xDFFF'u:
-          uitem - 0xDC00'u
-        else:
-          uitem
-    if codepoint <= 0x7F'u:
+      block:
+        if (uint32(item) >= 0xD800'u32) and (uint32(item) <= 0xDFFF'u32):
+          # high and low surrogates U+D800 through U+DFFF prohibited in UTF-32.
+          return err(ErrorInvalidSequence)
+        elif (uint32(item) == 0xFFFE'u32) or (uint32(item) == 0xFFFF'u32):
+          # these codes are intended for process-internal use and are not
+          # Unicode characters.
+          return err(ErrorInvalidSequence)
+        uint32(item)
+    if codepoint <= 0x7F'u32:
       if len(output) > 0:
         if offset < len(output):
-          output[offset] = cast[B](codepoint and 0x7F'u)
+          output[offset] = cast[B](codepoint and 0x7F'u32)
         else:
           return err(ErrorBufferOverflow)
       inc(offset, 1)
-    elif codepoint <= 0x7FF'u:
+    elif codepoint <= 0x7FF'u32:
       if len(output) > 0:
         if offset + 1 < len(output):
           output[offset + 0] = cast[B](0xC0'u8 or
-                                       byte((codepoint shr 6) and 0x1F'u))
-          output[offset + 1] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u))
+                                       byte((codepoint shr 6) and 0x1F'u32))
+          output[offset + 1] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u32))
         else:
           return err(ErrorBufferOverflow)
       inc(offset, 2)
-    elif codepoint <= 0xFFFF'u:
+    elif codepoint <= 0xFFFF'u32:
       if len(output) > 0:
         if offset + 2 < len(output):
           output[offset + 0] = cast[B](0xE0'u8 or
-                                       byte((codepoint shr 12) and 0x0F'u))
+                                       byte((codepoint shr 12) and 0x0F'u32))
           output[offset + 1] = cast[B](0x80'u8 or
-                                       byte((codepoint shr 6) and 0x3F'u))
-          output[offset + 2] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u))
+                                       byte((codepoint shr 6) and 0x3F'u32))
+          output[offset + 2] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u32))
         else:
           return err(ErrorBufferOverflow)
       inc(offset, 3)
-    elif codepoint <= 0x10FFFF'u:
+    elif codepoint <= 0x10FFFF'u32:
       if len(output) > 0:
         if offset + 3 < len(output):
           output[offset + 0] = cast[B](0xF0'u8 or
-                                       byte((codepoint shr 18) and 0x07'u))
+                                       byte((codepoint shr 18) and 0x07'u32))
           output[offset + 1] = cast[B](0x80'u8 or
-                                       byte((codepoint shr 12) and 0x3F'u))
+                                       byte((codepoint shr 12) and 0x3F'u32))
           output[offset + 2] = cast[B](0x80'u8 or
-                                       byte((codepoint shr 6) and 0x3F'u))
-          output[offset + 3] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u))
+                                       byte((codepoint shr 6) and 0x3F'u32))
+          output[offset + 3] = cast[B](0x80'u8 or byte(codepoint and 0x3F'u32))
         else:
-          return err("")
+          return err(ErrorBufferOverflow)
       inc(offset, 4)
     else:
       return err(ErrorInvalidSequence)
   ok(offset)
 
-proc wcharToUtf8*[T: Wides](input: openarray[T]): UResult[string] {.inline.} =
-  ## Converts wide character
+proc utf32toUtf8*[T: Wides32](input: openarray[T]): UResult[string] {.inline.} =
+  ## Converts wide character sequence ``input`` to UTF-8 encoded string.
   var empty: array[0, char]
-  let size = ? wcharToUtf8(input, empty)
+  let size = ? utf32ToUtf8(input, empty)
   var output = newString(size)
-  let res {.used.} = ? wcharToUtf8(input, output)
+  let res {.used.} = ? utf32ToUtf8(input, output)
+  ok(output)
+
+proc utf8toUtf32*[A: Bytes, B: Wides32](input: openarray[A],
+                                       output: var openarray[B]): UResult[int] =
+  ## Convert UTF-8 encoded array of characters ``input`` to UTF-32 encoded
+  ## sequences of 32bit limbs.
+  ##
+  ## To obtain required size of ``output`` you need to pass ``output`` as
+  ## zero-length array, in such way required size will be returned as result of
+  ## procedure.
+  ##
+  ## If size of ``output`` is not zero and there is not enough space in the
+  ## ``output`` array to store the whole ``input`` array, error
+  ## ``ErrorBufferOverflow`` will be returned.
+  var index = 0
+  var dindex = 0
+  if len(output) == 0:
+    return utf8Length(input)
+  else:
+    while true:
+      if index >= len(input):
+        break
+      let byte1 = uint32(input[index])
+      inc(index)
+
+      if (byte1 and 0x80) == 0x00:
+        if dindex < len(output):
+          output[dindex] = B(byte1)
+          inc(dindex)
+        else:
+          return err(ErrorBufferOverflow)
+      elif (byte1 and 0xE0'u32) == 0xC0'u32:
+        # Two-byte form (110xxxxx 10xxxxxx)
+        if index >= len(input):
+          return err(ErrorInvalidSequence)
+        # overlong sequence test
+        if (byte1 and 0xFE'u32) == 0xC0'u32:
+          return err(ErrorInvalidSequence)
+
+        let byte2 = uint32(input[index])
+        if (byte2 and 0xC0'u32) != 0x80'u32:
+          return err(ErrorInvalidSequence)
+
+        if dindex < len(output):
+          output[dindex] = B(((byte1 and 0x1F'u32) shl 6) or
+                              (byte2 and 0x3F'u32))
+          inc(dindex)
+        else:
+          return err(ErrorBufferOverflow)
+        inc(index)
+      elif (byte1 and 0xF0'u32) == 0xE0'u32:
+        # Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
+        if (index + 1) >= len(input):
+          return err(ErrorInvalidSequence)
+
+        let byte2 = uint32(input[index])
+        if (byte2 and 0xC0'u32) != 0x80'u32:
+          return err(ErrorInvalidSequence)
+        # overlong sequence test
+        if (byte1 == 0xE0'u32) and ((byte2 and 0xE0'u32) == 0x80'u32):
+          return err(ErrorInvalidSequence)
+        #  0xD800–0xDFFF (UTF-16 surrogates) test
+        if (byte1 == 0xED'u32) and ((byte2 and 0xE0'u32) == 0xA0'u32):
+          return err(ErrorInvalidSequence)
+
+        let byte3 = uint32(input[index + 1])
+        if (byte3 and 0xC0'u32) != 0x80'u32:
+          return err(ErrorInvalidSequence)
+        # U+FFFE or U+FFFF test
+        if (byte1 == 0xEF'u32) and (byte2 == 0xBF'u32) and
+           ((byte3 and 0xFE'u32) == 0xBE'u32):
+          return err(ErrorInvalidSequence)
+
+        if dindex < len(output):
+          output[dindex] = B(((byte1 and 0x0F'u32) shl 12) or
+                             ((byte2 and 0x3F'u32) shl 6) or
+                              (byte3 and 0x3F'u32))
+          inc(dindex)
+        else:
+          return err(ErrorBufferOverflow)
+        inc(index, 2)
+
+      elif (byte1 and 0xF8'u8) == 0xF0'u8:
+        # Four-byte form (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+        if (index + 2) >= len(input):
+          return err(ErrorInvalidSequence)
+
+        let byte2 = uint32(input[index])
+        if (byte2 and 0xC0'u32) != 0x80'u32:
+          return err(ErrorInvalidSequence)
+        # overlong sequence test
+        if (byte1 == 0xF0'u32) and ((byte2 and 0xF0'u32) == 0x80'u32):
+          return err(ErrorInvalidSequence)
+        # According to RFC 3629 no point above U+10FFFF should be used, which
+        # limits characters to four bytes.
+        if ((byte1 == 0xF4'u32) and (byte2 > 0x8F'u32)) or (byte1 > 0xF4'u32):
+          return err(ErrorInvalidSequence)
+
+        let byte3 = uint32(input[index + 1])
+        if (byte3 and 0xC0'u32) != 0x80'u32:
+          return err(ErrorInvalidSequence)
+
+        let byte4 = uint32(input[index + 2])
+        if (byte4 and 0xC0'u32) != 0x80'u32:
+          return err(ErrorInvalidSequence)
+
+        if dindex < len(output):
+          output[dindex] = B(((byte1 and 0x07'u32) shl 18) or
+                             ((byte2 and 0x3F'u32) shl 12) or
+                             ((byte3 and 0x3F'u32) shl 6) or
+                              (byte4 and 0x3F'u32))
+          inc(dindex)
+        else:
+          return err(ErrorBufferOverflow)
+        inc(index, 3)
+
+      else:
+        return err(ErrorInvalidSequence)
+
+    ok(dindex)
+
+proc utf8toUtf32*[A: Bytes, B: Wides32](et: typedesc[B],
+                                        input: openarray[A]): UResult[seq[B]] =
+  ## Convert UTF-8 encoded array of characters ``input`` to UTF-32 encoded
+  ## sequence of 32bit limbs and return it.
+  var empty: array[0, B]
+  let size = ? utf8toUtf32(input, empty)
+  var output = newSeq[B](size)
+  let res {.used.} = ? utf8toUtf32(input, output)
   ok(output)
 
 when defined(posix):
diff --git a/tests/test_utf8.nim b/tests/test_utf8.nim
index 05bee447..06b9bb31 100644
--- a/tests/test_utf8.nim
+++ b/tests/test_utf8.nim
@@ -302,18 +302,40 @@ suite "UTF-8 validation test suite":
       utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 100, 0).tryGet() == ""
       utf8Substr("🤗🤘🤙🤚🤛🤜🤝🤞🤟", 100, 100).tryGet() == ""
 
-  test "wcharToUtf8() tests":
-    for i in 0 ..< 0x11_0000:
-      if i != 0xFFFE and i != 0xFFFF:
-        if i < 0x10000:
-          var data16 = [uint16(i)]
-          let res = wcharToUtf8(data16)
-          check:
-            res.isOk() == true
-            utf8Validate(res.get()) == true
-
+  test "UTF-32 -> UTF-8 conversion test":
+    for i in 0 ..< 0x11_0001:  # bound lets i reach 0x11_0000 (checked below)
+      var data32 = [uint32(i)]
+      if i >= 0xD800 and i <= 0xDFFF:  # whole 0xD800 .. 0xDFFF range
+        check utf32toUtf8(data32).isErr()
+      elif i == 0xFFFE:
+        check utf32toUtf8(data32).isErr()
+      elif i == 0xFFFF:
+        check utf32toUtf8(data32).isErr()
+      elif i == 0x11_0000:  # first value past the 0x10_FFFF upper limit
+        check utf32toUtf8(data32).isErr()
+      else:
         var data32 = [uint32(i)]
-        let res = wcharToUtf8(data32)
+        let res = utf32toUtf8(data32)
         check:
           res.isOk() == true
           utf8Validate(res.get()) == true
+
+  test "UTF-8 -> UTF-32 conversion test":
+    # Round-trip check: encode every value in 0 .. 0x11_0000 as UTF-8 and,
+    # when encoding succeeds, decode it back and compare with the source.
+    for i in 0 ..< 0x11_0001:
+      let point = [uint32(i)]
+      let encoded = utf32toUtf8(point)
+      # 0xD800 .. 0xDFFF, 0xFFFE, 0xFFFF and 0x11_0000 are expected to fail.
+      let rejected = (i >= 0xD800 and i <= 0xDFFF) or i == 0xFFFE or
+                     i == 0xFFFF or i == 0x11_0000
+      if rejected:
+        check encoded.isErr()
+      else:
+        check encoded.isOk()
+        # Decoding the produced UTF-8 must give back the original value.
+        let decoded = utf8toUtf32(uint32, encoded.get())
+        check:
+          decoded.isOk()
+          decoded.get() == point
+

From 8d87ba3e05bb09c6e1383f289c533dfafa5ec4d0 Mon Sep 17 00:00:00 2001
From: cheatfate <eugene.kabanov@status.im>
Date: Thu, 15 Oct 2020 00:10:19 +0300
Subject: [PATCH 9/9] Fix *nix compilation problem.

---
 stew/utf8.nim | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stew/utf8.nim b/stew/utf8.nim
index afac4d5f..312f4d6e 100644
--- a/stew/utf8.nim
+++ b/stew/utf8.nim
@@ -414,8 +414,8 @@ when defined(posix):
                  ps: ptr Mbstate): csize_t {.
        importc, header: "<wchar.h>".}
 
-  proc mbstowcs*[A: Bytes, B: Wides](t: typedesc[B],
-                                     input: openarray[A]): UResult[seq[B]] =
+  proc mbstowcs*[A: Bytes, B: Wides32](t: typedesc[B],
+                                       input: openarray[A]): UResult[seq[B]] =
     ## Converts multibyte encoded string to OS specific wide char string.
     ##
     ## Note, that `input` should be `0` terminated.