Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement correct handling of wide Unicode characters #564

Merged
merged 1 commit into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
*Megaparsec follows [SemVer](https://semver.org/).*

## Upcoming

* Implemented correct handling of wide Unicode characters in error messages.
To that end, a new module `Text.Megaparsec.Unicode` was introduced. [Issue
370](https://github.com/mrkkrp/megaparsec/issues/370).

## Megaparsec 9.6.1

* Exposed `Text.Megaparsec.State`, so that the new functions (`initialState`
Expand Down
3 changes: 2 additions & 1 deletion Text/Megaparsec/Error.hs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ import GHC.Generics
import Text.Megaparsec.Pos
import Text.Megaparsec.State
import Text.Megaparsec.Stream
import qualified Text.Megaparsec.Unicode as Unicode

----------------------------------------------------------------------------
-- Parse error type
Expand Down Expand Up @@ -397,7 +398,7 @@ errorBundlePretty ParseErrorBundle {..} =
lineNumber = (show . unPos . sourceLine) epos
padding = replicate (length lineNumber + 1) ' '
rpshift = unPos (sourceColumn epos) - 1
slineLen = length sline
slineLen = Unicode.stringLength sline
in padding
<> "|\n"
<> lineNumber
Expand Down
44 changes: 32 additions & 12 deletions Text/Megaparsec/Stream.hs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ import qualified Data.Text.Lazy as TL
import Data.Word (Word8)
import Text.Megaparsec.Pos
import Text.Megaparsec.State
import qualified Text.Megaparsec.Unicode as Unicode

-- | Type class for inputs that can be consumed by the library.
--
Expand Down Expand Up @@ -426,6 +427,7 @@ class (Stream s) => VisualStream s where

instance VisualStream String where
showTokens Proxy = stringPretty
tokensLength Proxy = Unicode.stringLength

instance VisualStream B.ByteString where
showTokens Proxy = stringPretty . fmap (chr . fromIntegral)
Expand All @@ -435,9 +437,11 @@ instance VisualStream BL.ByteString where

instance VisualStream T.Text where
showTokens Proxy = stringPretty
tokensLength Proxy = Unicode.stringLength

instance VisualStream TL.Text where
showTokens Proxy = stringPretty
tokensLength Proxy = Unicode.stringLength

-- | Type class for inputs that can also be used for error reporting.
--
Expand Down Expand Up @@ -510,37 +514,37 @@ class (Stream s) => TraversableStream s where
instance TraversableStream String where
-- NOTE Do not eta-reduce these (breaks inlining)
reachOffset o pst =
reachOffset' splitAt foldl' id id ('\n', '\t') o pst
reachOffset' splitAt foldl' id id ('\n', '\t') charInc o pst
reachOffsetNoLine o pst =
reachOffsetNoLine' splitAt foldl' ('\n', '\t') o pst
reachOffsetNoLine' splitAt foldl' ('\n', '\t') charInc o pst

instance TraversableStream B.ByteString where
-- NOTE Do not eta-reduce these (breaks inlining)
reachOffset o pst =
reachOffset' B.splitAt B.foldl' B8.unpack (chr . fromIntegral) (10, 9) o pst
reachOffset' B.splitAt B.foldl' B8.unpack (chr . fromIntegral) (10, 9) byteInc o pst
reachOffsetNoLine o pst =
reachOffsetNoLine' B.splitAt B.foldl' (10, 9) o pst
reachOffsetNoLine' B.splitAt B.foldl' (10, 9) byteInc o pst

instance TraversableStream BL.ByteString where
-- NOTE Do not eta-reduce these (breaks inlining)
reachOffset o pst =
reachOffset' splitAtBL BL.foldl' BL8.unpack (chr . fromIntegral) (10, 9) o pst
reachOffset' splitAtBL BL.foldl' BL8.unpack (chr . fromIntegral) (10, 9) byteInc o pst
reachOffsetNoLine o pst =
reachOffsetNoLine' splitAtBL BL.foldl' (10, 9) o pst
reachOffsetNoLine' splitAtBL BL.foldl' (10, 9) byteInc o pst

instance TraversableStream T.Text where
-- NOTE Do not eta-reduce (breaks inlining of reachOffset').
reachOffset o pst =
reachOffset' T.splitAt T.foldl' T.unpack id ('\n', '\t') o pst
reachOffset' T.splitAt T.foldl' T.unpack id ('\n', '\t') charInc o pst
reachOffsetNoLine o pst =
reachOffsetNoLine' T.splitAt T.foldl' ('\n', '\t') o pst
reachOffsetNoLine' T.splitAt T.foldl' ('\n', '\t') charInc o pst

instance TraversableStream TL.Text where
-- NOTE Do not eta-reduce (breaks inlining of reachOffset').
reachOffset o pst =
reachOffset' splitAtTL TL.foldl' TL.unpack id ('\n', '\t') o pst
reachOffset' splitAtTL TL.foldl' TL.unpack id ('\n', '\t') charInc o pst
reachOffsetNoLine o pst =
reachOffsetNoLine' splitAtTL TL.foldl' ('\n', '\t') o pst
reachOffsetNoLine' splitAtTL TL.foldl' ('\n', '\t') charInc o pst

----------------------------------------------------------------------------
-- Helpers
Expand All @@ -564,6 +568,8 @@ reachOffset' ::
(Token s -> Char) ->
-- | Newline token and tab token
(Token s, Token s) ->
-- | Increment in column position for a token
(Token s -> Pos) ->
-- | Offset to reach
Int ->
-- | Initial 'PosState' to use
Expand All @@ -576,6 +582,7 @@ reachOffset'
fromToks
fromTok
(newlineTok, tabTok)
columnIncrement
o
PosState {..} =
( Just $ case expandTab pstateTabWidth
Expand Down Expand Up @@ -624,7 +631,7 @@ reachOffset'
(g . (fromTok ch :))
| otherwise ->
St
(SourcePos n l (c <> pos1))
(SourcePos n l (c <> columnIncrement ch))
(g . (fromTok ch :))
{-# INLINE reachOffset' #-}

Expand All @@ -639,6 +646,8 @@ reachOffsetNoLine' ::
-- | Newline token and tab token
(Token s, Token s) ->
-- | Offset to reach
-- | Increment in column position for a token
(Token s -> Pos) ->
Int ->
-- | Initial 'PosState' to use
PosState s ->
Expand All @@ -648,6 +657,7 @@ reachOffsetNoLine'
splitAt'
foldl''
(newlineTok, tabTok)
columnIncrement
o
PosState {..} =
( PosState
Expand All @@ -670,7 +680,7 @@ reachOffsetNoLine'
| ch == tabTok ->
SourcePos n l (mkPos $ c' + w - ((c' - 1) `rem` w))
| otherwise ->
SourcePos n l (c <> pos1)
SourcePos n l (c <> columnIncrement ch)
{-# INLINE reachOffsetNoLine' #-}

-- | Like 'BL.splitAt' but accepts the index as an 'Int'.
Expand Down Expand Up @@ -753,3 +763,13 @@ expandTab w' = go 0 0
go !i 0 (x : xs) = x : go (i + 1) 0 xs
go !i n xs = ' ' : go (i + 1) (n - 1) xs
w = unPos w'

-- | Return increment in column position that corresponds to the given
-- 'Char'.
charInc :: Char -> Pos
charInc ch = if Unicode.isWideChar ch then pos1 <> pos1 else pos1

-- | Return increment in column position that corresponds to the given
-- 'Word8'.
byteInc :: Word8 -> Pos
byteInc _ = pos1
180 changes: 180 additions & 0 deletions Text/Megaparsec/Unicode.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
{-# LANGUAGE Safe #-}

-- |
-- Module : Text.Megaparsec.Unicode
-- Copyright : © 2024–present Megaparsec contributors
-- License : FreeBSD
--
-- Maintainer : Mark Karpov <[email protected]>
-- Stability : experimental
-- Portability : portable
--
-- Utility functions for working with Unicode.
--
-- @since 9.7.0
module Text.Megaparsec.Unicode
( stringLength,
charLength,
isWideChar,
)
where

import Data.Array (Array, bounds, listArray, (!))
import Data.Char (ord)

-- | Calculate length of a string taking into account the fact that certain
-- 'Char's may span more than 1 column.
--
-- @since 9.7.0
stringLength :: (Traversable t) => t Char -> Int
stringLength = sum . fmap charLength

-- | Return length of an individual 'Char'.
--
-- @since 9.7.0
charLength :: Char -> Int
charLength ch = if isWideChar ch then 2 else 1

-- | Determine whether the given 'Char' is “wide”, that is, whether it spans
-- 2 columns instead of one.
--
-- @since 9.7.0
isWideChar :: Char -> Bool
isWideChar c = go (bounds wideCharRanges)
where
go (lo, hi)
| hi < lo = False
| a <= n && n <= b = True
| n < a = go (lo, pred mid)
| otherwise = go (succ mid, hi)
where
mid = (lo + hi) `div` 2
(a, b) = wideCharRanges ! mid
n = ord c

-- | Wide character ranges.
wideCharRanges :: Array Int (Int, Int)
wideCharRanges =
listArray
(0, 118)
[ (0x001100, 0x00115f),
(0x00231a, 0x00231b),
(0x002329, 0x00232a),
(0x0023e9, 0x0023ec),
(0x0023f0, 0x0023f0),
(0x0023f3, 0x0023f3),
(0x0025fd, 0x0025fe),
(0x002614, 0x002615),
(0x002648, 0x002653),
(0x00267f, 0x00267f),
(0x002693, 0x002693),
(0x0026a1, 0x0026a1),
(0x0026aa, 0x0026ab),
(0x0026bd, 0x0026be),
(0x0026c4, 0x0026c5),
(0x0026ce, 0x0026ce),
(0x0026d4, 0x0026d4),
(0x0026ea, 0x0026ea),
(0x0026f2, 0x0026f3),
(0x0026f5, 0x0026f5),
(0x0026fa, 0x0026fa),
(0x0026fd, 0x0026fd),
(0x002705, 0x002705),
(0x00270a, 0x00270b),
(0x002728, 0x002728),
(0x00274c, 0x00274c),
(0x00274e, 0x00274e),
(0x002753, 0x002755),
(0x002757, 0x002757),
(0x002795, 0x002797),
(0x0027b0, 0x0027b0),
(0x0027bf, 0x0027bf),
(0x002b1b, 0x002b1c),
(0x002b50, 0x002b50),
(0x002b55, 0x002b55),
(0x002e80, 0x002e99),
(0x002e9b, 0x002ef3),
(0x002f00, 0x002fd5),
(0x002ff0, 0x002ffb),
(0x003000, 0x00303e),
(0x003041, 0x003096),
(0x003099, 0x0030ff),
(0x003105, 0x00312f),
(0x003131, 0x00318e),
(0x003190, 0x0031ba),
(0x0031c0, 0x0031e3),
(0x0031f0, 0x00321e),
(0x003220, 0x003247),
(0x003250, 0x004db5),
(0x004e00, 0x009fef),
(0x00a000, 0x00a48c),
(0x00a490, 0x00a4c6),
(0x00a960, 0x00a97c),
(0x00ac00, 0x00d7a3),
(0x00f900, 0x00fa6d),
(0x00fa70, 0x00fad9),
(0x00fe10, 0x00fe19),
(0x00fe30, 0x00fe52),
(0x00fe54, 0x00fe66),
(0x00fe68, 0x00fe6b),
(0x00ff01, 0x00ff60),
(0x00ffe0, 0x00ffe6),
(0x016fe0, 0x016fe3),
(0x017000, 0x0187f7),
(0x018800, 0x018af2),
(0x01b000, 0x01b11e),
(0x01b150, 0x01b152),
(0x01b164, 0x01b167),
(0x01b170, 0x01b2fb),
(0x01f004, 0x01f004),
(0x01f0cf, 0x01f0cf),
(0x01f18e, 0x01f18e),
(0x01f191, 0x01f19a),
(0x01f200, 0x01f202),
(0x01f210, 0x01f23b),
(0x01f240, 0x01f248),
(0x01f250, 0x01f251),
(0x01f260, 0x01f265),
(0x01f300, 0x01f320),
(0x01f32d, 0x01f335),
(0x01f337, 0x01f37c),
(0x01f37e, 0x01f393),
(0x01f3a0, 0x01f3ca),
(0x01f3cf, 0x01f3d3),
(0x01f3e0, 0x01f3f0),
(0x01f3f4, 0x01f3f4),
(0x01f3f8, 0x01f43e),
(0x01f440, 0x01f440),
(0x01f442, 0x01f4fc),
(0x01f4ff, 0x01f53d),
(0x01f54b, 0x01f54e),
(0x01f550, 0x01f567),
(0x01f57a, 0x01f57a),
(0x01f595, 0x01f596),
(0x01f5a4, 0x01f5a4),
(0x01f5fb, 0x01f64f),
(0x01f680, 0x01f6c5),
(0x01f6cc, 0x01f6cc),
(0x01f6d0, 0x01f6d2),
(0x01f6d5, 0x01f6d5),
(0x01f6eb, 0x01f6ec),
(0x01f6f4, 0x01f6fa),
(0x01f7e0, 0x01f7eb),
(0x01f90d, 0x01f971),
(0x01f973, 0x01f976),
(0x01f97a, 0x01f9a2),
(0x01f9a5, 0x01f9aa),
(0x01f9ae, 0x01f9ca),
(0x01f9cd, 0x01f9ff),
(0x01fa70, 0x01fa73),
(0x01fa78, 0x01fa7a),
(0x01fa80, 0x01fa82),
(0x01fa90, 0x01fa95),
(0x020000, 0x02a6d6),
(0x02a700, 0x02b734),
(0x02b740, 0x02b81d),
(0x02b820, 0x02cea1),
(0x02ceb0, 0x02ebe0),
(0x02f800, 0x02fa1d)
]
{-# NOINLINE wideCharRanges #-}
1 change: 1 addition & 0 deletions megaparsec-tests/megaparsec-tests.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ test-suite tests
Text.Megaparsec.ErrorSpec
Text.Megaparsec.PosSpec
Text.Megaparsec.StreamSpec
Text.Megaparsec.UnicodeSpec
Text.MegaparsecSpec

default-language: Haskell2010
Expand Down
Loading