diff --git a/pact.cabal b/pact.cabal index 26c3cdade..dec541a08 100644 --- a/pact.cabal +++ b/pact.cabal @@ -47,7 +47,7 @@ flag tests-in-lib manual: True library - + cpp-options: -DLEGACY_PARSER=1 -- common to all configurations: hs-source-dirs: src default-language: Haskell2010 @@ -382,6 +382,7 @@ test-suite hspec RoundTripSpec PrincipalSpec SizeOfSpec + Test.Pact.Parse if !impl(ghcjs) other-modules: @@ -447,6 +448,7 @@ test-suite hspec , sbv , servant-client , temporary >=1.3 + , trifecta , yaml , process , posix-pty diff --git a/src/Pact/Parse.hs b/src/Pact/Parse.hs index f32e53001..435495535 100644 --- a/src/Pact/Parse.hs +++ b/src/Pact/Parse.hs @@ -1,12 +1,13 @@ -{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE CPP #-} {-# LANGUAGE DeriveGeneric #-} -{-# LANGUAGE ScopedTypeVariables #-} -{-# LANGUAGE TupleSections #-} {-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE Rank2Types #-} +{-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE Rank2Types #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TupleSections #-} -- | -- Module : Pact.Compile @@ -35,7 +36,6 @@ import Control.DeepSeq (NFData) import Control.Lens (Wrapped(..)) import Control.Monad import qualified Data.Aeson as A -import qualified Data.Attoparsec.Text as AP import qualified Data.ByteString as BS import Data.Char (digitToInt) import Data.Decimal @@ -55,6 +55,8 @@ import Pact.Types.Pretty (Pretty(..),viaShow) import Pact.Types.Info import Pact.Types.Term (ToTerm) +-- -------------------------------------------------------------------------- -- +-- Expression Parser -- | Main parser for Pact expressions. @@ -63,7 +65,11 @@ expr = do delt <- position let inf = do end <- position +#if LEGACY_PARSER == 1 let len = bytes end - bytes delt +#else + let len = column end - column delt +#endif return $! Parsed delt (fromIntegral len) separator t s = symbol t >> (ESeparator . SeparatorExp s <$> inf) msum @@ -141,7 +147,7 @@ newtype ParsedDecimal = ParsedDecimal Decimal instance A.FromJSON ParsedDecimal where parseJSON (A.String s) = - ParsedDecimal <$> case AP.parseOnly (unPactParser number) s of + ParsedDecimal <$> case pactAttoParseOnly (unPactParser number) s of Right (LDecimal d) -> return d Right (LInteger i) -> return (fromIntegral i) _ -> fail $ "Failure parsing decimal string: " ++ show s @@ -167,7 +173,7 @@ newtype ParsedInteger = ParsedInteger Integer instance A.FromJSON ParsedInteger where parseJSON (A.String s) = - ParsedInteger <$> case AP.parseOnly (unPactParser number) s of + ParsedInteger <$> case pactAttoParseOnly (unPactParser number) s of Right (LInteger i) -> return i _ -> fail $ "Failure parsing integer string: " ++ show s parseJSON (A.Number n) = return $ ParsedInteger (round n) @@ -181,16 +187,19 @@ instance A.ToJSON ParsedInteger where instance Wrapped ParsedInteger +-- -------------------------------------------------------------------------- -- +-- Top Level Parsers + -- | "Production" parser: atto, parse multiple exprs. parseExprs :: Text -> Either String [Exp Parsed] -parseExprs = AP.parseOnly (unPactParser (whiteSpace *> exprs <* TF.eof)) +parseExprs = pactAttoParseOnly (unPactParser (whiteSpace *> exprs <* TF.eof)) {-# INLINABLE parseExprs #-} -- | Legacy version of "production" parser: atto, parse multiple exprs. This -- parser does not force EOF and thus accepts trailing inputs that are not valid -- pact code. legacyParseExprs :: Text -> Either String [Exp Parsed] -legacyParseExprs = AP.parseOnly (unPactParser (whiteSpace *> exprs)) +legacyParseExprs = pactAttoParseOnly (unPactParser (whiteSpace *> exprs)) {-# INLINABLE legacyParseExprs #-} -- | ParsedCode version of 'parseExprs' @@ -210,7 +219,6 @@ _parseF p fp = do let s = unpack $ decodeUtf8 bs fmap (,s) <$> TF.parseFromFileEx p fp - _parseS :: String -> TF.Result [Exp Parsed] _parseS = TF.parseString exprsOnly mempty diff --git a/src/Pact/Types/ExpParser.hs b/src/Pact/Types/ExpParser.hs index 64a365b16..2417821c5 100644 --- a/src/Pact/Types/ExpParser.hs +++ b/src/Pact/Types/ExpParser.hs @@ -1,16 +1,17 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE CPP #-} {-# LANGUAGE ConstraintKinds #-} -{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} -{-# LANGUAGE RankNTypes #-} {-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE PatternSynonyms #-} +{-# LANGUAGE RankNTypes #-} {-# LANGUAGE RecordWildCards #-} -{-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TupleSections #-} -{-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE BangPatterns #-} -{-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE PatternSynonyms #-} +{-# LANGUAGE TypeFamilies #-} {-# LANGUAGE ViewPatterns #-} -- megaparsec <9.3 backard compatiblity @@ -126,8 +127,12 @@ mkTextInfo :: T.Text -> MkInfo mkTextInfo s d = Info $ Just (Code code, d) where code = T.take len $ T.drop offset s - offset = fromIntegral $ TF.bytes d len = _pLength d +#if LEGACY_PARSER == 1 + offset = fromIntegral $ TF.bytes (_pDelta d) +#else + offset = fromIntegral $ TF.column (_pDelta d) +#endif type ExpParse s a = ReaderT ParseEnv (StateT (ParseState s) (Parsec Void Cursor)) a diff --git a/src/Pact/Types/Info.hs b/src/Pact/Types/Info.hs index 5a59ecdce..889457765 100644 --- a/src/Pact/Types/Info.hs +++ b/src/Pact/Types/Info.hs @@ -71,7 +71,6 @@ instance Arbitrary Parsed where , Directed <$> genFilename <*> genPositiveInt64 <*> genPositiveInt64 <*> genPositiveInt64 <*> genPositiveInt64 ] instance NFData Parsed instance Default Parsed where def = Parsed mempty 0 -instance HasBytes Parsed where bytes = bytes . _pDelta instance Pretty Parsed where pretty = pretty . _pDelta @@ -103,7 +102,7 @@ instance Default Info where def = Info Nothing instance SizeOf Info where sizeOf _ _ = 0 --- make an Info that refers to the indicated text +-- | Make an Info that refers to the indicated text mkInfo :: Text -> Info mkInfo t = Info $ Just (Code t,Parsed delt len) where len = T.length t diff --git a/src/Pact/Types/Orphans.hs b/src/Pact/Types/Orphans.hs index 37e7c6f15..8adbc7609 100644 --- a/src/Pact/Types/Orphans.hs +++ b/src/Pact/Types/Orphans.hs @@ -1,10 +1,8 @@ {-# LANGUAGE DerivingStrategies #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} -{-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE StandaloneDeriving #-} -{-# LANGUAGE TypeSynonymInstances #-} {-# OPTIONS_GHC -fno-warn-orphans #-} -- | -- Module : Pact.Types.Orphans @@ -19,10 +17,7 @@ module Pact.Types.Orphans where import Data.Serialize import Data.Decimal import qualified Data.Aeson as A -import Text.Trifecta.Combinators (DeltaParsing(..)) import Text.Trifecta.Delta -import qualified Data.Attoparsec.Text as AP -import qualified Data.Attoparsec.Internal.Types as APT import Data.Text (Text) import Data.Text.Encoding import Pact.Time.Internal (NominalDiffTime(..), UTCTime(..)) @@ -53,20 +48,6 @@ instance Serialize A.Value where instance NFData Delta - --- | Atto DeltaParsing instance provides 'position' only (with no support for --- hidden chars like Trifecta). -instance DeltaParsing AP.Parser where - line = return mempty - position = attoPos >>= \(APT.Pos p) -> let p' = fromIntegral p in return $ Columns p' p' -- p p - slicedWith f a = (`f` mempty) <$> a - rend = return mempty - restOfLine = return mempty - --- | retrieve pos from Attoparsec. -attoPos :: APT.Parser n APT.Pos -attoPos = APT.Parser $ \t pos more _lose win -> win t pos more pos - instance Default Text where def = "" instance Serialize Text where put = put . encodeUtf8 diff --git a/src/Pact/Types/Parser.hs b/src/Pact/Types/Parser.hs index 04d3b7e7f..5bdfa6d86 100644 --- a/src/Pact/Types/Parser.hs +++ b/src/Pact/Types/Parser.hs @@ -1,3 +1,6 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE CPP #-} +{-# LANGUAGE DerivingStrategies #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE Rank2Types #-} -- | @@ -10,21 +13,37 @@ -- module Pact.Types.Parser - ( + ( -- * Abstract Pact Parser PactParser(..) , symbols , style + + -- * Pact Attoparsec Parser + , PactAttoparsec(..) + , pactAttoParseOnly ) where - import Control.Applicative import Control.Monad -import Text.Trifecta +import Control.Monad.State + +import qualified Data.Attoparsec.Internal.Types as APT +import qualified Data.Attoparsec.Text as AP import qualified Data.HashSet as HS +import qualified Data.Text as T + import Text.Parser.Token.Highlight import Text.Parser.Token.Style +import Text.Trifecta +import Text.Trifecta.Delta as TF +-- -------------------------------------------------------------------------- -- +-- | Abstract Pact Parser +-- +-- On-chain this is use for Attoparsec as parser backend. In the repl trifecta +-- is used. +-- newtype PactParser p a = PactParser { unPactParser :: p a } deriving (Functor, Applicative, Alternative, Monad, MonadPlus, Parsing, CharParsing, DeltaParsing) @@ -45,3 +64,87 @@ style = IdentifierStyle "atom" (HS.fromList ["true","false"]) Symbol ReservedIdentifier + +-- -------------------------------------------------------------------------- -- +-- Pact Attoparsec backend parser + +-- | A wrapper around Attoparsec that adds DeltaParsing +-- +newtype PactAttoparsec a = PactAttoparsec + { runPactAttoparsec :: StateT Int AP.Parser a } + deriving (Functor, Applicative, Alternative, Monad, MonadPlus, MonadState Int) + deriving (Parsing, TokenParsing) + +pactAttoParseOnly :: PactAttoparsec a -> T.Text -> Either String a +pactAttoParseOnly = AP.parseOnly . flip evalStateT 0 . runPactAttoparsec + +-- | Atto DeltaParsing instance provides 'position' only (with no support for +-- hidden chars like Trifecta). +-- +instance DeltaParsing PactAttoparsec where + line = return mempty + + -- Returns @Column NUMBER_OF_CHARS NUMBER_OF_BYTES@ + -- + -- Notes about legacy behavior: + -- + -- For text <2 it used to return for both values the number of 16-bit code + -- units words. With the UTF-16 encoding used by text <2 this is the total + -- number of characters plus number of characters that are encoded using two + -- 16-bit code units. For instance the Ugaritic letter U+1038D would result + -- in a positional length of two (two 16-bit code units). The string + -- "a\0x263A\0x1038D" ("a☺𐎍") would have positional length 4 (3 characters + -- plus one 2 16-bit character). + -- + -- In practice the old behavior was close enough to the number of characters + -- that it went mostly unnoticed and didn't cause harm on chain. The code + -- just assumed that it represented the number text characters. Those + -- numbers appear on chain (up to some block height) within info objects and + -- later still in failure messages. It is also relevant for extracting the + -- module text from the pact transaction before storing it in the pact db. + -- The presence of unicode characters can result in modules containing + -- dangling data because there are less characters in the module than what + -- is assumed based on the position information. + -- + -- For text >=2 the attoparsic position tracks just bytes and the internal + -- representation of UTF-8. For instance the Ugaritic letter U+1038D results + -- in a byte length of 4. The string "a\0x263A\0x1038D" ("a☺𐎍") has 8 bytes + -- (1 code unit plus 3 code unit plus 4 code units). + -- + position = do +#if MIN_VERSION_text(2,0,0) + APT.Pos !bytePos <- parserPos + !charPos <- gets fromIntegral +#elif LEGACY_PARSER == 1 + APT.Pos !bytePos <- parserPos + let !charPos = fromIntegral bytePos +#else + APT.Pos !bytePos <- (* 2) <$> parserPos + !charPos <- gets fromIntegral +#endif + return $ TF.Columns charPos (fromIntegral bytePos) + {-# INLINE position #-} + + slicedWith f a = (`f` mempty) <$> a + rend = return mempty + restOfLine = return mempty + +-- | retrieve pos from Attoparsec. +-- +-- The first parameter to the parser is the remaining available input, which +-- isn't of any help here. +-- +parserPos :: PactAttoparsec APT.Pos +parserPos = PactAttoparsec $ StateT $ \x -> + APT.Parser $ \t !pos more _lose win -> win t pos more (pos, x) + +instance CharParsing PactAttoparsec where + satisfy p = PactAttoparsec (satisfy p) <* modify' (+ 1) + {-# INLINE satisfy #-} + + string s = PactAttoparsec (string s) <* modify' (+ length s) + {-# INLINE string #-} + + text s = PactAttoparsec (text s) <* modify' (+ T.length s) + {-# INLINE text #-} + diff --git a/tests/PactTests.hs b/tests/PactTests.hs index 7a1fa5f3b..0ef4e6621 100644 --- a/tests/PactTests.hs +++ b/tests/PactTests.hs @@ -7,6 +7,7 @@ import qualified KeysetSpec import qualified RoundTripSpec import qualified PrincipalSpec import qualified SizeOfSpec +import qualified Test.Pact.Parse #ifndef ghcjs_HOST_OS import qualified PactTestsSpec @@ -40,6 +41,7 @@ main = hspec $ parallel $ do describe "RoundTripSpec" RoundTripSpec.spec describe "PrincipalSpec" PrincipalSpec.spec describe "SizeOfSpec" SizeOfSpec.spec + describe "Test.Pact.Parse" Test.Pact.Parse.spec #ifndef ghcjs_HOST_OS diff --git a/tests/Test/Pact/Parse.hs b/tests/Test/Pact/Parse.hs new file mode 100644 index 000000000..891fce358 --- /dev/null +++ b/tests/Test/Pact/Parse.hs @@ -0,0 +1,141 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE ViewPatterns #-} + +-- | +-- Module: Test.Pact.Parse +-- Copyright: Copyright © 2023 Kadena LLC. +-- License: MIT +-- Maintainer: Lars Kuhtz +-- Stability: experimental +-- +module Test.Pact.Parse +( spec +) where + +import qualified Data.ByteString as B +import Data.Char +import qualified Data.Text as T +import qualified Data.Text.Encoding as T + +import Test.Hspec +import Test.Hspec.QuickCheck +import Test.QuickCheck + +import qualified Text.Trifecta as D +import qualified Text.Trifecta.Delta as D + +-- internal modules + +import Pact.Parse +import Pact.Types.Parser +import Pact.Types.Exp +import Pact.Types.Info + +-- -------------------------------------------------------------------------- -- +-- Spec + +spec :: Spec +spec = do + spec_position + spec_parsePact + spec_parseModule + +-- -------------------------------------------------------------------------- -- +-- Tests + +-- | This tests the implementation of the `DeltaParsing` instance for +-- "Data.Attoparsec.Text" in "Pact.Types.Orphans". +-- +spec_position :: Spec +spec_position = describe "parsing deltas" $ do + describe "3-bytes utf-8" $ do + check "\"\x200f\"" + check "\"\x200f\x200f\"" + check "\"a\x200f\x200fz\"" + + describe "4-byte utf-8" $ do + check "\"\x1038D\"" + check "\"\x1038D\x10385\"" + + prop "parse delta" $ \x -> + let s = T.pack (show @T.Text x) + in getCols s `shouldBe` Right (cols s) + + prop "parse delta 2" $ \(Lit x) -> + getCols (quoted x) === Right (cols (quoted x)) + where + + -- Check that `pactAttoParseOnly` parses a `Text` as a string literal + -- and produces the Delta that we would expect (it has the number of + -- characters and the number of bytes that `text` and `bytestring` + -- report). + check :: T.Text -> Spec + check s = it (show s) $ getCols s `shouldBe` Right (cols s) + + getCols :: T.Text -> Either String D.Delta + getCols = pactAttoParseOnly (D.stringLiteral @_ @T.Text *> D.position <* D.eof) + +spec_parsePact :: Spec +spec_parsePact = describe "parsePact string literal" $ do + it "U-f002" $ check "\x200f" + it "U-f002U-f002" $ check "\x200f\x200f" + it "U-f002mU-f002" $ check "\x200fm\x200f" + it "aU-f002U-f002z" $ check "a\x200f\x200fz" + it "aU-f002mU-f002z" $ check "a\x200fm\x200fz" + prop "parse string literal" $ \(Lit s) -> check s + where + check s = + parsePact (quoted s) `shouldBe` Right (expected s (fromIntegral $ D.column $ cols (quoted s))) + expected s l = ParsedCode + { _pcCode = quoted s + , _pcExps = + [ ELiteral $ LiteralExp + { _litLiteral = LString {_lString = s} + , _litInfo = Parsed {_pDelta = D.Columns 0 0, _pLength = l} + } + ] + } + +spec_parseModule :: Spec +spec_parseModule = do + it "parses a module" $ do + pm `shouldBe` m + i `shouldBe` Parsed (D.Columns 0 0) (T.length m) + where + Right (ParsedCode pm [EList ListExp { _listInfo = i } ]) = parsePact m + m = T.unlines + [ "(module m G" + , " (defcap G () true)" + , " (defun f () true)" + , ")" + ] + +-- -------------------------------------------------------------------------- -- +-- Utils + +newtype Lit = Lit { unLit :: T.Text } + deriving (Show, Eq, Ord) + +instance Arbitrary Lit where + arbitrary = Lit + . T.filter (/= '\\') + . T.filter (/= '"') + . T.filter (not . isControl) + <$> arbitrary + +quoted :: T.Text -> T.Text +quoted s = "\"" <> s <> "\"" + +-- | Produce a trifecta Delta from a given `Text` string. +-- `Columns` arguments are the number of characters and the number of bytes. +cols :: T.Text -> D.Delta +#if MIN_VERSION_text(2,0,0) +cols s = D.Columns (fromIntegral $ T.length s) (fromIntegral $ B.length (T.encodeUtf8 s)) +#else +cols s = D.Columns (fromIntegral $ T.length s) (fromIntegral $ B.length (T.encodeUtf16LE s)) +#endif +