diff --git a/.github/workflows/haskell-ci.yml b/.github/workflows/haskell-ci.yml index c4f8cbf..27dd9eb 100644 --- a/.github/workflows/haskell-ci.yml +++ b/.github/workflows/haskell-ci.yml @@ -57,4 +57,5 @@ jobs: run: | cabal install doctest cabal repl javelin --with-ghc=doctest - cabal repl javelin-io --with-ghc=doctest \ No newline at end of file + cabal repl javelin-io --with-ghc=doctest + cabal repl javelin-frames --with-ghc=doctest \ No newline at end of file diff --git a/README.md b/README.md index d047ac3..ea7fc7b 100644 --- a/README.md +++ b/README.md @@ -1,34 +1,24 @@ -# Haskell implementation of labeled one-dimensional arrays +# Haskell implementation of data structures for data science -Packages in this repository implement series, or labeled one-dimensional arrays, and associated functions. - -Like [`Data.Map.Strict`](https://hackage.haskell.org/package/containers/docs/Data-Map-Strict.html), series support efficient: - -* random access by key ( $\mathcal{O}\left( \log n \right)$ ) ; -* slice by key ( $\mathcal{O}\left( \log n \right)$ ). - -Like [`Data.Vector.Vector`](https://hackage.haskell.org/package/vector/docs/Data-Vector.html), series support efficient: - -* random access by integer index ( $\mathcal{O}\left( 1 \right)$ ); -* slice by integer index ( $\mathcal{O}\left( 1 \right)$ ); -* numerical operations. +Packages in this repository implement series and dataframes, data structures which are ubiquitous in data science. ## Tutorial and documentation -A tutorial and interface documentation for the most recent published version are [available here](https://hackage.haskell.org/package/javelin). +A tutorial and interface documentation for the most recent published version are [available here for series](https://hackage.haskell.org/package/javelin). A tutorial and interface documentation for dataframes is coming. Locally, you can generate documentation for all packages using `haddock` like so: ```bash cabal haddock javelin cabal haddock javelin-io +cabal haddock javelin-frames ``` ## Get involved! Do not hesitate to make feature requests or report bugs via the [issue tracker](https://github.com/LaurentRDC/javelin/issues). -## Preliminary benchmarks +## Preliminary benchmarks for series Looking up random integers: diff --git a/javelin-frames/CHANGELOG.md b/javelin-frames/CHANGELOG.md new file mode 100644 index 0000000..80644a7 --- /dev/null +++ b/javelin-frames/CHANGELOG.md @@ -0,0 +1,5 @@ +# Revision history for javelin-frames + +## 0.1.0.0 -- YYYY-mm-dd + +* First version. Released on an unsuspecting world. diff --git a/javelin-frames/LICENSE b/javelin-frames/LICENSE new file mode 100644 index 0000000..38137d9 --- /dev/null +++ b/javelin-frames/LICENSE @@ -0,0 +1,20 @@ +Copyright (c) 2025 Laurent Rene de Cotret + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/javelin-frames/benchmarks/Main.hs b/javelin-frames/benchmarks/Main.hs new file mode 100644 index 0000000..36fc1c9 --- /dev/null +++ b/javelin-frames/benchmarks/Main.hs @@ -0,0 +1,53 @@ +{-# LANGUAGE DeriveAnyClass #-} +{-# LANGUAGE TypeFamilies #-} + +import Control.DeepSeq ( NFData, rnf ) +import Control.Exception ( evaluate ) +import Criterion.Main ( bench, bgroup, nf, defaultMain ) + +import Data.Frame ( Column, Frameable, Indexable, Row, Frame ) +import qualified Data.Frame as Frame +import qualified Data.Vector as Vector + +import GHC.Generics ( Generic ) + + +data Bench t + = MkBench { field1 :: Column t Int + , field2 :: Column t Int + , field3 :: Column t Int + , field4 :: Column t Int + , field5 :: Column t Int + , field6 :: Column t Int + } + deriving (Generic, Frameable) + +instance NFData (Row Bench) +instance NFData (Frame Bench) + +instance Indexable Bench where + type Key Bench = Int + + index = field1 + + +main :: IO () +main = do + let rs = Vector.fromList [MkBench ix 0 0 0 0 0 | ix <- [0::Int .. 100_000]] + fr = Frame.fromRows rs + evaluate $ rnf rs + evaluate $ rnf fr + defaultMain + [ bgroup "Row-wise operations" + [ bench "fromRows" $ nf (Frame.fromRows) rs + , bench "toRows" $ nf (Frame.toRows) fr + , bench "toRows . fromRows" $ nf (Frame.fromRows . Frame.toRows) fr + , bench "fromRows . toRows" $ nf (Frame.toRows . Frame.fromRows) rs + ] + , bgroup "Lookups" + [ bench "lookup" $ nf (Frame.lookup 100) fr + , bench "ilookup" $ nf (Frame.ilookup 99) fr + , bench "at" $ nf (`Frame.at` (100, field5)) fr + , bench "iat" $ nf (`Frame.iat` (99, field5)) fr + ] + ] \ No newline at end of file diff --git a/javelin-frames/javelin-frames.cabal b/javelin-frames/javelin-frames.cabal new file mode 100644 index 0000000..931d52c --- /dev/null +++ b/javelin-frames/javelin-frames.cabal @@ -0,0 +1,76 @@ +cabal-version: 3.0 +name: javelin-frames +version: 0.1.0.0 +synopsis: Type-safe data frames based on higher-kinded types. +-- description: +license: MIT +license-file: LICENSE +author: Laurent P. René de Cotret +maintainer: laurent.decotret@outlook.com +category: Data, Data Structures, Data Science +build-type: Simple +extra-doc-files: CHANGELOG.md +tested-with: GHC ==9.12.1 + || ==9.10.1 + || ==9.8.4 + || ==9.6.4 + || ==9.4.8 + +description: + + This package implements data frames, a data structure + where record types defined by the user can be transformed + into records of columns. + +source-repository head + type: git + location: https://github.com/LaurentRDC/javelin + +common common + default-language: GHC2021 + ghc-options: -Wall + -Wcompat + -Widentities + -Wincomplete-uni-patterns + -Wincomplete-record-updates + -Wredundant-constraints + -fhide-source-paths + -Wpartial-fields + +library + import: common + exposed-modules: Data.Frame + Data.Frame.Tutorial + build-depends: base >=4.15.0.0 && <4.22, + containers >=0.6 && <0.8, + vector >=0.12.3.0 && <0.14, + hs-source-dirs: src + default-language: GHC2021 + +test-suite javelin-frames-test + import: common + default-language: GHC2021 + type: exitcode-stdio-1.0 + hs-source-dirs: test + main-is: Main.hs + other-modules: Test.Data.Frame + build-depends: base >=4.15.0.0 && <4.22, + containers, + hedgehog, + javelin-frames, + tasty, + tasty-hedgehog, + tasty-hunit, + vector + +benchmark bench-frames + import: common + type: exitcode-stdio-1.0 + ghc-options: -rtsopts + hs-source-dirs: benchmarks + main-is: Main.hs + build-depends: base >=4.15.0.0 && <4.22, + criterion ^>=1.6, + deepseq, + javelin-frames, + vector \ No newline at end of file diff --git a/javelin-frames/src/Data/Frame.hs b/javelin-frames/src/Data/Frame.hs new file mode 100644 index 0000000..7111406 --- /dev/null +++ b/javelin-frames/src/Data/Frame.hs @@ -0,0 +1,528 @@ +{-# LANGUAGE DefaultSignatures #-} +{-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE RecordWildCards #-} +{-# OPTIONS_GHC -Wno-inline-rule-shadowing #-} +----------------------------------------------------------------------------- +-- | +-- Module : Data.Frame +-- Copyright : (c) Laurent P. René de Cotret +-- License : MIT +-- Maintainer : laurent.decotret@outlook.com +-- Portability : portable +-- +-- This is an experimental interface to dataframes. +-- +-- This module defines the type machinery and some functions to +-- process data frames. Data frames are structures where every +-- row corresponds to an object, but data is stored in +-- contiguous arrays known as columns. +-- +-- A user guide is provided in the "Data.Frame.Tutorial" module. + +module Data.Frame ( + -- * Defining dataframe types + Column, Frameable, Row, Frame, + -- * Construction and deconstruction + fromRows, toRows, fields, + -- * Operations on rows + null, length, mapFrame, mapFrameM, filterFrame, zipFramesWith, foldlFrame, + -- * Displaying frames + DisplayOptions(..), defaultDisplayOptions, display, displayWith, + + -- * Indexing operations + -- ** Based on integer indices + ilookup, iat, + -- ** Based on indexable frames + Indexable(Key, index), lookup, at +) where + + +import Control.Exception (assert) +import Data.Bifunctor (second) +import qualified Data.Foldable +import Data.Functor.Identity (Identity(..)) +import Data.Kind (Type) +import qualified Data.List as List ( intersperse, foldl' ) +import Data.Maybe (catMaybes) +import Data.Semigroup (Max(..)) +import qualified Data.Set as Set +import Data.Vector (Vector) +import qualified Data.Vector +import Prelude hiding (lookup, null, length) +import qualified Prelude +import GHC.Generics ( Selector, Generic(..), S, D, C, K1(..), Rec0, M1(..), type (:*:)(..), selName ) + + +-- $setup +-- >>> import qualified Data.Vector as Vector + +-- | Build a dataframe from a container of rows. +-- +-- For the inverse operation, see `toRows`. +fromRows :: (Frameable t, Foldable f) + => f (Row t) + -> Frame t +fromRows = pack . Data.Vector.fromList . Data.Foldable.toList +{-# INLINE[~2] fromRows #-} + +-- | Deconstruct a dataframe into its rows. +-- +-- For the inverse operation, see `fromRows`. +toRows :: Frameable t + => Frame t + -> Vector (Row t) +toRows = unpack +{-# INLINE[~2] toRows #-} + +-- TODO: Chaining operations such as `mapFrame` and `filterFrame` +-- should benefit from optimizing as `toRows . fromRows = id` +-- ( and `fromRows . toRows = id` as well). +-- See the rules below. +-- It's not clear if I'm using the rewrite system correctly, +-- by looking at the benchmark resuylts +{-# RULES +"fromRows/toRows" [2] fromRows . toRows = id +"toRows/fromRows" [2] toRows . fromRows = id + #-} + +-- | Returns `True` if a dataframe has no rows. +null :: Frameable t + => Frame t + -> Bool +-- TODO: we can use yet another typeclass deriving +-- from generic to only look at ONE of the columns, +-- rather than reconstructing the first row +null = Data.Vector.null . toRows + + +length :: Frameable t + => Frame t + -> Int +-- TODO: we can use yet another typeclass deriving +-- from generic to only look at ONE of the columns, +-- rather than reconstructing all rows. +length = Data.Vector.length . toRows + + +-- | Map a function over each row individually. +-- +-- For mapping with a monadic action, see `mapFrameM`. +mapFrame :: (Frameable t1, Frameable t2) + => (Row t1 -> Row t2) + -> Frame t1 + -> Frame t2 +mapFrame f = fromRows + . Data.Vector.map f + . toRows + + +-- | Map each element of a dataframe to a monadic action, evaluate +-- these actions from left to right, and collect the result +-- in a new dataframe. +-- +-- For mapping without a monadic action, see `mapFrame`. +mapFrameM :: (Frameable t1, Frameable t2, Monad m) + => (Row t1 -> m (Row t2)) + -> Frame t1 + -> m (Frame t2) +mapFrameM f = fmap fromRows + . Data.Vector.mapM f + . toRows + + +-- | Filter rows from a @`Frame` t@, only keeping +-- the rows where the predicate is `True`. +filterFrame :: (Frameable t) + => (Row t -> Bool) + -> Frame t + -> Frame t +filterFrame f = fromRows + . Data.Vector.filter f + . toRows + + +-- | Zip two frames together using a combination function. +-- Rows from each frame are matched in order; the resulting +-- frame will only contain as many rows as the shortest of +-- the two input frames +zipFramesWith :: (Frameable t1, Frameable t2, Frameable t3) + => (Row t1 -> Row t2 -> Row t3) + -> Frame t1 + -> Frame t2 + -> Frame t3 +zipFramesWith f xs ys + = fromRows + $ Data.Vector.zipWith f + (toRows xs) + (toRows ys) + + +-- | Left-associative fold of a structure but with strict application of the operator. +foldlFrame :: Frameable t + => (b -> Row t -> b) -- ^ Reduction function that takes in individual rows + -> b -- ^ Initial value for the accumulator + -> Frame t -- ^ Data frame + -> b +foldlFrame f start + = Data.Vector.foldl' f start . toRows + + +-- | Access a row from a dataframe by its integer index. Indexing +-- starts at 0, representing the first row. +-- +-- If the index is larger than the number of rows, this function +-- returns `Nothing`. +-- +-- To access a specific row AND column, `iat` is much more efficient. +-- +-- To lookup a row based on a non-integer index, see `lookup`. +ilookup :: Frameable t + => Int + -> Frame t + -> Maybe (Row t) +ilookup = iindex + + +-- | Look up a row in a data frame by key. The specific key +-- is defined by the `Indexable` instance of type @t@. +-- +-- The first row whose index matches the supplied key is +-- returned. If no row has a matching key, returns `Nothing`. +-- +-- If you need to look up a particular row and column, +-- `at` is much more efficient. +-- +-- To lookup a row based on an integer index, see `ilookup`. +lookup :: (Indexable t) + => Key t + -> Frame t + -> Maybe (Row t) +lookup key fr + = Data.Vector.findIndex (==key) (index fr) + >>= flip ilookup fr + + +-- | Lookup an element of a frame by row and column. +-- +-- This is much more efficient than looking up an entire row +-- using `lookup`, and then selecting a specific field from a row. +-- +-- To lookup an element by integer row index instead, see `iat`. +at :: (Indexable t) + => Frame t + -> (Key t, Frame t -> Vector a) + -> Maybe a +fr `at` (row, col) + = Data.Vector.findIndex (==row) (index fr) + >>= \ix -> (col fr) Data.Vector.!? ix + + +-- | Lookup an element of the frame by row index and column +-- +-- This is much more efficient than looking up an entire row +-- using `ilookup`, and then selecting a specific field from a row. +-- +-- To lookup an element by row key instead, see `at`. +iat :: Frame t + -> (Int, Frame t -> Vector a) + -> Maybe a +fr `iat` (rowIx, col) = (col fr) Data.Vector.!? rowIx + + +-- | Type family which allows for higher-kinded record types +-- in two forms: +-- +-- * Single record type using `Identity`, where @`Column` Identity a ~ a@ ; +-- * Record type whose elements are some other functor (usually `Vector`). +-- +-- Types are created like regular record types, but each element +-- must have the type @`Column` f a@ instead of @a@. +type family Column (f :: Type -> Type) x where + Column Identity x = x + Column f x = f x + +-- | Type synonym for a record type with scalar elements +type Row (dt :: (Type -> Type) -> Type) = dt Identity + +-- | Type synonym for a record type whose elements are arrays (columns) +type Frame (dt :: (Type -> Type) -> Type) = dt Vector + + +-- | Typeclass to generically derive the function `fromRows`. +class GFromRows tI tV where + gfromRows :: Vector (tI a) -> (tV a) + +instance GFromRows (Rec0 a) (Rec0 (Vector a)) where + gfromRows = K1 . Data.Vector.map unK1 + {-# INLINEABLE gfromRows #-} + +instance (GFromRows tI1 tV1, GFromRows tI2 tV2) + => GFromRows (tI1 :*: tI2) (tV1 :*: tV2) where + gfromRows vs = let (xs, ys) = Data.Vector.unzip $ Data.Vector.map (\(x :*: y) -> (x, y)) vs + in gfromRows xs :*: gfromRows ys + {-# INLINEABLE gfromRows #-} + +instance GFromRows tI tV => GFromRows (M1 i c tI) (M1 i c tV) where + gfromRows vs = M1 (gfromRows (Data.Vector.map unM1 vs)) + {-# INLINEABLE gfromRows #-} + + +-- | Typeclass to generically derive the function `toRows`. +class GToRows tI tV where + gtoRows :: tV a -> Vector (tI a) + +instance GToRows (Rec0 a) (Rec0 (Vector a)) where + gtoRows = Data.Vector.map K1 . unK1 + {-# INLINEABLE gtoRows #-} + +instance (GToRows tI1 tV1, GToRows tI2 tV2) + => GToRows (tI1 :*: tI2) (tV1 :*: tV2) where + gtoRows (xs :*: ys) = Data.Vector.zipWith (:*:) (gtoRows xs) (gtoRows ys) + {-# INLINEABLE gtoRows #-} + +instance (GToRows tI tV) => GToRows (M1 i c tI) (M1 i c tV) where + -- gtoRows :: M1 i c tV a -> Vector (M1 i c tI a) + gtoRows = Data.Vector.map M1 . gtoRows . unM1 + {-# INLINEABLE gtoRows #-} + +class GILookup tI tV where + gilookup :: Int -> tV a -> Maybe (tI a) + +instance GILookup (Rec0 a) (Rec0 (Vector a)) where + gilookup ix vs = K1 <$> (unK1 vs) Data.Vector.!? ix + +instance (GILookup tI1 tV1, GILookup tI2 tV2) + => GILookup (tI1 :*: tI2) (tV1 :*: tV2) where + gilookup ix (xs :*: ys) + = (:*:) + <$> (gilookup ix xs) + <*> (gilookup ix ys) + +instance (GILookup tI tV) => GILookup (M1 i c tI) (M1 i c tV) where + gilookup ix = fmap M1 . gilookup ix . unM1 + + +class GFields r where + gfields :: r a -> [(String, String)] + +instance GFields r => GFields (M1 D x r) where + gfields = gfields . unM1 + +instance GFields t => GFields (M1 C x t) where + gfields = gfields . unM1 + +instance (Show r, Selector s) => GFields (M1 S s (Rec0 r)) where + gfields (M1 (K1 r)) = [(selName (undefined :: M1 S s (Rec0 r) ()), show r)] + +instance (GFields f, GFields g) => GFields (f :*: g) where + gfields (x :*: y) = gfields x ++ gfields y + +-- | Typeclass that endows any record type @t@ with the ability to be packaged +-- as a dataframe. +-- +-- Under no circumstances should you write instances for `Frameable`; instead, +-- simply derive an instance of `Generic` for @t@. +class Frameable t where + + -- | Package single rows of type @t@ into a @`Frame` t@. + pack :: Vector (Row t) -> Frame t + + default pack :: ( Generic (Row t) + , Generic (Frame t) + , GFromRows (Rep (Row t)) (Rep (Frame t)) + ) + => Vector (Row t) + -> Frame t + pack = to . gfromRows . Data.Vector.map from + {-# INLINABLE pack #-} + + -- | Unpack a dataframe into rows + unpack :: Frame t -> Vector (Row t) + + default unpack :: ( Generic (Row t) + , Generic (Frame t) + , GToRows (Rep (Row t)) (Rep (Frame t)) + ) + => Frame t + -> Vector (Row t) + unpack = Data.Vector.map to . gtoRows . from + {-# INLINABLE unpack #-} + + + -- | Look up a row from the frame by integer index + iindex :: Int -> Frame t -> Maybe (Row t) + + default iindex :: ( Generic (Frame t) + , Generic (Row t) + , GILookup (Rep (Row t)) (Rep (Frame t)) + ) + => Int + -> Frame t + -> Maybe (Row t) + iindex ix = fmap to . gilookup ix . from + + -- | Return the field names associated with a row or frame. + -- This is useful to display frames + fields :: Row t -> [(String, String)] + + default fields :: ( Generic (Row t) + , GFields (Rep (Row t)) + ) + => Row t + -> [(String, String)] + fields = gfields . from + + +-- | Typeclass for dataframes with an index, a column or set of columns that can +-- be used to search through rows. +-- +-- An index need not be unique, but the type of its keys must be an instance of `Eq`. +class ( Frameable t + , Eq (Key t) -- Effectively required for lookups + ) => Indexable t where + + -- | A type representing a lookup key for a dataframe. + -- This can be a single field, or a compound key composed + -- of multiple fields + type Key t + + -- | How to create an index from a @`Frame` t@. This is generally + -- done by using record selectors. + index :: Frame t -> Vector (Key t) + + +-- | Control how `displayWith` behaves. +data DisplayOptions t + = DisplayOptions + { maximumNumberOfRows :: Int + -- ^ Maximum number of rows shown. These rows will be distributed evenly + -- between the start of the frame and the end + , rowDisplayFunction :: Row t -> [(String, String)] + -- ^ Function used to display rows from the frame. This should be a map from + -- record name to value. + } + + +-- | Default 'Series' display options. +defaultDisplayOptions :: Frameable t => DisplayOptions t +defaultDisplayOptions + = DisplayOptions { maximumNumberOfRows = 6 + , rowDisplayFunction = fields + } + + +-- | Display a @`Frame` t@ using default 'DisplayOptions'. +-- +-- Example: +-- +-- >>> :{ +-- data Student f +-- = MkStudent { studentName :: Column f String +-- , studentAge :: Column f Int +-- , studentMathGrade :: Column f Char +-- } +-- deriving (Generic, Frameable) +-- :} +-- +-- >>> students = fromRows $ Vector.fromList [MkStudent "Albert" 12 'C', MkStudent "Beatrice" 13 'B', MkStudent "Clara" 12 'A'] +-- >>> putStrLn (display students) +-- studentName | studentAge | studentMathGrade +-- ----------- | ---------- | ---------------- +-- "Albert" | 12 | 'C' +-- "Beatrice" | 13 | 'B' +-- "Clara" | 12 | 'A' +display :: Frameable t + => Frame t + -> String +display = displayWith defaultDisplayOptions + + +-- | Display a @`Frame` t@ using custom 'DisplayOptions'. +-- +-- Example: +-- +-- >>> :{ +-- data Student f +-- = MkStudent { studentName :: Column f String +-- , studentAge :: Column f Int +-- , studentMathGrade :: Column f Char +-- } +-- deriving (Generic, Frameable) +-- :} +-- +-- >>> :{ +-- students = fromRows +-- $ Vector.fromList +-- [ MkStudent "Albert" 12 'C' +-- , MkStudent "Beatrice" 13 'B' +-- , MkStudent "Clara" 12 'A' +-- , MkStudent "David" 13 'A' +-- , MkStudent "Erika" 13 'D' +-- , MkStudent "Frank" 11 'C' +-- ] +-- :} +-- +-- >>> putStrLn (displayWith (defaultDisplayOptions{maximumNumberOfRows=2}) students) +-- studentName | studentAge | studentMathGrade +-- ----------- | ---------- | ---------------- +-- "Albert" | 12 | 'C' +-- ... | ... | ... +-- "Frank" | 11 | 'C' +displayWith :: (Frameable t) + => DisplayOptions t + -> Frame t + -> String +displayWith DisplayOptions{..} df + = if null df + then "" -- TODO: it IS possible to determine the record names + -- without having any rows + else formatGrid rows + + where + len = length df + n = max 1 (maximumNumberOfRows `div` 2) + -- We prevent overlap between the 'head' rows and 'tail' rows + -- by favoring removing duplicate integer indices from the tail rows + headIxs = Set.fromList [0 .. n - 1] + tailIxs = Set.fromList [len - n ..len] `Set.difference` headIxs + headRows = catMaybes [ilookup i df | i <- Set.toList headIxs] + tailRows = catMaybes [ilookup j df | j <- Set.toList tailIxs] + + firstRow = case headRows of + [] -> error "Impossible!" -- We already checked that `df` won't be empty + [xs] -> xs + (xs:_) -> xs + + spacerRow = + if len > maximumNumberOfRows + then [(map (second (const "...")) (fields firstRow))] + else mempty + rows = (fields <$> headRows) ++ spacerRow ++ (fields <$> tailRows) + + (headerLengths :: [(String, Int)]) = (map (\(k, _) -> (k, Prelude.length k)) (fields firstRow)) + (colWidths :: [(String, Int)]) + = map (second getMax) + $ List.foldl' + (\acc mp -> zipWith (\(k1, v1) (k2, v2) -> ((assert (k1 == k2) k1, v1 <> v2))) acc (map (second (Max . Prelude.length)) mp)) + (map (second Max) headerLengths) + rows + + -- | Format a grid represented by a list of rows, where every row is a list of items + -- All columns will have a fixed width + formatGrid :: [ [(String, String)]] -- List of rows + -> String + formatGrid rs = mconcat $ List.intersperse "\n" + $ [ mconcat $ List.intersperse " | " [ (pad w k) | (k, w) <- colWidths]] + ++ [ mconcat $ List.intersperse " | " [ (pad w (replicate w '-')) | (_, w) <- colWidths]] + ++ [ mconcat $ List.intersperse " | " [ (pad w v) + | ((_, v), (_, w)) <- zip mp colWidths + ] + | mp <- rs + ] + where + -- | Pad a string to a minimum of @n@ characters wide. + pad :: Int -> String -> String + pad minNumChars s + | minNumChars <= Prelude.length s = s + | otherwise = replicate (minNumChars - Prelude.length s) ' ' <> s \ No newline at end of file diff --git a/javelin-frames/src/Data/Frame/Tutorial.hs b/javelin-frames/src/Data/Frame/Tutorial.hs new file mode 100644 index 0000000..b5dfb61 --- /dev/null +++ b/javelin-frames/src/Data/Frame/Tutorial.hs @@ -0,0 +1,118 @@ +{-# LANGUAGE DeriveGeneric #-} +{-# OPTIONS_GHC -fno-warn-unused-imports #-} +-- | +-- Module : $header +-- Copyright : (c) Laurent P. René de Cotret +-- License : MIT +-- Maintainer : laurent.decotret@outlook.com +-- Portability : portable +-- +module Data.Frame.Tutorial ( + -- * Introduction + -- $introduction + + -- * Quick tour + -- $quicktour + +) where + +import Data.Frame +import Data.Vector as Vector +import GHC.Generics (Generic) + +{- $introduction + +-} + +{- $quicktour +Let's look at a real example. First, let's get some setup out of the way. We must +activate @-XDeriveAnyClass@ to automatically derive `Frameable`: + +>>> :set -XDeriveAnyClass + +and we'll import the "Data.Vector" module as well: + +>>> import Data.Vector as Vector + +We define + +>>> :{ + data Student f + = MkStudent { studentName :: Column f String + , studentAge :: Column f Int + , studentMathGrade :: Column f Char + } + deriving (Generic, Frameable) + :} + +We use `fromRows` to pack individual students into a dataframe: + +>>> :{ + students = fromRows + $ Vector.fromList [ MkStudent "Albert" 12 'C' + , MkStudent "Beatrice" 13 'B' + , MkStudent "Clara" 12 'A' + ] + :} + +We can render the dataframe @students@ into a nice string using `display` +(and print that string using using `putStrLn`): + +>>> putStrLn (display students) +studentName | studentAge | studentMathGrade +----------- | ---------- | ---------------- + "Albert" | 12 | 'C' + "Beatrice" | 13 | 'B' + "Clara" | 12 | 'A' + +== Operations on rows + +Many operations that treat a dataframe as an array +of rows are provided. + +There's `mapFrame` to map each row to a new structure: + +>>> :{ + putStrLn + $ display + $ mapFrame + (\(MkStudent name age grade) -> MkStudent name (2*age) grade) + students +:} +studentName | studentAge | studentMathGrade +----------- | ---------- | ---------------- + "Albert" | 24 | 'C' + "Beatrice" | 26 | 'B' + "Clara" | 24 | 'A' + +There's `filterFrame` to keep specific rows: + +>>> :{ + putStrLn + $ display + $ filterFrame + (\(MkStudent _ _ grade) -> grade < 'C') + students +:} +studentName | studentAge | studentMathGrade +----------- | ---------- | ---------------- + "Beatrice" | 13 | 'B' + "Clara" | 12 | 'A' + +Finally, there's `foldlFrame` to summarize a dataframe by using whole rows: + +>>> import Data.Char (ord) +>>> :{ + foldlFrame + (\acc (MkStudent _ age grade) -> acc + age + ord grade) + (0 :: Int) + students +:} +235 +-} + +{- $construction + +Records + +-} \ No newline at end of file diff --git a/javelin-frames/test/Main.hs b/javelin-frames/test/Main.hs new file mode 100644 index 0000000..ac105df --- /dev/null +++ b/javelin-frames/test/Main.hs @@ -0,0 +1,11 @@ +module Main (main) where + +import qualified Test.Data.Frame + +import Test.Tasty ( defaultMain, testGroup ) + +main :: IO () +main = defaultMain + $ testGroup "Test suite" + [ Test.Data.Frame.tests + ] \ No newline at end of file diff --git a/javelin-frames/test/Test/Data/Frame.hs b/javelin-frames/test/Test/Data/Frame.hs new file mode 100644 index 0000000..3888a49 --- /dev/null +++ b/javelin-frames/test/Test/Data/Frame.hs @@ -0,0 +1,115 @@ +{-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE TypeFamilies #-} +module Test.Data.Frame (tests) where + +import Control.Monad (guard, forM_) +import Data.Frame as Frame hiding (length) +import qualified Data.List as List (intersperse) +import qualified Data.Set as Set +import qualified Data.Vector as Vector + +import GHC.Generics (Generic) + +import Hedgehog ( property, forAll, (===) ) +import qualified Hedgehog.Gen as Gen +import qualified Hedgehog.Range as Range + +import Test.Tasty ( testGroup, TestTree ) +import Test.Tasty.Hedgehog ( testProperty ) +import Test.Tasty.HUnit + +tests :: TestTree +tests = testGroup "Data.Frame" [ testToFromRowsTripping + , testLookup + , testFields + , testDisplay + ] + +data User f + -- Note that the fields are NOT ordered alphabetically, + -- which is important for the display test cases. + -- We want to present dataframes as the user intended it. + = MkUser { userName :: Column f String + , userAge :: Column f Int + } + deriving (Generic) + + +instance Frameable User +deriving instance Show (Row User) +deriving instance Eq (Row User) + +testToFromRowsTripping :: TestTree +testToFromRowsTripping = testProperty "Ensure that `toRows` and `fromRows` are inverses" $ property $ do + users <- forAll $ Vector.fromList <$> + Gen.list (Range.linear 0 100) + (MkUser <$> Gen.string (Range.linear 0 100) Gen.alpha + <*> Gen.integral (Range.linear 10 25) + ) + users === toRows (fromRows users) + + +instance Indexable User where + type Key User = String + index = userName + + +testLookup :: TestTree +testLookup = testProperty "Ensure that `lookup` works" $ property $ do + users <- forAll $ Vector.fromList <$> + Gen.list (Range.linear 0 100) + (MkUser <$> Gen.string (Range.linear 0 100) Gen.alpha + <*> Gen.integral (Range.linear 10 25) + ) + + -- This property only makes sense for a unique index + guard (unique (Vector.map userName users)) + + let df = fromRows users + + forM_ users $ \user -> do + Frame.lookup (userName user) df === Just user + + where + unique :: Ord a => Vector.Vector a -> Bool + unique vs = length (Set.fromList (Vector.toList vs)) == Vector.length vs + + +testFields :: TestTree +testFields = testCase "Appropriately accessing field names and values" $ do + let row = MkUser "Alice" 37 + assertEqual mempty ([("userName", "\"Alice\""), ("userAge", "37")]) (fields row) + + +testDisplay :: TestTree +testDisplay = + let frame = fromRows [ MkUser "Alice" 37 + , MkUser "Bob" 38 + , MkUser "Clara" 39 + , MkUser "David" 40 + ] + in testGroup "displaytWith" [ + testCase "Appropriately displaying all rows" $ do + let displayed = Frame.displayWith (Frame.defaultDisplayOptions {maximumNumberOfRows = 4}) frame + expectation = unlines' [ "userName | userAge" + , "-------- | -------" + , " \"Alice\" | 37" + , " \"Bob\" | 38" + , " \"Clara\" | 39" + , " \"David\" | 40" + ] + + assertEqual mempty expectation displayed, + testCase "Appropriately eliding some rows" $ do + let displayed = Frame.displayWith (Frame.defaultDisplayOptions {maximumNumberOfRows = 2}) frame + expectation = unlines' [ "userName | userAge" + , "-------- | -------" + , " \"Alice\" | 37" + , " ... | ..." + , " \"David\" | 40" + ] + + assertEqual mempty expectation displayed + ] + where + unlines' = mconcat . List.intersperse "\n" \ No newline at end of file diff --git a/javelin/src/Data/Series/Tutorial.hs b/javelin/src/Data/Series/Tutorial.hs index 6f1a353..f26ae2e 100644 --- a/javelin/src/Data/Series/Tutorial.hs +++ b/javelin/src/Data/Series/Tutorial.hs @@ -608,7 +608,7 @@ in both series. In this case, we can use 'Series.zipWithMatched': "Poland" | 121.40 Finally, in case we want full control over what to do when a key is missing, -we can use @Series.zipWithStrategy'. For example, consider the case where: +we can use `Series.zipWithStrategy'. For example, consider the case where: * If population numbers are missing, I want to set the density to 0; * If land mass information is missing, I wait to skip calculating the density of this country.