Skip to content

Commit

Permalink
Allow to merge two indexable dataframes by mapping their respective i…
Browse files Browse the repository at this point in the history
…ndexes to a common type
  • Loading branch information
LaurentRDC committed Feb 1, 2025
1 parent 55ef603 commit 4a6924f
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 12 deletions.
53 changes: 42 additions & 11 deletions javelin-frames/src/Data/Frame.hs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ module Data.Frame (
-- * Operations on rows
null, length, mapRows, mapRowsM, filterRows, foldlRows,
-- ** Sorting rows in frames
sortRowsBy, sortRowsByUnique, sortRowsByKey, sortRowsByKeyUnique,
sortRowsBy, sortRowsByUnique,
sortRowsByKey, sortRowsByKeyUnique, sortRowsByKeyUniqueOn,

-- * Displaying frames
display,
Expand All @@ -47,7 +48,7 @@ module Data.Frame (
-- ** Zipping rows in order
zipRowsWith,
-- ** Merging using an index
mergeWithStrategy, matchedStrategy,
mergeWithStrategy, mergeWithStrategyOn, matchedStrategy,
-- *** Defining your own strategies
These(..),
) where
Expand Down Expand Up @@ -405,22 +406,36 @@ sortRowsByKey df =
sortRowsByKeyUnique :: (Indexable t)
=> Frame t
-> Frame t
sortRowsByKeyUnique df =
sortRowsByKeyUnique = sortRowsByKeyUniqueOn id


-- | Sort the rows of a frame by a key of type @k@, obtained by applying
-- a mapping function to the index defined by the `Indexable` typeclass.
-- Rows with duplicate keys are pruned. Note that duplicates are detected
-- on the /mapped/ key: rows whose indexes differ but map to the same @k@
-- are considered duplicates of each other.
--
-- The underlying sorting algorithm is timsort (via
-- `TimSort.sortUniqBy`), which minimizes the number
-- of comparisons used.
sortRowsByKeyUniqueOn :: (Ord k, Indexable t)
=> (Key t -> k) -- ^ How to map the index of the dataframe onto a sort key of type @k@
-> Frame t -- ^ Frame whose rows are to be sorted and pruned
-> Frame t
sortRowsByKeyUniqueOn mapkey df =
-- I had trouble defining a method whereby one could either
-- build a vector of keys from a `Frame` (without converting to rows),
-- or extract a key from a single `Row`.
--
-- Instead, we extract the index vector, map it to the sort key, sort it
-- while keeping track of the initial integer positions, and finally
-- backpermute the rows using the positions that remain.
--
-- NOTE(review): the two `Data.Vector.indexed` lines below look like the
-- before/after sides of a diff hunk; presumably only the one that applies
-- `mapkey` belongs to this function -- confirm against the real file.
--
-- `ix` pairs every key with its original row position, as (key, position).
let ix = Data.Vector.map swap
$ Data.Vector.indexed (index df)
$ Data.Vector.indexed (Data.Vector.map mapkey $ index df)
-- TODO: is it possible to run `Data.Vector.map snd`
-- within the `ST` context?
--
-- Sort and deduplicate on the key alone (the first component), then
-- discard the keys to keep only the original row positions.
sortedIx = Data.Vector.map snd $ runST $ do
mutVec <- Data.Vector.thaw ix
TimSort.sortUniqBy (compare `on` fst) mutVec >>= Data.Vector.freeze <&> Data.Vector.force
in fromRows $ Data.Vector.backpermute (toRows df) sortedIx -- sortRowsBy (compare `on` index)
{-# INLINABLE sortRowsByKeyUnique #-}
{-# INLINABLE sortRowsByKeyUniqueOn #-}


-- | Look up a row in a data frame by key. The specific key
Expand Down Expand Up @@ -469,7 +484,9 @@ iat :: Frame t
fr `iat` (rowIx, col) = (col fr) Data.Vector.!? rowIx


-- | Merge two dataframes using a merging strategy.
-- | Merge two dataframes using a merging strategy, where the indexes
-- of the dataframes have the same type. See `mergeWithStrategyOn`
-- to merge dataframes whose indexes have different types.
--
-- A merging strategy handles the possibility of rows missing in the
-- left and/or right dataframes. Merge strategies can be user-defined,
Expand Down Expand Up @@ -559,11 +576,25 @@ mergeWithStrategy :: ( Indexable t1, Indexable t2, Frameable t3
-> Frame t1
-> Frame t2
-> Frame t3
mergeWithStrategy strat df1Unsorted df2Unsorted
= let df1 = sortRowsByKeyUnique df1Unsorted
df2 = sortRowsByKeyUnique df2Unsorted
ix1 = index df1
ix2 = index df2
mergeWithStrategy = mergeWithStrategyOn id id


-- | Merge two dataframes using a merging strategy, where the indexes
-- of the dataframes are mapped to some key of type @k@.
--
-- See `mergeWithStrategy` for further notes and examples.
mergeWithStrategyOn :: ( Ord k, Indexable t1, Indexable t2, Frameable t3)
=> (Key t1 -> k) -- ^ How to map the index of the left dataframe onto a key of type @k@
-> (Key t2 -> k) -- ^ How to map the index of the right dataframe onto a key of type @k@
-> MergeStrategy k t1 t2 t3
-> Frame t1
-> Frame t2
-> Frame t3
mergeWithStrategyOn mapk1 mapk2 strat df1Unsorted df2Unsorted
= let df1 = sortRowsByKeyUniqueOn mapk1 df1Unsorted
df2 = sortRowsByKeyUniqueOn mapk2 df2Unsorted
ix1 = Data.Vector.map mapk1 $ index df1
ix2 = Data.Vector.map mapk2 $ index df2
-- Since df1 and df2 are sorted by their mapped keys, and those mapped
-- keys are unique, we can safely use `Set.fromDistinctAscList`.
fullIx = (Set.fromDistinctAscList $ Data.Vector.toList ix1)
Expand Down
26 changes: 25 additions & 1 deletion javelin-frames/test/Test/Data/Frame.hs
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ testMergeWithStrategy
= testGroup "mergeWithStrategy"
[ testMergeWithStrategyUnion
, testMergeWithStrategySelf
, testMergeWithStrategyOn
]
where
testMergeWithStrategyUnion :: TestTree
Expand Down Expand Up @@ -225,7 +226,30 @@ testMergeWithStrategy

Frame.mergeWithStrategy (Frame.matchedStrategy (\_ u _ -> u)) users users === Frame.sortRowsByKeyUnique users


-- | Merge two frames on a derived key: rather than joining on the
-- names themselves, we join the frames on the LENGTH of the names.
testMergeWithStrategyOn :: TestTree
testMergeWithStrategyOn = testCase "mergeWithStrategyOn" $
    assertEqual mempty expected merged
  where
    -- Name lengths on the left: 1, 2, 3 and 4.
    leftUsers = fromRows [ MkUser "A"    39
                         , MkUser "BB"   98
                         , MkUser "CCC"  51
                         , MkUser "DDDD" 37
                         ]

    -- Name lengths on the right: 1, 3, 2 and 9 (not in sorted order).
    rightUsers = fromRows [ MkUser "X"         1
                          , MkUser "XXX"       3
                          , MkUser "XX"        2
                          , MkUser "XXXXXXXXX" 37
                          ]

    -- Only the lengths 1, 2 and 3 occur on both sides; each merged row is
    -- named after the shared length and carries the sum of both ages.
    expected = fromRows [ MkUser "1" (39 + 1)
                        , MkUser "2" (98 + 2)
                        , MkUser "3" (51 + 3)
                        ]

    merged = Frame.mergeWithStrategyOn
                length
                length
                (Frame.matchedStrategy
                    (\nameLength (MkUser _ leftAge) (MkUser _ rightAge)
                        -> MkUser (show nameLength) (leftAge + rightAge)))
                leftUsers
                rightUsers

testDisplay :: TestTree
testDisplay =
Expand Down

0 comments on commit 4a6924f

Please sign in to comment.