Commit 09b85cd

traversc committed Sep 3, 2024
1 parent 728e681 commit 09b85cd
Showing 34 changed files with 5,349 additions and 297 deletions.
84 changes: 41 additions & 43 deletions README.md
qs2

*qs2: a framework for efficient serialization*

`qs2` is the successor to the `qs` package. The goal is to have reliable
and fast performance for saving and loading objects in R.

The `qs2` format directly uses R serialization (via the
`R_Serialize`/`R_Unserialize` C API) while improving underlying
compression and disk I/O patterns. If you are familiar with the `qs`
package, the benefits and usage are the same.

``` r
qs_save(data, "myfile.qs2")
data <- qs_read("myfile.qs2")
```
Use the file extension `qs2` to distinguish it from the original `qs`
package. It is not compatible with the original `qs` format.

## Installation

``` r
install.packages("qs2")
```

On Mac or Linux, you can enable multi-threading by compiling from
source. It is enabled by default on Windows.

``` r
remotes::install_cran("qs2", type = "source", configure.args = " --with-TBB --with-simd=AVX2")
```

Multi-threading in `qs2` uses the `Intel Threading Building Blocks`
framework via the `RcppParallel` package.
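With a multi-threaded build, the thread count is passed per call. A minimal sketch (the `nthreads` argument to `qs_read` appears in the benchmark scripts in this commit; `qs_save` is assumed to accept the same argument):

``` r
library(qs2)

f <- tempfile(fileext = ".qs2")
qs_save(mtcars, f, nthreads = 4)   # nthreads assumed to be accepted by qs_save
d <- qs_read(f, nthreads = 4)      # nthreads as used in the benchmark scripts
stopifnot(identical(d, mtcars))    # round trip preserves the object exactly
unlink(f)
```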

## Converting qs2 to RDS

Because the `qs2` format directly uses R serialization, you can convert
it to RDS and vice versa.

``` r
xrds <- readRDS(file_rds)
stopifnot(identical(x, xrds))
```
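The partially elided example above amounts to a round trip along these lines (a sketch; the `qs_to_rds()` argument names are assumed, not verified against the package docs):

``` r
library(qs2)

x <- runif(1e5)
file_qs2 <- tempfile(fileext = ".qs2")
file_rds <- tempfile(fileext = ".rds")

qs_save(x, file_qs2)
qs_to_rds(input_file = file_qs2, output_file = file_rds)  # assumed argument names
xrds <- readRDS(file_rds)                                 # read back with base R
stopifnot(identical(x, xrds))
```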

## The qdata format

The package also introduces the `qdata` format, which has its own
serialization layout and works only with data types (vectors, lists,
data frames, matrices). It will replace internal types (functions,
promises, external pointers, environments, objects) with NULL. The
`qdata` format differs from the `qs2` format in that it is NOT a
general-purpose serialization format.

The eventual goal of `qdata` is to also have interoperability with other
languages, particularly `Python`.

``` r
qd_save(data, "myfile.qs2")
data <- qd_read("myfile.qs2")
```
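A quick way to see the NULL replacement described above (a sketch; the exact behavior for unsupported types, e.g. whether a warning is emitted, is assumed from the text rather than verified):

``` r
library(qs2)

f <- tempfile(fileext = ".qs2")
obj <- list(a = 1:3, fn = sum)   # `fn` is a function, not a data type
qd_save(obj, f)                  # per the text above, `fn` should come back as NULL
x <- qd_read(f)
stopifnot(identical(x$a, 1:3))   # plain data survives the round trip
unlink(f)
```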

## Benchmarks

A summary across 4 datasets is presented below.

#### Single-threaded

| Algorithm       | nthreads | Save Time (s) | Read Time (s) | Compression |
| --------------- | -------- | ------------- | ------------- | ----------- |
| saveRDS         | 1        | 104.46        | 60.10         | 8.38        |
| base::serialize | 1        | 8.91          | 46.99         | 1.12        |
| qs2             | 1        | 13.02         | 48.69         | 8.10        |
| qdata           | 1        | 10.31         | 45.13         | 8.80        |
| qs-legacy       | 1        | 9.10          | 44.84         | 7.42        |

#### Multi-threaded

| Algorithm | nthreads | Save Time (s) | Read Time (s) | Compression |
| --------- | -------- | ------------- | ------------- | ----------- |
| qs2 | 8 | 3.64 | 43.87 | 8.10 |
| qdata | 8 | 2.12 | 41.72 | 8.80 |
| qs-legacy | 8 | 3.34 | 47.90 | 7.42 |

- `qs2`, `qdata` and `qs-legacy` used `compress_level = 3`
- `base::serialize` was run with `ascii = FALSE` and `xdr = FALSE`

**Datasets**

- `1000 genomes non-coding VCF` 1000 genomes non-coding variants
- `B-cell data` B-cell mouse data (Greiff 2017)
- `IP location` IPV4 range data with location information
- `Netflix movie ratings` Netflix Prize open competition machine
learning dataset

These datasets are openly licensed and represent a combination of
numeric and text data across multiple domains. See `inst/benchmarks` on
GitHub for comprehensive comparisons, reproducibility and data sources.
92 changes: 92 additions & 0 deletions inst/analysis/benchmarks/plot_benchmark_results.R
library(ggplot2)
library(ggrepel)
library(patchwork)
library(data.table)
library(dplyr)
library(this.path)
library(stringplus)
set_string_ops("&", "|")

options(warn=1)

PLATFORM <- "ubuntu"
DATA_PATH <- "~/datasets/processed"
datasets <- DATA_PATH & "/" & c("1000genomes_noncoding_vcf.csv.gz", "B_cell_petshopmouse3.tsv.gz",
"ip_location_2023.csv.gz", "Netflix_Ratings.csv.gz")
# read a dataset, dispatching on file extension
read_dataset <- function(d) {
  if(d %like% "\\.json\\.gz$") {
    DATA <- RcppSimdJson::fload(d)
  } else if(d %like% "\\.csv\\.gz$") {
    DATA <- fread(d, sep = ",", data.table=F)
  } else if(d %like% "\\.tsv\\.gz$") {
    DATA <- fread(d, sep = "\t", data.table=F)
  } else {
    DATA <- readRDS(d)
  }
}

df <- lapply(datasets, function(DATASET) {
obj_size <- read_dataset(DATASET) %>% object.size() %>% as.numeric %>% {. / 1048576} # in-memory size, MB
DATASET_NAME <- basename(DATASET) %>% gsub("\\..+", "", .)
print(DATASET_NAME)
fread("%s/results/%s_serialization_benchmarks_%s.csv" | c(this.dir(), PLATFORM, DATASET_NAME)) %>%
mutate(dataset = DATASET_NAME, obj_size = obj_size)
}) %>% rbindlist

dfs <- df %>%
group_by(algo, nthreads, compress_level, dataset, obj_size) %>%
summarize(file_size = max(file_size), save_time = median(save_time), read_time = median(read_time))

g1 <- ggplot(df %>% filter(algo %like% "^q" | algo == "parquet"),
aes(x = file_size, y = save_time, color = algo, shape = factor(nthreads))) +
geom_point() +
geom_line(data = dfs %>% filter(algo %like% "^q" | algo == "parquet"),
aes(lty = factor(nthreads))) +
scale_x_log10() +
scale_y_log10() +
facet_wrap(~dataset, scales="free") +
scale_color_manual(values = palette.colors(palette = "Okabe-Ito")) +
labs(color = "Format", lty = "Threads", pch = "Threads",
x = "File Size (MB)", y = "Save Time (s)") +
theme_bw(base_size=11) +
guides(color = guide_legend(order = 1),
pch = guide_legend(order = 2),
lty = guide_legend(order = 2))


g2 <- ggplot(df %>% filter(algo %like% "^q" | algo == "parquet"),
aes(x = save_time, y = read_time, color = algo, shape = factor(nthreads))) +
geom_point() +
geom_line(data = dfs %>% filter(algo %like% "^q" | algo == "parquet"),
aes(lty = factor(nthreads))) +
scale_x_log10() +
scale_y_log10() +
facet_wrap(~dataset, scales="free") +
scale_color_manual(values = palette.colors(palette = "Okabe-Ito")) +
labs(color = "Format", lty = "Threads", pch = "Threads",
x = "Save Time (s)", y = "Read Time (s)") +
theme_bw(base_size=11) +
guides(color = guide_legend(order = 1),
pch = guide_legend(order = 2),
lty = guide_legend(order = 2))

ggsave(g1, file = "plots/%s_write_benchmarks.png" | PLATFORM, width = 7, height = 5)
ggsave(g2, file = "plots/%s_read_benchmarks.png" | PLATFORM, width = 7, height = 5)

################################################################################
# Summary table


dfs2 <- dfs %>%
filter( (algo %like% "^q" & compress_level == 3) |
(algo == "rds") |
(algo == "base_serialize") |
(algo == "fst" & compress_level == 50) |
(algo == "parquet" & compress_level == 3))

dfs2 %>%
mutate(compression = obj_size / file_size) %>%
group_by(algo, nthreads) %>%
summarize(compression = mean(compression), save_time = sum(save_time), read_time = sum(read_time)) %>%
arrange(algo, nthreads) %>% as.data.frame

62 changes: 62 additions & 0 deletions inst/analysis/benchmarks/read_timer_function.R
# helper script for copy and reading benchmarks
suppressPackageStartupMessages({
library(dplyr, quietly=TRUE)
library(stringplus, quietly=TRUE)
})

set_string_ops("&", "|")

args <- commandArgs(trailingOnly = TRUE)
algo <- args[1]
nthreads <- args[2] %>% as.numeric
file_path <- args[3] # must be full path not relative
output_temp_file <- args[4] # write results to here

# simple stopwatch: now() records a start time; later() returns elapsed seconds
now <- function() assign(".time", Sys.time(), envir = globalenv())
later <- function() { as.numeric(Sys.time() - get(".time", envir = globalenv()), units = "secs") }

# now()
# x <- file.copy(file_path, copy_path)
# copy_time <- later()
if(algo == "base_serialize") {
now()
con <- file(file_path, "rb")
unserialize(con)
close(con)
read_time <- later()
} else if(algo == "rds") {
now()
x <- readRDS(file = file_path)
read_time <- later()
} else if(algo == "qs-legacy") {
suppressPackageStartupMessages( library(qs, quietly=TRUE) )
now()
x <- qs::qread(file_path, nthreads = nthreads)
read_time <- later()
} else if(algo == "qs2") {
suppressPackageStartupMessages( library(qs2, quietly=TRUE) )
now()
x <- qs2::qs_read(file_path, validate_checksum = FALSE, nthreads = nthreads)
read_time <- later()
} else if(algo == "qdata") {
suppressPackageStartupMessages( library(qs2, quietly=TRUE) )
now()
x <- qs2::qd_read(file_path, validate_checksum = FALSE, nthreads = nthreads)
read_time <- later()
} else if(algo == "fst") {
suppressPackageStartupMessages( library(fst, quietly=TRUE) )
fst::threads_fst(nr_of_threads = nthreads)
now()
x <- fst::read_fst(path = file_path)
read_time <- later()
} else if(algo == "parquet") {
suppressPackageStartupMessages( library(arrow, quietly=TRUE) )
options(arrow.use_altrep = FALSE) # don't use ALTREP/mmap for benchmark
arrow::set_cpu_count(num_threads = nthreads)
now()
x <- arrow::read_parquet(file = file_path, mmap = FALSE)
read_time <- later()
} else {
  stop("unknown algo: ", algo)
}

writeLines(text = c(read_time) %>% as.character, con = output_temp_file)
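The script above is driven entirely by command-line arguments, so a benchmark harness invokes it once per measurement. A typical invocation might look like this (the dataset and output paths are illustrative, not from the repository):

```shell
# args: <algo> <nthreads> <input file, full path> <output temp file>
Rscript read_timer_function.R qs2 4 /full/path/to/dataset.qs2 /tmp/read_time.txt
cat /tmp/read_time.txt   # single line: elapsed read time in seconds
```

Writing the timing to a temp file and exiting keeps each run in a fresh R process, so OS page-cache state aside, runs do not contaminate one another.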
