From a1cea81d7a10e95f1d742716a4525d8725f01676 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Thu, 1 Feb 2024 18:10:53 +0100 Subject: [PATCH 01/49] Add tokio AsyncRead support --- .vscode/settings.json | 12 ++- Cargo.lock | 174 ++++++++++++++++++++++++++++++++++++ Cargo.toml | 5 ++ src/reader/mod.rs | 3 + src/reader/tokio/decoder.rs | 113 +++++++++++++++++++++++ src/reader/tokio/mod.rs | 2 + 6 files changed, 308 insertions(+), 1 deletion(-) create mode 100644 src/reader/tokio/decoder.rs create mode 100644 src/reader/tokio/mod.rs diff --git a/.vscode/settings.json b/.vscode/settings.json index 3248b76..2e949f9 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,13 @@ { - "rust-analyzer.cargo.features": ["default", "lzma", "deflate64", "bzip2", "zstd"] + "rust-analyzer.cargo.features": [ + "default", + "tokio", + "lzma", + "deflate64", + "bzip2", + "zstd" + ], + "rust-analyzer.linkedProjects": [ + "./Cargo.toml" + ] } \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 286830c..79ac253 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. 
version = 3 +[[package]] +name = "addr2line" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +dependencies = [ + "gimli", +] + [[package]] name = "adler" version = "1.0.2" @@ -86,6 +95,21 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "backtrace" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + [[package]] name = "bumpalo" version = "3.14.0" @@ -286,6 +310,101 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "futures" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" + +[[package]] +name = "futures-executor" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] 
+name = "futures-io" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" + +[[package]] +name = "futures-macro" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" + +[[package]] +name = "futures-task" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" + +[[package]] +name = "futures-util" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "gimli" +version = "0.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" + [[package]] name = "hashbrown" version = "0.14.3" @@ -490,6 +609,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" +[[package]] +name = "object" +version = "0.32.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +dependencies = [ + "memchr", +] + [[package]] name = "oem_cp" version = "2.0.0" @@ -564,6 +692,12 @@ version = "0.2.13" source 
= "registry+https://github.com/rust-lang/crates.io-index" checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + [[package]] name = "pkg-config" version = "0.3.29" @@ -649,16 +783,19 @@ dependencies = [ "deflate64", "encoding_rs", "flate2", + "futures", "humansize", "indicatif", "lzma-rs", "num_enum", "oem_cp", "oval", + "pin-project-lite", "positioned-io", "pretty-hex", "test-log", "thiserror", + "tokio", "tracing", "tracing-subscriber", "winnow", @@ -709,6 +846,12 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +[[package]] +name = "rustc-demangle" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" + [[package]] name = "ryu" version = "1.0.16" @@ -761,6 +904,15 @@ version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + [[package]] name = "smallvec" version = "1.13.1" @@ -835,6 +987,28 @@ dependencies = [ "once_cell", ] +[[package]] +name = "tokio" +version = "1.35.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c89b4efa943be685f629b149f53829423f8f5531ea21249408e8e2f8671ec104" +dependencies = [ + "backtrace", + "pin-project-lite", + "tokio-macros", +] + +[[package]] +name = "tokio-macros" +version = "2.2.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "toml_datetime" version = "0.6.5" diff --git a/Cargo.toml b/Cargo.toml index ad753c6..97e3536 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,6 +39,9 @@ lzma-rs = { version = "0.3.0", features = ["stream"], optional = true } deflate64 = { version = "0.1.7", optional = true } bzip2 = { version = "0.4.4", optional = true } zstd = { version = "0.13.0", optional = true } +tokio = { version = "1.35.1", optional = true } +futures = { version = "0.3.30", optional = true } +pin-project-lite = { version = "0.2.13", optional = true } [features] default = ["sync", "file", "deflate"] @@ -49,6 +52,7 @@ deflate64 = ["dep:deflate64"] lzma = ["dep:lzma-rs"] bzip2 = ["dep:bzip2"] zstd = ["dep:zstd"] +tokio = ["dep:tokio", "dep:futures", "dep:pin-project-lite"] [dev-dependencies] clap = { version = "4.4.18", features = ["derive"] } @@ -56,6 +60,7 @@ humansize = "2.1.3" indicatif = "0.17.7" test-log = { version = "0.2.14", default-features = false, features = ["tracing-subscriber", "trace"] } tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } +tokio = { version = "1.35.1", features = ["macros"] } [profile.release] debug = 1 diff --git a/src/reader/mod.rs b/src/reader/mod.rs index c2def34..bb12f98 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -6,3 +6,6 @@ pub use self::archive_reader::{ArchiveReader, ArchiveReaderResult}; #[cfg(feature = "sync")] pub mod sync; + +#[cfg(feature = "tokio")] +pub mod tokio; diff --git a/src/reader/tokio/decoder.rs b/src/reader/tokio/decoder.rs new file mode 100644 index 0000000..fdbbce1 --- /dev/null +++ b/src/reader/tokio/decoder.rs @@ -0,0 +1,113 @@ +use std::{cmp, io, pin::Pin, task}; + +use oval::Buffer; +use tokio::io::{AsyncBufRead, AsyncRead}; + +pub trait AsyncDecoder: AsyncRead +where + R: AsyncRead, +{ 
+ /// Moves the inner reader out of this decoder. + /// self is boxed because decoders are typically used as trait objects. + fn into_inner(self: Box) -> R; + + /// Returns a mutable reference to the inner reader. + fn get_mut(&mut self) -> &mut R; +} + +pub struct StoreAsyncDecoder +where + R: AsyncRead, +{ + inner: R, +} + +impl StoreAsyncDecoder +where + R: AsyncRead, +{ + pub fn new(inner: R) -> Self { + Self { inner } + } +} + +impl AsyncRead for StoreAsyncDecoder +where + R: AsyncRead, +{ + fn poll_read( + self: Pin<&mut Self>, + cx: &mut task::Context<'_>, + buf: &mut tokio::io::ReadBuf<'_>, + ) -> task::Poll> { + // pin-project inner + let inner = unsafe { self.map_unchecked_mut(|s| &mut s.inner) }; + inner.poll_read(cx, buf) + } +} + +impl AsyncDecoder for StoreAsyncDecoder +where + R: AsyncRead, +{ + fn into_inner(self: Box) -> R { + self.inner + } + + fn get_mut(&mut self) -> &mut R { + &mut self.inner + } +} + +// TODO: dedup between tokio & sync + +/// Only allows reading a fixed number of bytes from a [oval::Buffer], +/// allowing to move the inner reader out afterwards. 
+pub struct RawEntryAsyncReader { + remaining: u64, + inner: Buffer, +} + +impl RawEntryAsyncReader { + pub fn new(inner: Buffer, remaining: u64) -> Self { + Self { inner, remaining } + } + + pub fn into_inner(self) -> Buffer { + self.inner + } + + pub fn get_mut(&mut self) -> &mut Buffer { + &mut self.inner + } +} + +impl AsyncBufRead for RawEntryAsyncReader { + fn consume(mut self: Pin<&mut Self>, amt: usize) { + self.as_mut().remaining -= amt as u64; + Buffer::consume(&mut self.inner, amt); + } + + fn poll_fill_buf( + self: Pin<&mut Self>, + _cx: &mut task::Context<'_>, + ) -> task::Poll> { + let max_avail = cmp::min(self.remaining, self.inner.available_data() as u64); + Ok(self.get_mut().inner.data()[..max_avail as _].as_ref()).into() + } +} + +impl AsyncRead for RawEntryAsyncReader { + fn poll_read( + mut self: Pin<&mut Self>, + _cx: &mut task::Context<'_>, + buf: &mut tokio::io::ReadBuf<'_>, + ) -> task::Poll> { + let len = cmp::min(buf.remaining() as u64, self.remaining) as usize; + tracing::trace!(%len, buf_remaining = buf.remaining(), remaining = self.remaining, available_data = self.inner.available_data(), available_space = self.inner.available_space(), "computing len"); + + buf.put_slice(&self.inner.data()[..len]); + self.as_mut().inner.consume(len); + Ok(()).into() + } +} diff --git a/src/reader/tokio/mod.rs b/src/reader/tokio/mod.rs new file mode 100644 index 0000000..953526d --- /dev/null +++ b/src/reader/tokio/mod.rs @@ -0,0 +1,2 @@ +mod decoder; +pub use decoder::*; From c8d852d8d1ca00ce3c9421ee98c5da7c5e8d5cb3 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Thu, 1 Feb 2024 18:12:57 +0100 Subject: [PATCH 02/49] Use pin-project-lite --- src/reader/tokio/decoder.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/reader/tokio/decoder.rs b/src/reader/tokio/decoder.rs index fdbbce1..d53a10d 100644 --- a/src/reader/tokio/decoder.rs +++ b/src/reader/tokio/decoder.rs @@ -15,11 +15,14 @@ where fn get_mut(&mut self) 
-> &mut R; } -pub struct StoreAsyncDecoder -where - R: AsyncRead, -{ - inner: R, +pin_project_lite::pin_project! { + pub struct StoreAsyncDecoder + where + R: AsyncRead, + { + #[pin] + inner: R, + } } impl StoreAsyncDecoder @@ -40,9 +43,8 @@ where cx: &mut task::Context<'_>, buf: &mut tokio::io::ReadBuf<'_>, ) -> task::Poll> { - // pin-project inner - let inner = unsafe { self.map_unchecked_mut(|s| &mut s.inner) }; - inner.poll_read(cx, buf) + let this = self.project(); + this.inner.poll_read(cx, buf) } } From 4754f50674c89ade8b9fd3bd3974826a6101c7a1 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Thu, 1 Feb 2024 18:16:06 +0100 Subject: [PATCH 03/49] Dedup RawEntryReader --- src/reader/mod.rs | 27 +++++++++++++++++ src/reader/sync/decoder.rs | 23 ++------------- src/reader/sync/entry_reader/bzip2_dec.rs | 2 +- src/reader/sync/entry_reader/deflate64_dec.rs | 2 +- src/reader/sync/entry_reader/deflate_dec.rs | 2 +- src/reader/sync/entry_reader/lzma_dec.rs | 2 +- src/reader/sync/entry_reader/mod.rs | 5 +++- src/reader/sync/entry_reader/zstd_dec.rs | 2 +- src/reader/tokio/decoder.rs | 29 +++---------------- 9 files changed, 42 insertions(+), 52 deletions(-) diff --git a/src/reader/mod.rs b/src/reader/mod.rs index bb12f98..4bc5299 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -2,6 +2,8 @@ mod buffer; mod macros; mod archive_reader; +use oval::Buffer; + pub use self::archive_reader::{ArchiveReader, ArchiveReaderResult}; #[cfg(feature = "sync")] @@ -9,3 +11,28 @@ pub mod sync; #[cfg(feature = "tokio")] pub mod tokio; + +/// Only allows reading a fixed number of bytes from a [oval::Buffer], +/// used for reading the raw (compressed) data for a single zip file entry. +/// It also allows moving out the inner buffer afterwards. 
+pub(crate) struct RawEntryReader { + remaining: u64, + inner: Buffer, +} + +impl RawEntryReader { + pub(crate) fn new(inner: Buffer, entry_size: u64) -> Self { + Self { + inner, + remaining: entry_size, + } + } + + pub(crate) fn into_inner(self) -> Buffer { + self.inner + } + + pub(crate) fn get_mut(&mut self) -> &mut Buffer { + &mut self.inner + } +} diff --git a/src/reader/sync/decoder.rs b/src/reader/sync/decoder.rs index fca6e56..d36d140 100644 --- a/src/reader/sync/decoder.rs +++ b/src/reader/sync/decoder.rs @@ -2,6 +2,8 @@ use std::{cmp, io}; use oval::Buffer; +use crate::reader::RawEntryReader; + pub trait Decoder: io::Read where R: io::Read, @@ -52,27 +54,6 @@ where } } -/// Only allows reading a fixed number of bytes from a [oval::Buffer], -/// allowing to move the inner reader out afterwards. -pub struct RawEntryReader { - remaining: u64, - inner: Buffer, -} - -impl RawEntryReader { - pub fn new(inner: Buffer, remaining: u64) -> Self { - Self { inner, remaining } - } - - pub fn into_inner(self) -> Buffer { - self.inner - } - - pub fn get_mut(&mut self) -> &mut Buffer { - &mut self.inner - } -} - impl io::BufRead for RawEntryReader { fn fill_buf(&mut self) -> io::Result<&[u8]> { let max_avail = cmp::min(self.remaining, self.inner.available_data() as u64); diff --git a/src/reader/sync/entry_reader/bzip2_dec.rs b/src/reader/sync/entry_reader/bzip2_dec.rs index 238427f..b3ffb70 100644 --- a/src/reader/sync/entry_reader/bzip2_dec.rs +++ b/src/reader/sync/entry_reader/bzip2_dec.rs @@ -2,7 +2,7 @@ use std::io::Read; use bzip2::read::BzDecoder; -use crate::reader::sync::{Decoder, RawEntryReader}; +use crate::reader::{sync::Decoder, RawEntryReader}; impl Decoder for BzDecoder where diff --git a/src/reader/sync/entry_reader/deflate64_dec.rs b/src/reader/sync/entry_reader/deflate64_dec.rs index cb1fdd0..ccd339b 100644 --- a/src/reader/sync/entry_reader/deflate64_dec.rs +++ b/src/reader/sync/entry_reader/deflate64_dec.rs @@ -2,7 +2,7 @@ use std::io::{BufReader, 
Read}; use deflate64::Deflate64Decoder; -use crate::reader::sync::{Decoder, RawEntryReader}; +use crate::reader::{sync::Decoder, RawEntryReader}; impl Decoder for Deflate64Decoder> where diff --git a/src/reader/sync/entry_reader/deflate_dec.rs b/src/reader/sync/entry_reader/deflate_dec.rs index dfa6495..3e07d6a 100644 --- a/src/reader/sync/entry_reader/deflate_dec.rs +++ b/src/reader/sync/entry_reader/deflate_dec.rs @@ -2,7 +2,7 @@ use std::io::Read; use flate2::read::DeflateDecoder; -use crate::reader::sync::{Decoder, RawEntryReader}; +use crate::reader::{sync::Decoder, RawEntryReader}; impl Decoder for DeflateDecoder where diff --git a/src/reader/sync/entry_reader/lzma_dec.rs b/src/reader/sync/entry_reader/lzma_dec.rs index 1898ebf..c1b7d4e 100644 --- a/src/reader/sync/entry_reader/lzma_dec.rs +++ b/src/reader/sync/entry_reader/lzma_dec.rs @@ -2,7 +2,7 @@ use lzma_rs::decompress::Stream; use std::io::{Read, Write}; use crate::{ - reader::sync::{Decoder, RawEntryReader}, + reader::{sync::Decoder, RawEntryReader}, Error, UnsupportedError, }; diff --git a/src/reader/sync/entry_reader/mod.rs b/src/reader/sync/entry_reader/mod.rs index a46e118..bbac88b 100644 --- a/src/reader/sync/entry_reader/mod.rs +++ b/src/reader/sync/entry_reader/mod.rs @@ -3,7 +3,10 @@ use crate::{ error::*, format::*, - reader::sync::decoder::{Decoder, RawEntryReader, StoreDecoder}, + reader::{ + sync::decoder::{Decoder, StoreDecoder}, + RawEntryReader, + }, transition, }; diff --git a/src/reader/sync/entry_reader/zstd_dec.rs b/src/reader/sync/entry_reader/zstd_dec.rs index 014f672..6f42b7e 100644 --- a/src/reader/sync/entry_reader/zstd_dec.rs +++ b/src/reader/sync/entry_reader/zstd_dec.rs @@ -2,7 +2,7 @@ use std::io::{BufRead, Read}; use zstd::stream::Decoder as ZstdDecoder; -use crate::reader::sync::{Decoder, RawEntryReader}; +use crate::reader::{sync::Decoder, RawEntryReader}; impl Decoder for ZstdDecoder<'static, R> where diff --git a/src/reader/tokio/decoder.rs 
b/src/reader/tokio/decoder.rs index d53a10d..56c1bc3 100644 --- a/src/reader/tokio/decoder.rs +++ b/src/reader/tokio/decoder.rs @@ -3,6 +3,8 @@ use std::{cmp, io, pin::Pin, task}; use oval::Buffer; use tokio::io::{AsyncBufRead, AsyncRead}; +use crate::reader::RawEntryReader; + pub trait AsyncDecoder: AsyncRead where R: AsyncRead, @@ -61,30 +63,7 @@ where } } -// TODO: dedup between tokio & sync - -/// Only allows reading a fixed number of bytes from a [oval::Buffer], -/// allowing to move the inner reader out afterwards. -pub struct RawEntryAsyncReader { - remaining: u64, - inner: Buffer, -} - -impl RawEntryAsyncReader { - pub fn new(inner: Buffer, remaining: u64) -> Self { - Self { inner, remaining } - } - - pub fn into_inner(self) -> Buffer { - self.inner - } - - pub fn get_mut(&mut self) -> &mut Buffer { - &mut self.inner - } -} - -impl AsyncBufRead for RawEntryAsyncReader { +impl AsyncBufRead for RawEntryReader { fn consume(mut self: Pin<&mut Self>, amt: usize) { self.as_mut().remaining -= amt as u64; Buffer::consume(&mut self.inner, amt); @@ -99,7 +78,7 @@ impl AsyncBufRead for RawEntryAsyncReader { } } -impl AsyncRead for RawEntryAsyncReader { +impl AsyncRead for RawEntryReader { fn poll_read( mut self: Pin<&mut Self>, _cx: &mut task::Context<'_>, From 6574b988f07da6ed87ab6de6142ef71060a5c77a Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Thu, 1 Feb 2024 18:39:07 +0100 Subject: [PATCH 04/49] Get started on async entry reader --- src/reader/macros.rs | 2 +- src/reader/sync/decoder.rs | 6 +- src/reader/sync/entry_reader/mod.rs | 28 +-- src/reader/sync/mod.rs | 8 +- src/reader/tokio/decoder.rs | 6 +- src/reader/tokio/entry_reader/mod.rs | 288 +++++++++++++++++++++++++++ src/reader/tokio/mod.rs | 3 + 7 files changed, 312 insertions(+), 29 deletions(-) create mode 100644 src/reader/tokio/entry_reader/mod.rs diff --git a/src/reader/macros.rs b/src/reader/macros.rs index 44f6394..da62b12 100644 --- a/src/reader/macros.rs +++ b/src/reader/macros.rs @@ -1,7 +1,7 @@ 
#[macro_export] macro_rules! transition { ($state: expr => ($pattern: pat) $body: expr) => { - $state = if let $pattern = std::mem::replace(&mut $state, S::Transitioning) { + $state = if let $pattern = std::mem::replace(&mut $state, State::Transitioning) { $body } else { unreachable!() diff --git a/src/reader/sync/decoder.rs b/src/reader/sync/decoder.rs index d36d140..7948de8 100644 --- a/src/reader/sync/decoder.rs +++ b/src/reader/sync/decoder.rs @@ -4,7 +4,7 @@ use oval::Buffer; use crate::reader::RawEntryReader; -pub trait Decoder: io::Read +pub(crate) trait Decoder: io::Read where R: io::Read, { @@ -16,7 +16,7 @@ where fn get_mut(&mut self) -> &mut R; } -pub struct StoreDecoder +pub(crate) struct StoreDecoder where R: io::Read, { @@ -27,7 +27,7 @@ impl StoreDecoder where R: io::Read, { - pub fn new(inner: R) -> Self { + pub(crate) fn new(inner: R) -> Self { Self { inner } } } diff --git a/src/reader/sync/entry_reader/mod.rs b/src/reader/sync/entry_reader/mod.rs index bbac88b..382b3ff 100644 --- a/src/reader/sync/entry_reader/mod.rs +++ b/src/reader/sync/entry_reader/mod.rs @@ -83,8 +83,11 @@ where use State as S; match self.state { S::ReadLocalHeader { ref mut buffer } => { - // FIXME: if this returns less than the size of LocalFileHeader, we'll error out let read_bytes = self.rd.read(buffer.space())?; + if read_bytes == 0 { + // we should have read the local header by now + return Err(io::ErrorKind::UnexpectedEof.into()); + } buffer.fill(read_bytes); let mut input = Partial::new(buffer.data()); @@ -94,10 +97,8 @@ where trace!("local file header: {:#?}", header); transition!(self.state => (S::ReadLocalHeader { buffer }) { - // allow unnecessary mut for some feature combinations - #[allow(unused_mut)] let mut limited_reader = RawEntryReader::new(buffer, self.inner.compressed_size); - let decoder: Box> = self.get_decoder(limited_reader)?; + let decoder = self.get_decoder(limited_reader)?; S::ReadData { hasher: crc32fast::Hasher::new(), @@ -135,8 +136,8 @@ where } 
} } - match decoder.read(buf) { - Ok(0) => { + match decoder.read(buf)? { + 0 => { transition!(self.state => (S::ReadData { decoder, header, hasher, uncompressed_size, .. }) { let limited_reader = decoder.into_inner(); let buffer = limited_reader.into_inner(); @@ -154,22 +155,11 @@ where }); self.read(buf) } - Ok(n) => { + n => { *uncompressed_size += n as u64; hasher.update(&buf[..n]); Ok(n) } - Err(e) => match e.kind() { - io::ErrorKind::UnexpectedEof => { - let buffer = decoder.get_mut().get_mut(); - if self.eof || buffer.available_space() == 0 { - Err(e) - } else { - self.read(buf) - } - } - _ => Err(e), - }, } } S::ReadDataDescriptor { ref mut buffer, .. } => { @@ -271,7 +261,7 @@ where fn get_decoder( &self, - #[allow(unused_mut)] mut raw_r: RawEntryReader, + mut raw_r: RawEntryReader, ) -> Result>, Error> { let decoder: Box> = match self.method { Method::Store => Box::new(StoreDecoder::new(raw_r)), diff --git a/src/reader/sync/mod.rs b/src/reader/sync/mod.rs index a0708db..276a2e5 100644 --- a/src/reader/sync/mod.rs +++ b/src/reader/sync/mod.rs @@ -1,8 +1,10 @@ mod decoder; -pub use decoder::*; +use decoder::*; mod entry_reader; -pub use entry_reader::*; +use entry_reader::*; mod read_zip; -pub use read_zip::*; + +// re-exports +pub use read_zip::{ReadZip, ReadZipWithSize}; diff --git a/src/reader/tokio/decoder.rs b/src/reader/tokio/decoder.rs index 56c1bc3..d29a171 100644 --- a/src/reader/tokio/decoder.rs +++ b/src/reader/tokio/decoder.rs @@ -5,7 +5,7 @@ use tokio::io::{AsyncBufRead, AsyncRead}; use crate::reader::RawEntryReader; -pub trait AsyncDecoder: AsyncRead +pub(crate) trait AsyncDecoder: AsyncRead where R: AsyncRead, { @@ -18,7 +18,7 @@ where } pin_project_lite::pin_project! 
{ - pub struct StoreAsyncDecoder + pub(crate) struct StoreAsyncDecoder where R: AsyncRead, { @@ -31,7 +31,7 @@ impl StoreAsyncDecoder where R: AsyncRead, { - pub fn new(inner: R) -> Self { + pub(crate) fn new(inner: R) -> Self { Self { inner } } } diff --git a/src/reader/tokio/entry_reader/mod.rs b/src/reader/tokio/entry_reader/mod.rs new file mode 100644 index 0000000..498eb8e --- /dev/null +++ b/src/reader/tokio/entry_reader/mod.rs @@ -0,0 +1,288 @@ +use crate::{ + error::*, + format::*, + reader::{ + tokio::decoder::{AsyncDecoder, StoreAsyncDecoder}, + RawEntryReader, + }, + transition, +}; + +use cfg_if::cfg_if; +use oval::Buffer; +use std::{io, pin::Pin, task}; +use tokio::io::AsyncRead; +use tracing::trace; +use winnow::{ + error::ErrMode, + stream::{AsBytes, Offset}, + Parser, Partial, +}; + +struct EntryReadMetrics { + uncompressed_size: u64, + crc32: u32, +} + +pin_project_lite::pin_project! { + #[project = StateProj] + enum State { + ReadLocalHeader { + buffer: Buffer, + }, + ReadData { + hasher: crc32fast::Hasher, + uncompressed_size: u64, + header: LocalFileHeaderRecord, + #[pin] + decoder: Box + Unpin>, + }, + ReadDataDescriptor { + metrics: EntryReadMetrics, + header: LocalFileHeaderRecord, + buffer: Buffer, + }, + Validate { + metrics: EntryReadMetrics, + header: LocalFileHeaderRecord, + descriptor: Option, + }, + Done, + Transitioning, + } +} + +pin_project_lite::pin_project! 
{ + pub struct EntryReader + where + R: AsyncRead, + { + #[pin] + rd: R, + eof: bool, + #[pin] + state: State, + inner: StoredEntryInner, + method: Method, + } +} + +impl AsyncRead for EntryReader +where + R: AsyncRead, +{ + fn poll_read( + self: Pin<&mut Self>, + cx: &mut task::Context<'_>, + buf: &mut tokio::io::ReadBuf<'_>, + ) -> task::Poll> { + let this = self.project(); + + use StateProj as S; + match this.state.project() { + S::ReadLocalHeader { ref mut buffer } => { + let mut read_buf = tokio::io::ReadBuf::new(buffer.space()); + futures::ready!(this.rd.poll_read(cx, &mut read_buf))?; + let read_bytes = read_buf.filled().len(); + if read_bytes == 0 { + return Err(io::ErrorKind::UnexpectedEof.into()).into(); + } + buffer.fill(read_bytes); + + let mut input = Partial::new(buffer.data()); + match LocalFileHeaderRecord::parser.parse_next(&mut input) { + Ok(header) => { + buffer.consume(input.as_bytes().offset_from(&buffer.data())); + + trace!("local file header: {:#?}", header); + transition!(self.state => (State::ReadLocalHeader { buffer }) { + let mut limited_reader = RawEntryReader::new(buffer, self.inner.compressed_size); + let decoder = self.get_decoder(limited_reader)?; + + State::ReadData { + hasher: crc32fast::Hasher::new(), + uncompressed_size: 0, + decoder, + header, + } + }); + self.poll_read(cx, buf) + } + Err(ErrMode::Incomplete(_)) => { + // try another read - if it returns pending, it'll be propagated + self.poll_read(cx, buf) + } + Err(_e) => Err(Error::Format(FormatError::InvalidLocalHeader).into()).into(), + } + } + S::ReadData { + ref mut uncompressed_size, + ref mut decoder, + ref mut hasher, + .. 
+ } => { + { + let buffer = decoder.get_mut().get_mut().get_mut(); + if !*this.eof && buffer.available_data() == 0 { + if buffer.available_space() == 0 { + buffer.shift(); + } + + let mut read_buf = tokio::io::ReadBuf::new(buffer.space()); + futures::ready!(this.rd.poll_read(cx, &mut read_buf))?; + match read_buf.filled().len() { + 0 => { + *this.eof = true; + } + n => { + buffer.fill(n); + } + } + } + } + + let filled_before = buf.filled().len(); + futures::ready!(decoder.poll_read(cx, buf))?; + let filled_after = buf.filled().len(); + let read_bytes = filled_after - filled_before; + + match read_bytes { + 0 => { + transition!(self.state => (State::ReadData { decoder, header, hasher, uncompressed_size, .. }) { + let limited_reader = decoder.into_inner(); + let buffer = limited_reader.into_inner(); + let metrics = EntryReadMetrics { + crc32: hasher.finalize(), + uncompressed_size, + }; + if header.has_data_descriptor() { + trace!("will read data descriptor (flags = {:x})", header.flags); + State::ReadDataDescriptor { metrics, buffer, header } + } else { + trace!("no data descriptor to read"); + State::Validate { metrics, header, descriptor: None } + } + }); + self.poll_read(cx, buf) + } + n => { + **uncompressed_size = **uncompressed_size + n as u64; + let read_slice = &buf.filled()[filled_before..filled_after]; + hasher.update(&buf.filled()[..n]); + Ok(()).into() + } + } + } + S::ReadDataDescriptor { ref mut buffer, .. } => { + trace!( + "read data descriptor, avail data = {}, avail space = {}", + buffer.available_data(), + buffer.available_space() + ); + + let mut input = Partial::new(buffer.data()); + match DataDescriptorRecord::mk_parser(self.inner.is_zip64).parse_next(&mut input) { + Ok(descriptor) => { + buffer.consume(input.as_bytes().offset_from(&buffer.data())); + trace!("data descriptor = {:#?}", descriptor); + transition!(self.state => (State::ReadDataDescriptor { metrics, header, .. 
}) { + State::Validate { metrics, header, descriptor: Some(descriptor) } + }); + self.poll_read(cx, buf) + } + Err(ErrMode::Incomplete(_)) => { + let mut read_buf = tokio::io::ReadBuf::new(buffer.space()); + futures::ready!(this.rd.poll_read(cx, &mut read_buf))?; + let read_bytes = read_buf.filled().len(); + if read_bytes == 0 { + return Err(io::ErrorKind::UnexpectedEof.into()).into(); + } + buffer.fill(read_bytes); + self.poll_read(cx, buf) + } + Err(_e) => Err(Error::Format(FormatError::InvalidLocalHeader).into()).into(), + } + } + S::Validate { + ref metrics, + ref header, + ref descriptor, + } => { + let expected_crc32 = if self.inner.crc32 != 0 { + self.inner.crc32 + } else if let Some(descriptor) = descriptor.as_ref() { + descriptor.crc32 + } else { + header.crc32 + }; + + let expected_size = if self.inner.uncompressed_size != 0 { + self.inner.uncompressed_size + } else if let Some(descriptor) = descriptor.as_ref() { + descriptor.uncompressed_size + } else { + header.uncompressed_size as u64 + }; + + if expected_size != metrics.uncompressed_size { + return Err(Error::Format(FormatError::WrongSize { + expected: expected_size, + actual: metrics.uncompressed_size, + }) + .into()) + .into(); + } + + if expected_crc32 != 0 && expected_crc32 != metrics.crc32 { + return Err(Error::Format(FormatError::WrongChecksum { + expected: expected_crc32, + actual: metrics.crc32, + }) + .into()) + .into(); + } + + self.state = State::Done; + self.poll_read(cx, buf) + } + S::Done => Ok(()).into(), + S::Transitioning => unreachable!(), + } + } +} + +impl EntryReader +where + R: AsyncRead, +{ + const DEFAULT_BUFFER_SIZE: usize = 256 * 1024; + + pub fn new(entry: &StoredEntry, get_reader: F) -> Self + where + F: Fn(u64) -> R, + { + Self { + rd: get_reader(entry.header_offset), + eof: false, + state: State::ReadLocalHeader { + buffer: Buffer::with_capacity(Self::DEFAULT_BUFFER_SIZE), + }, + method: entry.method(), + inner: entry.inner, + } + } + + fn get_decoder( + &self, + mut 
raw_r: RawEntryReader, + ) -> Result + Unpin>, Error> { + let decoder: Box + Unpin> = match self.method { + Method::Store => Box::new(StoreAsyncDecoder::new(raw_r)), + method => { + return Err(Error::method_not_supported(method)); + } + }; + + Ok(decoder) + } +} diff --git a/src/reader/tokio/mod.rs b/src/reader/tokio/mod.rs index 953526d..4fd27db 100644 --- a/src/reader/tokio/mod.rs +++ b/src/reader/tokio/mod.rs @@ -1,2 +1,5 @@ mod decoder; pub use decoder::*; + +mod entry_reader; +pub use entry_reader::*; From 2d3d7aad0d818049bfc628cebbaf4e82d5fe96dc Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Thu, 1 Feb 2024 18:46:07 +0100 Subject: [PATCH 05/49] Now for the fun part --- src/reader/archive_reader.rs | 6 ++++++ src/reader/macros.rs | 14 ++++++++++++- src/reader/sync/entry_reader/mod.rs | 9 ++++++-- src/reader/tokio/entry_reader/mod.rs | 31 ++++++++++++++++------------ 4 files changed, 44 insertions(+), 16 deletions(-) diff --git a/src/reader/archive_reader.rs b/src/reader/archive_reader.rs index 0108142..bbd1026 100644 --- a/src/reader/archive_reader.rs +++ b/src/reader/archive_reader.rs @@ -57,6 +57,12 @@ enum ArchiveReaderState { Done, } +impl Default for ArchiveReaderState { + fn default() -> Self { + Self::Transitioning + } +} + impl ArchiveReaderState { fn buffer_as_mut(&mut self) -> Option<&mut Buffer> { use ArchiveReaderState as S; diff --git a/src/reader/macros.rs b/src/reader/macros.rs index da62b12..2e7997d 100644 --- a/src/reader/macros.rs +++ b/src/reader/macros.rs @@ -1,7 +1,19 @@ #[macro_export] macro_rules! transition { ($state: expr => ($pattern: pat) $body: expr) => { - $state = if let $pattern = std::mem::replace(&mut $state, State::Transitioning) { + $state = if let $pattern = std::mem::replace(&mut $state, Default::default()) { + $body + } else { + unreachable!() + }; + }; +} + +#[macro_export] +macro_rules! 
transition_async { + ($state: expr => ($pattern: pat) $body: expr) => { + *$state.as_mut() = if let $pattern = std::mem::replace($state.get_mut(), Default::default()) + { $body } else { unreachable!() diff --git a/src/reader/sync/entry_reader/mod.rs b/src/reader/sync/entry_reader/mod.rs index 382b3ff..3e05d3c 100644 --- a/src/reader/sync/entry_reader/mod.rs +++ b/src/reader/sync/entry_reader/mod.rs @@ -64,6 +64,12 @@ enum State { Transitioning, } +impl Default for State { + fn default() -> Self { + State::Transitioning + } +} + pub struct EntryReader where R: io::Read, @@ -97,8 +103,7 @@ where trace!("local file header: {:#?}", header); transition!(self.state => (S::ReadLocalHeader { buffer }) { - let mut limited_reader = RawEntryReader::new(buffer, self.inner.compressed_size); - let decoder = self.get_decoder(limited_reader)?; + let decoder = self.get_decoder(RawEntryReader::new(buffer, self.inner.compressed_size))?; S::ReadData { hasher: crc32fast::Hasher::new(), diff --git a/src/reader/tokio/entry_reader/mod.rs b/src/reader/tokio/entry_reader/mod.rs index 498eb8e..69473f6 100644 --- a/src/reader/tokio/entry_reader/mod.rs +++ b/src/reader/tokio/entry_reader/mod.rs @@ -5,7 +5,7 @@ use crate::{ tokio::decoder::{AsyncDecoder, StoreAsyncDecoder}, RawEntryReader, }, - transition, + transition_async, }; use cfg_if::cfg_if; @@ -52,8 +52,14 @@ pin_project_lite::pin_project! { } } +impl Default for State { + fn default() -> Self { + State::Transitioning + } +} + pin_project_lite::pin_project! { - pub struct EntryReader + pub struct AsyncEntryReader where R: AsyncRead, { @@ -67,7 +73,7 @@ pin_project_lite::pin_project! 
{ } } -impl AsyncRead for EntryReader +impl AsyncRead for AsyncEntryReader where R: AsyncRead, { @@ -76,10 +82,10 @@ where cx: &mut task::Context<'_>, buf: &mut tokio::io::ReadBuf<'_>, ) -> task::Poll> { - let this = self.project(); + let mut this = self.project(); use StateProj as S; - match this.state.project() { + match this.state.as_mut().project() { S::ReadLocalHeader { ref mut buffer } => { let mut read_buf = tokio::io::ReadBuf::new(buffer.space()); futures::ready!(this.rd.poll_read(cx, &mut read_buf))?; @@ -95,9 +101,8 @@ where buffer.consume(input.as_bytes().offset_from(&buffer.data())); trace!("local file header: {:#?}", header); - transition!(self.state => (State::ReadLocalHeader { buffer }) { - let mut limited_reader = RawEntryReader::new(buffer, self.inner.compressed_size); - let decoder = self.get_decoder(limited_reader)?; + transition_async!(this.state => (State::ReadLocalHeader { buffer }) { + let decoder = self.get_decoder(RawEntryReader::new(buffer, self.inner.compressed_size))?; State::ReadData { hasher: crc32fast::Hasher::new(), @@ -148,7 +153,7 @@ where match read_bytes { 0 => { - transition!(self.state => (State::ReadData { decoder, header, hasher, uncompressed_size, .. }) { + transition_async!(this.state => (State::ReadData { decoder, header, hasher, uncompressed_size, .. }) { let limited_reader = decoder.into_inner(); let buffer = limited_reader.into_inner(); let metrics = EntryReadMetrics { @@ -168,7 +173,7 @@ where n => { **uncompressed_size = **uncompressed_size + n as u64; let read_slice = &buf.filled()[filled_before..filled_after]; - hasher.update(&buf.filled()[..n]); + hasher.update(read_slice); Ok(()).into() } } @@ -185,7 +190,7 @@ where Ok(descriptor) => { buffer.consume(input.as_bytes().offset_from(&buffer.data())); trace!("data descriptor = {:#?}", descriptor); - transition!(self.state => (State::ReadDataDescriptor { metrics, header, .. }) { + transition_async!(this.state => (State::ReadDataDescriptor { metrics, header, .. 
}) { State::Validate { metrics, header, descriptor: Some(descriptor) } }); self.poll_read(cx, buf) @@ -251,7 +256,7 @@ where } } -impl EntryReader +impl AsyncEntryReader where R: AsyncRead, { @@ -274,7 +279,7 @@ where fn get_decoder( &self, - mut raw_r: RawEntryReader, + raw_r: RawEntryReader, ) -> Result + Unpin>, Error> { let decoder: Box + Unpin> = match self.method { Method::Store => Box::new(StoreAsyncDecoder::new(raw_r)), From b2fde1cb317ed8f157336cd8d6527c03cae0b401 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Thu, 1 Feb 2024 18:50:19 +0100 Subject: [PATCH 06/49] Eyy it typechecks? --- src/reader/macros.rs | 12 +++---- src/reader/tokio/entry_reader/mod.rs | 47 ++++++++++++++-------------- 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/src/reader/macros.rs b/src/reader/macros.rs index 2e7997d..dfdf754 100644 --- a/src/reader/macros.rs +++ b/src/reader/macros.rs @@ -12,11 +12,11 @@ macro_rules! transition { #[macro_export] macro_rules! transition_async { ($state: expr => ($pattern: pat) $body: expr) => { - *$state.as_mut() = if let $pattern = std::mem::replace($state.get_mut(), Default::default()) - { - $body - } else { - unreachable!() - }; + *$state.as_mut() = + if let $pattern = std::mem::replace($state.as_mut().get_mut(), Default::default()) { + $body + } else { + unreachable!() + }; }; } diff --git a/src/reader/tokio/entry_reader/mod.rs b/src/reader/tokio/entry_reader/mod.rs index 69473f6..fb216e1 100644 --- a/src/reader/tokio/entry_reader/mod.rs +++ b/src/reader/tokio/entry_reader/mod.rs @@ -8,7 +8,6 @@ use crate::{ transition_async, }; -use cfg_if::cfg_if; use oval::Buffer; use std::{io, pin::Pin, task}; use tokio::io::AsyncRead; @@ -78,11 +77,11 @@ where R: AsyncRead, { fn poll_read( - self: Pin<&mut Self>, + mut self: Pin<&mut Self>, cx: &mut task::Context<'_>, buf: &mut tokio::io::ReadBuf<'_>, ) -> task::Poll> { - let mut this = self.project(); + let mut this = self.as_mut().project(); use StateProj as S; match 
this.state.as_mut().project() { @@ -102,7 +101,7 @@ where trace!("local file header: {:#?}", header); transition_async!(this.state => (State::ReadLocalHeader { buffer }) { - let decoder = self.get_decoder(RawEntryReader::new(buffer, self.inner.compressed_size))?; + let decoder = method_to_decoder(*this.method, RawEntryReader::new(buffer, this.inner.compressed_size))?; State::ReadData { hasher: crc32fast::Hasher::new(), @@ -127,7 +126,7 @@ where .. } => { { - let buffer = decoder.get_mut().get_mut().get_mut(); + let buffer = decoder.as_mut().get_mut().get_mut().get_mut(); if !*this.eof && buffer.available_data() == 0 { if buffer.available_space() == 0 { buffer.shift(); @@ -147,7 +146,7 @@ where } let filled_before = buf.filled().len(); - futures::ready!(decoder.poll_read(cx, buf))?; + futures::ready!(decoder.as_mut().poll_read(cx, buf))?; let filled_after = buf.filled().len(); let read_bytes = filled_after - filled_before; @@ -186,7 +185,7 @@ where ); let mut input = Partial::new(buffer.data()); - match DataDescriptorRecord::mk_parser(self.inner.is_zip64).parse_next(&mut input) { + match DataDescriptorRecord::mk_parser(this.inner.is_zip64).parse_next(&mut input) { Ok(descriptor) => { buffer.consume(input.as_bytes().offset_from(&buffer.data())); trace!("data descriptor = {:#?}", descriptor); @@ -213,16 +212,16 @@ where ref header, ref descriptor, } => { - let expected_crc32 = if self.inner.crc32 != 0 { - self.inner.crc32 + let expected_crc32 = if this.inner.crc32 != 0 { + this.inner.crc32 } else if let Some(descriptor) = descriptor.as_ref() { descriptor.crc32 } else { header.crc32 }; - let expected_size = if self.inner.uncompressed_size != 0 { - self.inner.uncompressed_size + let expected_size = if this.inner.uncompressed_size != 0 { + this.inner.uncompressed_size } else if let Some(descriptor) = descriptor.as_ref() { descriptor.uncompressed_size } else { @@ -247,7 +246,7 @@ where .into(); } - self.state = State::Done; + *this.state.as_mut().get_mut() = State::Done; 
self.poll_read(cx, buf) } S::Done => Ok(()).into(), @@ -276,18 +275,18 @@ where inner: entry.inner, } } +} - fn get_decoder( - &self, - raw_r: RawEntryReader, - ) -> Result + Unpin>, Error> { - let decoder: Box + Unpin> = match self.method { - Method::Store => Box::new(StoreAsyncDecoder::new(raw_r)), - method => { - return Err(Error::method_not_supported(method)); - } - }; +fn method_to_decoder( + method: Method, + raw_r: RawEntryReader, +) -> Result + Unpin>, Error> { + let decoder: Box + Unpin> = match method { + Method::Store => Box::new(StoreAsyncDecoder::new(raw_r)), + method => { + return Err(Error::method_not_supported(method)); + } + }; - Ok(decoder) - } + Ok(decoder) } From 20de58acb8a72bc18bfd5e2af6d70e36a729685e Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Thu, 1 Feb 2024 18:54:03 +0100 Subject: [PATCH 07/49] Smol fixes --- src/reader/macros.rs | 13 ++++++------- src/reader/sync/entry_reader/bzip2_dec.rs | 2 +- src/reader/sync/entry_reader/deflate64_dec.rs | 2 +- src/reader/sync/entry_reader/deflate_dec.rs | 2 +- src/reader/sync/entry_reader/lzma_dec.rs | 2 +- src/reader/sync/entry_reader/mod.rs | 10 +++------- src/reader/sync/entry_reader/zstd_dec.rs | 2 +- src/reader/sync/mod.rs | 5 +---- src/reader/tokio/entry_reader/mod.rs | 10 +++------- src/reader/tokio/mod.rs | 3 --- 10 files changed, 18 insertions(+), 33 deletions(-) diff --git a/src/reader/macros.rs b/src/reader/macros.rs index dfdf754..70a39df 100644 --- a/src/reader/macros.rs +++ b/src/reader/macros.rs @@ -1,7 +1,7 @@ #[macro_export] macro_rules! transition { ($state: expr => ($pattern: pat) $body: expr) => { - $state = if let $pattern = std::mem::replace(&mut $state, Default::default()) { + $state = if let $pattern = std::mem::take(&mut $state) { $body } else { unreachable!() @@ -12,11 +12,10 @@ macro_rules! transition { #[macro_export] macro_rules! 
transition_async { ($state: expr => ($pattern: pat) $body: expr) => { - *$state.as_mut() = - if let $pattern = std::mem::replace($state.as_mut().get_mut(), Default::default()) { - $body - } else { - unreachable!() - }; + *$state.as_mut() = if let $pattern = std::mem::take($state.as_mut().get_mut()) { + $body + } else { + unreachable!() + }; }; } diff --git a/src/reader/sync/entry_reader/bzip2_dec.rs b/src/reader/sync/entry_reader/bzip2_dec.rs index b3ffb70..d663476 100644 --- a/src/reader/sync/entry_reader/bzip2_dec.rs +++ b/src/reader/sync/entry_reader/bzip2_dec.rs @@ -2,7 +2,7 @@ use std::io::Read; use bzip2::read::BzDecoder; -use crate::reader::{sync::Decoder, RawEntryReader}; +use crate::reader::{sync::decoder::Decoder, RawEntryReader}; impl Decoder for BzDecoder where diff --git a/src/reader/sync/entry_reader/deflate64_dec.rs b/src/reader/sync/entry_reader/deflate64_dec.rs index ccd339b..1bac41f 100644 --- a/src/reader/sync/entry_reader/deflate64_dec.rs +++ b/src/reader/sync/entry_reader/deflate64_dec.rs @@ -2,7 +2,7 @@ use std::io::{BufReader, Read}; use deflate64::Deflate64Decoder; -use crate::reader::{sync::Decoder, RawEntryReader}; +use crate::reader::{sync::decoder::Decoder, RawEntryReader}; impl Decoder for Deflate64Decoder> where diff --git a/src/reader/sync/entry_reader/deflate_dec.rs b/src/reader/sync/entry_reader/deflate_dec.rs index 3e07d6a..a915515 100644 --- a/src/reader/sync/entry_reader/deflate_dec.rs +++ b/src/reader/sync/entry_reader/deflate_dec.rs @@ -2,7 +2,7 @@ use std::io::Read; use flate2::read::DeflateDecoder; -use crate::reader::{sync::Decoder, RawEntryReader}; +use crate::reader::{sync::decoder::Decoder, RawEntryReader}; impl Decoder for DeflateDecoder where diff --git a/src/reader/sync/entry_reader/lzma_dec.rs b/src/reader/sync/entry_reader/lzma_dec.rs index c1b7d4e..eb66bb3 100644 --- a/src/reader/sync/entry_reader/lzma_dec.rs +++ b/src/reader/sync/entry_reader/lzma_dec.rs @@ -2,7 +2,7 @@ use lzma_rs::decompress::Stream; use 
std::io::{Read, Write}; use crate::{ - reader::{sync::Decoder, RawEntryReader}, + reader::{sync::decoder::Decoder, RawEntryReader}, Error, UnsupportedError, }; diff --git a/src/reader/sync/entry_reader/mod.rs b/src/reader/sync/entry_reader/mod.rs index 3e05d3c..9e59bd4 100644 --- a/src/reader/sync/entry_reader/mod.rs +++ b/src/reader/sync/entry_reader/mod.rs @@ -40,6 +40,7 @@ struct EntryReadMetrics { crc32: u32, } +#[derive(Default)] enum State { ReadLocalHeader { buffer: Buffer, @@ -61,15 +62,10 @@ enum State { descriptor: Option, }, Done, + #[default] Transitioning, } -impl Default for State { - fn default() -> Self { - State::Transitioning - } -} - pub struct EntryReader where R: io::Read, @@ -266,7 +262,7 @@ where fn get_decoder( &self, - mut raw_r: RawEntryReader, + raw_r: RawEntryReader, ) -> Result>, Error> { let decoder: Box> = match self.method { Method::Store => Box::new(StoreDecoder::new(raw_r)), diff --git a/src/reader/sync/entry_reader/zstd_dec.rs b/src/reader/sync/entry_reader/zstd_dec.rs index 6f42b7e..712c456 100644 --- a/src/reader/sync/entry_reader/zstd_dec.rs +++ b/src/reader/sync/entry_reader/zstd_dec.rs @@ -2,7 +2,7 @@ use std::io::{BufRead, Read}; use zstd::stream::Decoder as ZstdDecoder; -use crate::reader::{sync::Decoder, RawEntryReader}; +use crate::reader::{sync::decoder::Decoder, RawEntryReader}; impl Decoder for ZstdDecoder<'static, R> where diff --git a/src/reader/sync/mod.rs b/src/reader/sync/mod.rs index 276a2e5..0075111 100644 --- a/src/reader/sync/mod.rs +++ b/src/reader/sync/mod.rs @@ -1,10 +1,7 @@ mod decoder; -use decoder::*; - mod entry_reader; -use entry_reader::*; - mod read_zip; // re-exports +pub use entry_reader::EntryReader; pub use read_zip::{ReadZip, ReadZipWithSize}; diff --git a/src/reader/tokio/entry_reader/mod.rs b/src/reader/tokio/entry_reader/mod.rs index fb216e1..5445f9a 100644 --- a/src/reader/tokio/entry_reader/mod.rs +++ b/src/reader/tokio/entry_reader/mod.rs @@ -25,6 +25,7 @@ struct EntryReadMetrics { 
pin_project_lite::pin_project! { #[project = StateProj] + #[derive(Default)] enum State { ReadLocalHeader { buffer: Buffer, @@ -47,16 +48,11 @@ pin_project_lite::pin_project! { descriptor: Option, }, Done, + #[default] Transitioning, } } -impl Default for State { - fn default() -> Self { - State::Transitioning - } -} - pin_project_lite::pin_project! { pub struct AsyncEntryReader where @@ -170,7 +166,7 @@ where self.poll_read(cx, buf) } n => { - **uncompressed_size = **uncompressed_size + n as u64; + **uncompressed_size += n as u64; let read_slice = &buf.filled()[filled_before..filled_after]; hasher.update(read_slice); Ok(()).into() diff --git a/src/reader/tokio/mod.rs b/src/reader/tokio/mod.rs index 4fd27db..419f6d0 100644 --- a/src/reader/tokio/mod.rs +++ b/src/reader/tokio/mod.rs @@ -1,5 +1,2 @@ mod decoder; -pub use decoder::*; - mod entry_reader; -pub use entry_reader::*; From 3b776d785e40e20bb6bbdd4f47cd2a4b118ccd6b Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Thu, 1 Feb 2024 18:55:01 +0100 Subject: [PATCH 08/49] It checks --- src/reader/sync/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/reader/sync/mod.rs b/src/reader/sync/mod.rs index 0075111..e4d063f 100644 --- a/src/reader/sync/mod.rs +++ b/src/reader/sync/mod.rs @@ -4,4 +4,4 @@ mod read_zip; // re-exports pub use entry_reader::EntryReader; -pub use read_zip::{ReadZip, ReadZipWithSize}; +pub use read_zip::{HasCursor, ReadZip, ReadZipWithSize, SyncArchive, SyncStoredEntry}; From 821bebefe2a1c39e49f0f19a8bfcaaac6c4bc020 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Thu, 1 Feb 2024 22:26:46 +0100 Subject: [PATCH 09/49] Full async impl? 
--- Cargo.lock | 24 +++ Cargo.toml | 2 +- src/reader/archive_reader.rs | 19 +++ src/reader/buffer.rs | 20 +++ src/reader/tokio/mod.rs | 7 + src/reader/tokio/read_zip.rs | 277 +++++++++++++++++++++++++++++++++++ 6 files changed, 348 insertions(+), 1 deletion(-) create mode 100644 src/reader/tokio/read_zip.rs diff --git a/Cargo.lock b/Cargo.lock index 79ac253..db1e186 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -122,6 +122,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "bytes" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" + [[package]] name = "bzip2" version = "0.4.4" @@ -417,6 +423,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "hermit-abi" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d3d0e0f38255e7fa3cf31335b3a56f05febd18025f4db5ef7a0cfb4f8da651f" + [[package]] name = "humansize" version = "2.1.3" @@ -582,6 +594,16 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "num_enum" version = "0.7.2" @@ -994,6 +1016,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c89b4efa943be685f629b149f53829423f8f5531ea21249408e8e2f8671ec104" dependencies = [ "backtrace", + "bytes", + "num_cpus", "pin-project-lite", "tokio-macros", ] diff --git a/Cargo.toml b/Cargo.toml index 97e3536..a942504 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,7 +39,7 @@ lzma-rs = { 
version = "0.3.0", features = ["stream"], optional = true } deflate64 = { version = "0.1.7", optional = true } bzip2 = { version = "0.4.4", optional = true } zstd = { version = "0.13.0", optional = true } -tokio = { version = "1.35.1", optional = true } +tokio = { version = "1.35.1", optional = true, features = ["fs", "io-util", "rt-multi-thread"] } futures = { version = "0.3.30", optional = true } pin-project-lite = { version = "0.2.13", optional = true } diff --git a/src/reader/archive_reader.rs b/src/reader/archive_reader.rs index bbd1026..a372210 100644 --- a/src/reader/archive_reader.rs +++ b/src/reader/archive_reader.rs @@ -154,6 +154,25 @@ impl ArchiveReader { } } + /// Reads some data from `rd` (which implements AsyncRead) into the + /// reader's internal buffer. + /// + /// Any I/O errors will be returned. + /// + /// If successful, this returns the number of bytes read. On success, + /// [process()](ArchiveReader::process()) should be called next. + #[cfg(feature = "tokio")] + pub async fn read_async( + &mut self, + rd: &mut R, + ) -> Result { + if let Some(buffer) = self.state.buffer_as_mut() { + buffer.read_async(rd).await + } else { + Ok(0) + } + } + /// Process buffered data /// /// Errors returned from process() are caused by invalid zip archives, diff --git a/src/reader/buffer.rs b/src/reader/buffer.rs index 7e2fda9..a132ab9 100644 --- a/src/reader/buffer.rs +++ b/src/reader/buffer.rs @@ -66,6 +66,26 @@ impl Buffer { } } + /// fill that buffer from the given AsyncRead + #[cfg(feature = "tokio")] + pub(crate) async fn read_async( + &mut self, + rd: &mut (impl tokio::io::AsyncRead + Unpin), + ) -> Result { + if self.buffer.available_space() == 0 { + trace!("uh oh, buffer has no available space!") + } + + match tokio::io::AsyncReadExt::read(rd, self.buffer.space()).await { + Ok(written) => { + self.read_bytes += written as u64; + self.buffer.fill(written); + Ok(written) + } + Err(e) => Err(e), + } + } + pub(crate) fn read_offset(&self, offset: u64) -> 
u64 { self.read_bytes + offset } diff --git a/src/reader/tokio/mod.rs b/src/reader/tokio/mod.rs index 419f6d0..75b4ffb 100644 --- a/src/reader/tokio/mod.rs +++ b/src/reader/tokio/mod.rs @@ -1,2 +1,9 @@ mod decoder; mod entry_reader; +mod read_zip; + +// re-exports +pub use entry_reader::AsyncEntryReader; +pub use read_zip::{ + AsyncArchive, AsyncReadZip, AsyncReadZipWithSize, AsyncStoredEntry, HasAsyncCursor, +}; diff --git a/src/reader/tokio/read_zip.rs b/src/reader/tokio/read_zip.rs new file mode 100644 index 0000000..3e731d6 --- /dev/null +++ b/src/reader/tokio/read_zip.rs @@ -0,0 +1,277 @@ +use std::{io, ops::Deref, pin::Pin, sync::Arc, task}; + +use futures::future::BoxFuture; +use positioned_io::{RandomAccessFile, ReadAt}; +use tokio::io::{AsyncRead, AsyncReadExt, ReadBuf}; + +use crate::{ + reader::{tokio::AsyncEntryReader, ArchiveReader, ArchiveReaderResult}, + Archive, Error, +}; + +/// A trait for reading something as a zip archive (blocking I/O model) +/// +/// See also [ReadZip]. +pub trait AsyncReadZipWithSize { + type File: HasAsyncCursor; + + /// Reads self as a zip archive. + /// + /// This functions blocks until the entire archive has been read. + /// It is not compatible with non-blocking or async I/O. + async fn read_zip_with_size(&self, size: u64) -> Result, Error>; +} + +/// A trait for reading something as a zip archive (blocking I/O model), +/// when we can tell size from self. +/// +/// See also [ReadZipWithSize]. +pub trait AsyncReadZip { + type File: HasAsyncCursor; + + /// Reads self as a zip archive. + /// + /// This functions blocks until the entire archive has been read. + /// It is not compatible with non-blocking or async I/O. 
+ async fn read_zip(&self) -> Result, Error>; +} + +impl AsyncReadZipWithSize for F +where + F: HasAsyncCursor, +{ + type File = F; + + async fn read_zip_with_size(&self, size: u64) -> Result, Error> { + let mut ar = ArchiveReader::new(size); + loop { + if let Some(offset) = ar.wants_read() { + match ar.read_async(&mut self.cursor_at(offset)).await { + Ok(read_bytes) => { + if read_bytes == 0 { + return Err(Error::IO(io::ErrorKind::UnexpectedEof.into())); + } + } + Err(err) => return Err(Error::IO(err)), + } + } + + match ar.process()? { + ArchiveReaderResult::Done(archive) => { + return Ok(AsyncArchive { + file: self, + archive, + }) + } + ArchiveReaderResult::Continue => {} + } + } + } +} + +impl AsyncReadZip for &[u8] { + type File = Self; + + async fn read_zip(&self) -> Result, Error> { + self.read_zip_with_size(self.len() as u64).await + } +} + +impl AsyncReadZip for Vec { + type File = Self; + + async fn read_zip(&self) -> Result, Error> { + self.read_zip_with_size(self.len() as u64).await + } +} + +pub struct AsyncArchive<'a, F> +where + F: HasAsyncCursor, +{ + file: &'a F, + archive: Archive, +} + +impl Deref for AsyncArchive<'_, F> +where + F: HasAsyncCursor, +{ + type Target = Archive; + + fn deref(&self) -> &Self::Target { + &self.archive + } +} + +impl AsyncArchive<'_, F> +where + F: HasAsyncCursor, +{ + /// Iterate over all files in this zip, read from the central directory. + pub fn entries(&self) -> impl Iterator> { + self.archive.entries().map(move |entry| AsyncStoredEntry { + file: self.file, + entry, + }) + } + + /// Attempts to look up an entry by name. This is usually a bad idea, + /// as names aren't necessarily normalized in zip archives. 
+    pub fn by_name>(&self, name: N) -> Option> {
+        self.entries
+            .iter()
+            .find(|&x| x.name() == name.as_ref())
+            .map(|entry| AsyncStoredEntry {
+                file: self.file,
+                entry,
+            })
+    }
+}
+
+pub struct AsyncStoredEntry<'a, F> {
+    file: &'a F,
+    entry: &'a crate::StoredEntry,
+}
+
+impl Deref for AsyncStoredEntry<'_, F> {
+    type Target = crate::StoredEntry;
+
+    fn deref(&self) -> &Self::Target {
+        self.entry
+    }
+}
+
+impl<'a, F> AsyncStoredEntry<'a, F>
+where
+    F: HasAsyncCursor,
+{
+    /// Returns a reader for the entry.
+    pub fn reader(&self) -> AsyncEntryReader<::Cursor<'a>> {
+        tracing::trace!("Creating EntryReader");
+        AsyncEntryReader::new(self.entry, |offset| self.file.cursor_at(offset))
+    }
+
+    /// Reads the entire entry into a vector.
+    pub async fn bytes(&self) -> io::Result> {
+        let mut v = Vec::new();
+        self.reader().read_to_end(&mut v).await?;
+        Ok(v)
+    }
+}
+
+/// A sliceable I/O resource: we can ask for an [AsyncRead] at a given offset.
+pub trait HasAsyncCursor {
+    type Cursor<'a>: AsyncRead + Unpin + 'a
+    where
+        Self: 'a;
+
+    /// Returns an [AsyncRead] at the given offset.
+    fn cursor_at(&self, offset: u64) -> Self::Cursor<'_>;
+}
+
+impl HasAsyncCursor for &[u8] {
+    type Cursor<'a> = &'a [u8]
+    where
+        Self: 'a;
+
+    fn cursor_at(&self, offset: u64) -> Self::Cursor<'_> {
+        &self[offset.try_into().unwrap()..]
+    }
+}
+
+impl HasAsyncCursor for Vec {
+    type Cursor<'a> = &'a [u8]
+    where
+        Self: 'a;
+
+    fn cursor_at(&self, offset: u64) -> Self::Cursor<'_> {
+        &self[offset.try_into().unwrap()..]
+ } +} + +impl HasAsyncCursor for Arc { + type Cursor<'a> = AsyncRandomAccessFileCursor + where + Self: 'a; + + fn cursor_at(&self, offset: u64) -> Self::Cursor<'_> { + AsyncRandomAccessFileCursor { + pos: offset, + state: ARAFCState::Idle(ARAFCCore { + inner_buf: vec![0u8; 128 * 1024], + file: self.clone(), + }), + } + } +} + +struct ARAFCCore { + inner_buf: Vec, + file: Arc, +} + +type JoinResult = Result; + +#[derive(Default)] +enum ARAFCState { + Idle(ARAFCCore), + Reading { + fut: BoxFuture<'static, JoinResult<(Result, ARAFCCore)>>, + }, + + #[default] + Transitioning, +} + +/// A cursor for reading from a [RandomAccessFile] asynchronously. +pub struct AsyncRandomAccessFileCursor { + pos: u64, + state: ARAFCState, +} + +impl AsyncRead for AsyncRandomAccessFileCursor { + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut task::Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> task::Poll> { + match &mut self.state { + ARAFCState::Idle { .. } => { + let mut core = match std::mem::take(&mut self.state) { + ARAFCState::Idle(core) => core, + _ => unreachable!(), + }; + let pos = self.pos; + let fut = Box::pin(tokio::task::spawn_blocking(move || { + let read = core.file.read_at(pos, &mut core.inner_buf); + (read, core) + })); + self.state = ARAFCState::Reading { fut }; + self.poll_read(cx, buf) + } + ARAFCState::Reading { fut } => { + let (read, core) = match fut.as_mut().poll(cx) { + task::Poll::Ready(Ok(r)) => r, + task::Poll::Ready(Err(e)) => { + return task::Poll::Ready(Err(io::Error::new( + io::ErrorKind::Other, + e.to_string(), + ))) + } + task::Poll::Pending => return task::Poll::Pending, + }; + match read { + Ok(read) => { + self.pos += read as u64; + buf.put_slice(&core.inner_buf[..read]); + self.state = ARAFCState::Idle(core); + task::Poll::Ready(Ok(())) + } + Err(e) => task::Poll::Ready(Err(e)), + } + } + ARAFCState::Transitioning => unreachable!(), + } + } +} From 4c5ccaba541dd60aff5f208fe71bfa2fd9e117c3 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Thu, 
1 Feb 2024 22:32:11 +0100 Subject: [PATCH 10/49] Now we need deflate --- src/reader/tokio/read_zip.rs | 19 +++--- tests/integration_tests.rs | 115 +++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 7 deletions(-) diff --git a/src/reader/tokio/read_zip.rs b/src/reader/tokio/read_zip.rs index 3e731d6..fc060c9 100644 --- a/src/reader/tokio/read_zip.rs +++ b/src/reader/tokio/read_zip.rs @@ -19,7 +19,11 @@ pub trait AsyncReadZipWithSize { /// /// This functions blocks until the entire archive has been read. /// It is not compatible with non-blocking or async I/O. - async fn read_zip_with_size(&self, size: u64) -> Result, Error>; + #[allow(async_fn_in_trait)] + async fn read_zip_with_size_async( + &self, + size: u64, + ) -> Result, Error>; } /// A trait for reading something as a zip archive (blocking I/O model), @@ -33,7 +37,8 @@ pub trait AsyncReadZip { /// /// This functions blocks until the entire archive has been read. /// It is not compatible with non-blocking or async I/O. 
- async fn read_zip(&self) -> Result, Error>; + #[allow(async_fn_in_trait)] + async fn read_zip_async(&self) -> Result, Error>; } impl AsyncReadZipWithSize for F @@ -42,7 +47,7 @@ where { type File = F; - async fn read_zip_with_size(&self, size: u64) -> Result, Error> { + async fn read_zip_with_size_async(&self, size: u64) -> Result, Error> { let mut ar = ArchiveReader::new(size); loop { if let Some(offset) = ar.wants_read() { @@ -72,16 +77,16 @@ where impl AsyncReadZip for &[u8] { type File = Self; - async fn read_zip(&self) -> Result, Error> { - self.read_zip_with_size(self.len() as u64).await + async fn read_zip_async(&self) -> Result, Error> { + self.read_zip_with_size_async(self.len() as u64).await } } impl AsyncReadZip for Vec { type File = Self; - async fn read_zip(&self) -> Result, Error> { - self.read_zip_with_size(self.len() as u64).await + async fn read_zip_async(&self) -> Result, Error> { + self.read_zip_with_size_async(self.len() as u64).await } } diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index b7741b7..9b2d5f1 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -7,6 +7,10 @@ use rc_zip::{ reader::sync::{HasCursor, SyncArchive, SyncStoredEntry}, Archive, Encoding, }; + +#[cfg(feature = "tokio")] +use rc_zip::reader::tokio::{AsyncArchive, AsyncReadZip, AsyncStoredEntry, HasAsyncCursor}; + use std::{fs::File, path::PathBuf}; enum ZipSource { @@ -73,6 +77,49 @@ impl ZipTest { f.check(&archive); } } + + #[cfg(feature = "tokio")] + async fn check_async( + &self, + archive: Result, rc_zip::Error>, + ) { + let case_bytes = self.bytes(); + + if let Some(expected) = &self.error { + let actual = match archive { + Err(e) => e, + Ok(_) => panic!("should have failed"), + }; + let expected = format!("{:#?}", expected); + let actual = format!("{:#?}", actual); + assert_eq!(expected, actual); + return; + } + let archive = archive.unwrap(); + + assert_eq!(case_bytes.len() as u64, archive.size()); + + if let 
Some(expected) = self.comment { + assert_eq!(expected, archive.comment().expect("should have comment")) + } + + if let Some(exp_encoding) = self.expected_encoding { + println!("{}: should be {}", self.name(), exp_encoding); + assert_eq!(archive.encoding(), exp_encoding); + } + + assert_eq!( + self.files.len(), + archive.entries().count(), + "{} should have {} entries files", + self.name(), + self.files.len() + ); + + for f in &self.files { + f.check_async(&archive).await; + } + } } struct ZipTestFile { @@ -140,6 +187,65 @@ impl ZipTestFile { } } } +#[cfg(feature = "tokio")] +impl ZipTestFile { + async fn check_async(&self, archive: &AsyncArchive<'_, F>) { + let entry = archive + .by_name(self.name) + .unwrap_or_else(|| panic!("entry {} should exist", self.name)); + + let archive_inner: &Archive = archive; + let entry_inner = archive_inner.by_name(self.name).unwrap(); + assert_eq!(entry.name(), entry_inner.name()); + + self.check_against_async(entry).await; + } + + async fn check_against_async(&self, entry: AsyncStoredEntry<'_, F>) { + if let Some(expected) = self.modified { + assert_eq!( + expected, + entry.modified(), + "entry {} should have modified = {:?}", + entry.name(), + expected + ) + } + + if let Some(mode) = self.mode { + assert_eq!(entry.mode.0 & 0o777, mode); + } + + // I have honestly yet to see a zip file _entry_ with a comment. 
+ assert!(entry.comment().is_none()); + + match entry.contents() { + rc_zip::EntryContents::File => { + let actual_bytes = entry.bytes().await.unwrap(); + + match &self.content { + FileContent::Unchecked => { + // ah well + } + FileContent::Bytes(expected_bytes) => { + // first check length + assert_eq!(actual_bytes.len(), expected_bytes.len()); + assert_eq!(&actual_bytes[..], &expected_bytes[..]) + } + FileContent::File(file_path) => { + let expected_bytes = std::fs::read(zips_dir().join(file_path)).unwrap(); + // first check length + assert_eq!(actual_bytes.len(), expected_bytes.len()); + assert_eq!(&actual_bytes[..], &expected_bytes[..]) + } + } + } + rc_zip::EntryContents::Symlink | rc_zip::EntryContents::Directory => { + assert!(matches!(self.content, FileContent::Unchecked)); + } + } + } +} enum FileContent { Unchecked, @@ -345,6 +451,15 @@ fn real_world_files() { } } +#[cfg(feature = "tokio")] +#[test_log::test(tokio::test)] +async fn real_world_files_async() { + for case in test_cases() { + tracing::trace!("============ testing {}", case.name()); + case.check_async(case.bytes().read_zip_async().await).await; + } +} + #[test_log::test] fn state_machine() { use rc_zip::reader::{ArchiveReader, ArchiveReaderResult}; From fe0eb7fbd7f00dd0433de470078ad21483aef872 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Thu, 1 Feb 2024 22:45:45 +0100 Subject: [PATCH 11/49] Failing tests, fun! 
--- Cargo.lock | 14 +++++++++++++ Cargo.toml | 3 ++- src/reader/sync/entry_reader/mod.rs | 8 +++----- src/reader/tokio/entry_reader/deflate_dec.rs | 21 ++++++++++++++++++++ src/reader/tokio/entry_reader/mod.rs | 13 ++++++++++++ 5 files changed, 53 insertions(+), 6 deletions(-) create mode 100644 src/reader/tokio/entry_reader/deflate_dec.rs diff --git a/Cargo.lock b/Cargo.lock index db1e186..1d535d8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -89,6 +89,19 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "async-compression" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a116f46a969224200a0a97f29cfd4c50e7534e4b4826bd23ea2c3c533039c82c" +dependencies = [ + "flate2", + "futures-core", + "memchr", + "pin-project-lite", + "tokio", +] + [[package]] name = "autocfg" version = "1.1.0" @@ -795,6 +808,7 @@ checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" name = "rc-zip" version = "2.0.1" dependencies = [ + "async-compression", "byteorder", "bzip2", "cfg-if", diff --git a/Cargo.toml b/Cargo.toml index a942504..ed1ae98 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,6 +42,7 @@ zstd = { version = "0.13.0", optional = true } tokio = { version = "1.35.1", optional = true, features = ["fs", "io-util", "rt-multi-thread"] } futures = { version = "0.3.30", optional = true } pin-project-lite = { version = "0.2.13", optional = true } +async-compression = { version = "0.4.6", optional = true, features = ["tokio", "deflate"] } [features] default = ["sync", "file", "deflate"] @@ -52,7 +53,7 @@ deflate64 = ["dep:deflate64"] lzma = ["dep:lzma-rs"] bzip2 = ["dep:bzip2"] zstd = ["dep:zstd"] -tokio = ["dep:tokio", "dep:futures", "dep:pin-project-lite"] +tokio = ["dep:tokio", "dep:futures", "dep:pin-project-lite", "async-compression"] [dev-dependencies] clap = { version = "4.4.18", features = ["derive"] } diff --git a/src/reader/sync/entry_reader/mod.rs b/src/reader/sync/entry_reader/mod.rs index 
9e59bd4..ad09c0e 100644 --- a/src/reader/sync/entry_reader/mod.rs +++ b/src/reader/sync/entry_reader/mod.rs @@ -1,5 +1,3 @@ -//! This part of the API is still being designed - no guarantees are made -//! whatsoever. use crate::{ error::*, format::*, @@ -10,9 +8,6 @@ use crate::{ transition, }; -#[cfg(feature = "lzma")] -mod lzma_dec; - #[cfg(feature = "deflate")] mod deflate_dec; @@ -22,6 +17,9 @@ mod deflate64_dec; #[cfg(feature = "bzip2")] mod bzip2_dec; +#[cfg(feature = "lzma")] +mod lzma_dec; + #[cfg(feature = "zstd")] mod zstd_dec; diff --git a/src/reader/tokio/entry_reader/deflate_dec.rs b/src/reader/tokio/entry_reader/deflate_dec.rs new file mode 100644 index 0000000..f89c773 --- /dev/null +++ b/src/reader/tokio/entry_reader/deflate_dec.rs @@ -0,0 +1,21 @@ +use async_compression::tokio::bufread::DeflateDecoder; +use tokio::io::AsyncBufRead; + +use crate::reader::{tokio::decoder::AsyncDecoder, RawEntryReader}; + +impl AsyncDecoder for DeflateDecoder +where + R: AsyncBufRead, +{ + fn into_inner(self: Box) -> R { + Self::into_inner(*self) + } + + fn get_mut(&mut self) -> &mut R { + Self::get_mut(self) + } +} + +pub(crate) fn mk_decoder(r: RawEntryReader) -> impl AsyncDecoder { + DeflateDecoder::new(r) +} diff --git a/src/reader/tokio/entry_reader/mod.rs b/src/reader/tokio/entry_reader/mod.rs index 5445f9a..ff9404d 100644 --- a/src/reader/tokio/entry_reader/mod.rs +++ b/src/reader/tokio/entry_reader/mod.rs @@ -8,6 +8,10 @@ use crate::{ transition_async, }; +#[cfg(feature = "deflate")] +mod deflate_dec; + +use cfg_if::cfg_if; use oval::Buffer; use std::{io, pin::Pin, task}; use tokio::io::AsyncRead; @@ -279,6 +283,15 @@ fn method_to_decoder( ) -> Result + Unpin>, Error> { let decoder: Box + Unpin> = match method { Method::Store => Box::new(StoreAsyncDecoder::new(raw_r)), + Method::Deflate => { + cfg_if! 
{ + if #[cfg(feature = "deflate")] { + Box::new(deflate_dec::mk_decoder(raw_r)) + } else { + return Err(Error::method_not_enabled(self.method)); + } + } + } method => { return Err(Error::method_not_supported(method)); } From f91c6047e41edecaa5712da3c57553aaf2c71df7 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Thu, 1 Feb 2024 22:48:52 +0100 Subject: [PATCH 12/49] async deflate test work, it seems --- src/reader/tokio/decoder.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/reader/tokio/decoder.rs b/src/reader/tokio/decoder.rs index d29a171..dc5994c 100644 --- a/src/reader/tokio/decoder.rs +++ b/src/reader/tokio/decoder.rs @@ -84,11 +84,16 @@ impl AsyncRead for RawEntryReader { _cx: &mut task::Context<'_>, buf: &mut tokio::io::ReadBuf<'_>, ) -> task::Poll> { - let len = cmp::min(buf.remaining() as u64, self.remaining) as usize; + let len = cmp::min( + buf.remaining() as u64, + cmp::min(self.remaining, self.inner.available_data() as _), + ) as usize; tracing::trace!(%len, buf_remaining = buf.remaining(), remaining = self.remaining, available_data = self.inner.available_data(), available_space = self.inner.available_space(), "computing len"); buf.put_slice(&self.inner.data()[..len]); self.as_mut().inner.consume(len); + self.remaining -= len as u64; + Ok(()).into() } } From 1413e7a73d01636da1a676bc4792493e3b911ef9 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Thu, 1 Feb 2024 23:58:01 +0100 Subject: [PATCH 13/49] Split sync/tokio into their own separate crates --- .vscode/settings.json | 2 - Cargo.lock | 84 ++++++--- Cargo.toml | 74 +------- codecov.yml | 2 - examples/jean/.gitignore | 2 - examples/jean/Cargo.toml | 23 --- rc-zip-sync/Cargo.toml | 52 ++++++ .../main.rs => rc-zip-sync/examples/jean.rs | 3 +- .../sync => rc-zip-sync/src}/decoder.rs | 25 ++- .../src}/entry_reader/bzip2_dec.rs | 2 +- .../src}/entry_reader/deflate64_dec.rs | 2 +- .../src}/entry_reader/deflate_dec.rs | 2 +- .../src}/entry_reader/lzma_dec.rs | 9 +- 
.../src}/entry_reader/mod.rs | 17 +- .../src}/entry_reader/zstd_dec.rs | 2 +- rc-zip-sync/src/lib.rs | 17 ++ .../sync => rc-zip-sync/src}/read_zip.rs | 20 ++- .../tests}/integration_tests.rs | 148 ++------------- rc-zip-tokio/Cargo.toml | 38 ++++ .../tokio => rc-zip-tokio/src}/decoder.rs | 25 ++- .../src}/entry_reader/deflate_dec.rs | 2 +- .../src}/entry_reader/mod.rs | 16 +- .../macros.rs => rc-zip-tokio/src/lib.rs | 22 ++- .../tokio => rc-zip-tokio/src}/read_zip.rs | 19 +- rc-zip/Cargo.toml | 31 ++++ {src => rc-zip/src}/encoding.rs | 0 {src => rc-zip/src}/error.rs | 6 +- {src => rc-zip/src}/format/archive.rs | 0 {src => rc-zip/src}/format/date_time.rs | 0 .../src}/format/directory_header.rs | 0 {src => rc-zip/src}/format/eocd.rs | 0 {src => rc-zip/src}/format/extra_field.rs | 0 {src => rc-zip/src}/format/local.rs | 0 {src => rc-zip/src}/format/mod.rs | 0 {src => rc-zip/src}/format/mode.rs | 0 {src => rc-zip/src}/format/raw.rs | 0 {src => rc-zip/src}/format/version.rs | 0 {src => rc-zip/src}/lib.rs | 8 +- .../archive_reader.rs => rc-zip/src/reader.rs | 168 +++++++++++++----- src/prelude.rs | 4 - src/reader/buffer.rs | 92 ---------- src/reader/mod.rs | 38 ---- src/reader/sync/mod.rs | 7 - src/reader/tokio/mod.rs | 9 - {tests/data => testdata}/cp-437.zip | Bin .../data => testdata}/crc32-not-streamed.zip | Bin {tests/data => testdata}/dd.zip | Bin {tests/data => testdata}/found-me-bzip2.zip | Bin .../data => testdata}/found-me-deflate64.zip | Bin {tests/data => testdata}/found-me-lzma.zip | Bin {tests/data => testdata}/found-me-zstd.zip | Bin {tests/data => testdata}/found-me.txt | 0 .../data => testdata}/go-no-datadesc-sig.zip | Bin .../go-with-datadesc-sig.zip | Bin {tests/data => testdata}/gophercolor16x16.png | Bin {tests/data => testdata}/readme.notzip | 0 {tests/data => testdata}/readme.trailingzip | Bin {tests/data => testdata}/readme.zip | Bin {tests/data => testdata}/shift-jis.zip | Bin {tests/data => testdata}/symlink.zip | Bin .../data => 
testdata}/test-trailing-junk.zip | Bin {tests/data => testdata}/test.zip | Bin {tests/data => testdata}/time-22738.zip | Bin {tests/data => testdata}/time-7zip.zip | Bin {tests/data => testdata}/time-go.zip | Bin {tests/data => testdata}/time-infozip.zip | Bin {tests/data => testdata}/time-osx.zip | Bin {tests/data => testdata}/time-win7.zip | Bin {tests/data => testdata}/time-winrar.zip | Bin {tests/data => testdata}/time-winzip.zip | Bin {tests/data => testdata}/unix.zip | Bin {tests/data => testdata}/utf8-7zip.zip | Bin {tests/data => testdata}/utf8-infozip.zip | Bin {tests/data => testdata}/utf8-osx.zip | Bin {tests/data => testdata}/utf8-winrar.zip | Bin {tests/data => testdata}/utf8-winzip.zip | Bin {tests/data => testdata}/winxp.zip | Bin {tests/data => testdata}/zip64-2.zip | Bin {tests/data => testdata}/zip64.zip | Bin 79 files changed, 458 insertions(+), 513 deletions(-) delete mode 100644 codecov.yml delete mode 100644 examples/jean/.gitignore delete mode 100644 examples/jean/Cargo.toml create mode 100644 rc-zip-sync/Cargo.toml rename examples/jean/src/main.rs => rc-zip-sync/examples/jean.rs (99%) rename {src/reader/sync => rc-zip-sync/src}/decoder.rs (75%) rename {src/reader/sync => rc-zip-sync/src}/entry_reader/bzip2_dec.rs (85%) rename {src/reader/sync => rc-zip-sync/src}/entry_reader/deflate64_dec.rs (87%) rename {src/reader/sync => rc-zip-sync/src}/entry_reader/deflate_dec.rs (86%) rename {src/reader/sync => rc-zip-sync/src}/entry_reader/lzma_dec.rs (96%) rename {src/reader/sync => rc-zip-sync/src}/entry_reader/mod.rs (97%) rename {src/reader/sync => rc-zip-sync/src}/entry_reader/zstd_dec.rs (87%) create mode 100644 rc-zip-sync/src/lib.rs rename {src/reader/sync => rc-zip-sync/src}/read_zip.rs (93%) rename {tests => rc-zip-sync/tests}/integration_tests.rs (73%) create mode 100644 rc-zip-tokio/Cargo.toml rename {src/reader/tokio => rc-zip-tokio/src}/decoder.rs (80%) rename {src/reader/tokio => rc-zip-tokio/src}/entry_reader/deflate_dec.rs (86%) 
rename {src/reader/tokio => rc-zip-tokio/src}/entry_reader/mod.rs (98%) rename src/reader/macros.rs => rc-zip-tokio/src/lib.rs (50%) rename {src/reader/tokio => rc-zip-tokio/src}/read_zip.rs (95%) create mode 100644 rc-zip/Cargo.toml rename {src => rc-zip/src}/encoding.rs (100%) rename {src => rc-zip/src}/error.rs (96%) rename {src => rc-zip/src}/format/archive.rs (100%) rename {src => rc-zip/src}/format/date_time.rs (100%) rename {src => rc-zip/src}/format/directory_header.rs (100%) rename {src => rc-zip/src}/format/eocd.rs (100%) rename {src => rc-zip/src}/format/extra_field.rs (100%) rename {src => rc-zip/src}/format/local.rs (100%) rename {src => rc-zip/src}/format/mod.rs (100%) rename {src => rc-zip/src}/format/mode.rs (100%) rename {src => rc-zip/src}/format/raw.rs (100%) rename {src => rc-zip/src}/format/version.rs (100%) rename {src => rc-zip/src}/lib.rs (91%) rename src/reader/archive_reader.rs => rc-zip/src/reader.rs (82%) delete mode 100644 src/prelude.rs delete mode 100644 src/reader/buffer.rs delete mode 100644 src/reader/mod.rs delete mode 100644 src/reader/sync/mod.rs delete mode 100644 src/reader/tokio/mod.rs rename {tests/data => testdata}/cp-437.zip (100%) rename {tests/data => testdata}/crc32-not-streamed.zip (100%) rename {tests/data => testdata}/dd.zip (100%) rename {tests/data => testdata}/found-me-bzip2.zip (100%) rename {tests/data => testdata}/found-me-deflate64.zip (100%) rename {tests/data => testdata}/found-me-lzma.zip (100%) rename {tests/data => testdata}/found-me-zstd.zip (100%) rename {tests/data => testdata}/found-me.txt (100%) rename {tests/data => testdata}/go-no-datadesc-sig.zip (100%) rename {tests/data => testdata}/go-with-datadesc-sig.zip (100%) rename {tests/data => testdata}/gophercolor16x16.png (100%) rename {tests/data => testdata}/readme.notzip (100%) rename {tests/data => testdata}/readme.trailingzip (100%) rename {tests/data => testdata}/readme.zip (100%) rename {tests/data => testdata}/shift-jis.zip (100%) rename 
{tests/data => testdata}/symlink.zip (100%) rename {tests/data => testdata}/test-trailing-junk.zip (100%) rename {tests/data => testdata}/test.zip (100%) rename {tests/data => testdata}/time-22738.zip (100%) rename {tests/data => testdata}/time-7zip.zip (100%) rename {tests/data => testdata}/time-go.zip (100%) rename {tests/data => testdata}/time-infozip.zip (100%) rename {tests/data => testdata}/time-osx.zip (100%) rename {tests/data => testdata}/time-win7.zip (100%) rename {tests/data => testdata}/time-winrar.zip (100%) rename {tests/data => testdata}/time-winzip.zip (100%) rename {tests/data => testdata}/unix.zip (100%) rename {tests/data => testdata}/utf8-7zip.zip (100%) rename {tests/data => testdata}/utf8-infozip.zip (100%) rename {tests/data => testdata}/utf8-osx.zip (100%) rename {tests/data => testdata}/utf8-winrar.zip (100%) rename {tests/data => testdata}/utf8-winzip.zip (100%) rename {tests/data => testdata}/winxp.zip (100%) rename {tests/data => testdata}/zip64-2.zip (100%) rename {tests/data => testdata}/zip64.zip (100%) diff --git a/.vscode/settings.json b/.vscode/settings.json index 2e949f9..8bf6ba4 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,7 +1,5 @@ { "rust-analyzer.cargo.features": [ - "default", - "tokio", "lzma", "deflate64", "bzip2", diff --git a/Cargo.lock b/Cargo.lock index 1d535d8..15987e9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -95,11 +95,16 @@ version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a116f46a969224200a0a97f29cfd4c50e7534e4b4826bd23ea2c3c533039c82c" dependencies = [ + "bzip2", + "deflate64", "flate2", "futures-core", "memchr", "pin-project-lite", "tokio", + "xz2", + "zstd", + "zstd-safe", ] [[package]] @@ -564,6 +569,17 @@ dependencies = [ "crc", ] +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = 
[ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "matchers" version = "0.1.0" @@ -808,36 +824,63 @@ checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" name = "rc-zip" version = "2.0.1" dependencies = [ - "async-compression", "byteorder", - "bzip2", "cfg-if", "chardetng", "chrono", + "crc32fast", + "encoding_rs", + "num_enum", + "oem_cp", + "oval", + "pretty-hex", + "thiserror", + "tracing", + "winnow", +] + +[[package]] +name = "rc-zip-sync" +version = "2.0.1" +dependencies = [ + "byteorder", + "bzip2", + "cfg-if", + "chrono", "clap", "crc32fast", "deflate64", - "encoding_rs", "flate2", - "futures", "humansize", "indicatif", "lzma-rs", - "num_enum", - "oem_cp", "oval", - "pin-project-lite", "positioned-io", - "pretty-hex", + "rc-zip", "test-log", - "thiserror", - "tokio", "tracing", "tracing-subscriber", "winnow", "zstd", ] +[[package]] +name = "rc-zip-tokio" +version = "2.0.1" +dependencies = [ + "async-compression", + "cfg-if", + "crc32fast", + "futures", + "oval", + "pin-project-lite", + "positioned-io", + "rc-zip", + "tokio", + "tracing", + "winnow", +] + [[package]] name = "regex" version = "1.10.3" @@ -1033,18 +1076,6 @@ dependencies = [ "bytes", "num_cpus", "pin-project-lite", - "tokio-macros", -] - -[[package]] -name = "tokio-macros" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" -dependencies = [ - "proc-macro2", - "quote", - "syn", ] [[package]] @@ -1309,6 +1340,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "zstd" version = "0.13.0" diff --git a/Cargo.toml b/Cargo.toml index ed1ae98..c77b87a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,67 +1,7 @@ -[package] -name = 
"rc-zip" -version = "2.0.1" -description = "zip reading" -repository = "https://github.com/fasterthanlime/rc-zip" -license = "Apache-2.0 OR MIT" -authors = ["Amos Wenger "] -edition = "2021" -readme = "README.md" - -keywords = ["zip", "unzip"] -categories = ["compression"] - -[lib] -name = "rc_zip" -path = "src/lib.rs" - -[[example]] -name = "jean" -path = "examples/jean/src/main.rs" - -[dependencies] -winnow = "0.5.36" -pretty-hex = "0.4.1" -oval = "2.0.0" -chrono = "0.4.33" -encoding_rs = "0.8.33" -crc32fast = "1.3.2" -positioned-io = { version = "0.3.3", optional = true } -tracing = "0.1.40" -oem_cp = "2.0.0" -thiserror = "1.0.56" -chardetng = "0.1.17" -flate2 = { version = "1.0.28", optional = true } -num_enum = "0.7.2" -byteorder = "1.5.0" -cfg-if = "1.0.0" -lzma-rs = { version = "0.3.0", features = ["stream"], optional = true } -deflate64 = { version = "0.1.7", optional = true } -bzip2 = { version = "0.4.4", optional = true } -zstd = { version = "0.13.0", optional = true } -tokio = { version = "1.35.1", optional = true, features = ["fs", "io-util", "rt-multi-thread"] } -futures = { version = "0.3.30", optional = true } -pin-project-lite = { version = "0.2.13", optional = true } -async-compression = { version = "0.4.6", optional = true, features = ["tokio", "deflate"] } - -[features] -default = ["sync", "file", "deflate"] -sync = [] -file = ["positioned-io"] -deflate = ["dep:flate2"] -deflate64 = ["dep:deflate64"] -lzma = ["dep:lzma-rs"] -bzip2 = ["dep:bzip2"] -zstd = ["dep:zstd"] -tokio = ["dep:tokio", "dep:futures", "dep:pin-project-lite", "async-compression"] - -[dev-dependencies] -clap = { version = "4.4.18", features = ["derive"] } -humansize = "2.1.3" -indicatif = "0.17.7" -test-log = { version = "0.2.14", default-features = false, features = ["tracing-subscriber", "trace"] } -tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } -tokio = { version = "1.35.1", features = ["macros"] } - -[profile.release] -debug = 1 +[workspace] +resolver 
= "2" +members = [ + "rc-zip", + "rc-zip-sync", + "rc-zip-tokio", +] \ No newline at end of file diff --git a/codecov.yml b/codecov.yml deleted file mode 100644 index 7b12ce4..0000000 --- a/codecov.yml +++ /dev/null @@ -1,2 +0,0 @@ -ignore: - - "crates/jean/**" \ No newline at end of file diff --git a/examples/jean/.gitignore b/examples/jean/.gitignore deleted file mode 100644 index 53eaa21..0000000 --- a/examples/jean/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -/target -**/*.rs.bk diff --git a/examples/jean/Cargo.toml b/examples/jean/Cargo.toml deleted file mode 100644 index 98dcc24..0000000 --- a/examples/jean/Cargo.toml +++ /dev/null @@ -1,23 +0,0 @@ -[package] -name = "jean" -version.workspace = true -repository.workspace = true -license.workspace = true -authors.workspace = true -edition = "2021" -publish = false - -[dependencies] -rc-zip = { path = "../rc-zip" } -clap = { version = "4.4.18", features = ["derive"] } -humansize = "2.1.3" -positioned-io.workspace = true -indicatif = "0.17.7" -tracing-subscriber = "0.3.18" -cfg-if = "1.0.0" - -[features] -default = ["lzma"] -deflate = ["rc-zip/deflate"] -deflate64 = ["rc-zip/deflate64"] -lzma = ["rc-zip/lzma"] diff --git a/rc-zip-sync/Cargo.toml b/rc-zip-sync/Cargo.toml new file mode 100644 index 0000000..ec7fe86 --- /dev/null +++ b/rc-zip-sync/Cargo.toml @@ -0,0 +1,52 @@ +[package] +name = "rc-zip-sync" +version = "2.0.1" +description = "Synchronous zip reading on top of rc-zip" +repository = "https://github.com/fasterthanlime/rc-zip" +license = "Apache-2.0 or MIT" +authors = ["Amos Wenger "] +edition = "2021" +readme = "README.md" + +keywords = ["zip", "unzip"] +categories = ["compression"] + +[lib] +name = "rc_zip_sync" +path = "src/lib.rs" + +[[example]] +name = "jean" +path = "examples/jean.rs" + +[dependencies] +positioned-io = { version = "0.3.3", optional = true } +flate2 = { version = "1.0.28", optional = true } +rc-zip = { version = "2.0.1", path = "../rc-zip" } +lzma-rs = { version = "0.3.0", features = 
["stream"], optional = true } +deflate64 = { version = "0.1.7", optional = true } +bzip2 = { version = "0.4.4", optional = true } +zstd = { version = "0.13.0", optional = true } +oval = "2.0.0" +crc32fast = "1.3.2" +tracing = "0.1.40" +byteorder = "1.5.0" +cfg-if = "1.0.0" +winnow = "0.5.36" + +[features] +default = ["file", "deflate"] +file = ["positioned-io"] +deflate = ["dep:flate2"] +deflate64 = ["dep:deflate64"] +lzma = ["dep:lzma-rs"] +bzip2 = ["dep:bzip2"] +zstd = ["dep:zstd"] + +[dev-dependencies] +chrono = "0.4.33" +clap = { version = "4.4.18", features = ["derive"] } +humansize = "2.1.3" +indicatif = "0.17.7" +test-log = { version = "0.2.14", default-features = false, features = ["tracing-subscriber", "trace"] } +tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } diff --git a/examples/jean/src/main.rs b/rc-zip-sync/examples/jean.rs similarity index 99% rename from examples/jean/src/main.rs rename to rc-zip-sync/examples/jean.rs index 841f179..df76da1 100644 --- a/examples/jean/src/main.rs +++ b/rc-zip-sync/examples/jean.rs @@ -1,7 +1,8 @@ use cfg_if::cfg_if; use clap::{Parser, Subcommand}; use humansize::{format_size, BINARY}; -use rc_zip::{prelude::*, EntryContents}; +use rc_zip::EntryContents; +use rc_zip_sync::ReadZip; use std::{ borrow::Cow, diff --git a/src/reader/sync/decoder.rs b/rc-zip-sync/src/decoder.rs similarity index 75% rename from src/reader/sync/decoder.rs rename to rc-zip-sync/src/decoder.rs index 7948de8..b826d8d 100644 --- a/src/reader/sync/decoder.rs +++ b/rc-zip-sync/src/decoder.rs @@ -2,7 +2,30 @@ use std::{cmp, io}; use oval::Buffer; -use crate::reader::RawEntryReader; +/// Only allows reading a fixed number of bytes from a [oval::Buffer], +/// used for reading the raw (compressed) data for a single zip file entry. +/// It also allows moving out the inner buffer afterwards. 
+pub(crate) struct RawEntryReader { + remaining: u64, + inner: Buffer, +} + +impl RawEntryReader { + pub(crate) fn new(inner: Buffer, entry_size: u64) -> Self { + Self { + inner, + remaining: entry_size, + } + } + + pub(crate) fn into_inner(self) -> Buffer { + self.inner + } + + pub(crate) fn get_mut(&mut self) -> &mut Buffer { + &mut self.inner + } +} pub(crate) trait Decoder: io::Read where diff --git a/src/reader/sync/entry_reader/bzip2_dec.rs b/rc-zip-sync/src/entry_reader/bzip2_dec.rs similarity index 85% rename from src/reader/sync/entry_reader/bzip2_dec.rs rename to rc-zip-sync/src/entry_reader/bzip2_dec.rs index d663476..e808831 100644 --- a/src/reader/sync/entry_reader/bzip2_dec.rs +++ b/rc-zip-sync/src/entry_reader/bzip2_dec.rs @@ -2,7 +2,7 @@ use std::io::Read; use bzip2::read::BzDecoder; -use crate::reader::{sync::decoder::Decoder, RawEntryReader}; +use crate::decoder::{Decoder, RawEntryReader}; impl Decoder for BzDecoder where diff --git a/src/reader/sync/entry_reader/deflate64_dec.rs b/rc-zip-sync/src/entry_reader/deflate64_dec.rs similarity index 87% rename from src/reader/sync/entry_reader/deflate64_dec.rs rename to rc-zip-sync/src/entry_reader/deflate64_dec.rs index 1bac41f..f9e6d22 100644 --- a/src/reader/sync/entry_reader/deflate64_dec.rs +++ b/rc-zip-sync/src/entry_reader/deflate64_dec.rs @@ -2,7 +2,7 @@ use std::io::{BufReader, Read}; use deflate64::Deflate64Decoder; -use crate::reader::{sync::decoder::Decoder, RawEntryReader}; +use crate::decoder::{Decoder, RawEntryReader}; impl Decoder for Deflate64Decoder> where diff --git a/src/reader/sync/entry_reader/deflate_dec.rs b/rc-zip-sync/src/entry_reader/deflate_dec.rs similarity index 86% rename from src/reader/sync/entry_reader/deflate_dec.rs rename to rc-zip-sync/src/entry_reader/deflate_dec.rs index a915515..db4e1e9 100644 --- a/src/reader/sync/entry_reader/deflate_dec.rs +++ b/rc-zip-sync/src/entry_reader/deflate_dec.rs @@ -2,7 +2,7 @@ use std::io::Read; use flate2::read::DeflateDecoder; -use 
crate::reader::{sync::decoder::Decoder, RawEntryReader}; +use crate::decoder::{Decoder, RawEntryReader}; impl Decoder for DeflateDecoder where diff --git a/src/reader/sync/entry_reader/lzma_dec.rs b/rc-zip-sync/src/entry_reader/lzma_dec.rs similarity index 96% rename from src/reader/sync/entry_reader/lzma_dec.rs rename to rc-zip-sync/src/entry_reader/lzma_dec.rs index eb66bb3..425e0ed 100644 --- a/src/reader/sync/entry_reader/lzma_dec.rs +++ b/rc-zip-sync/src/entry_reader/lzma_dec.rs @@ -1,10 +1,9 @@ +use byteorder::{LittleEndian, ReadBytesExt}; use lzma_rs::decompress::Stream; +use rc_zip::{Error, UnsupportedError}; use std::io::{Read, Write}; -use crate::{ - reader::{sync::decoder::Decoder, RawEntryReader}, - Error, UnsupportedError, -}; +use crate::decoder::{Decoder, RawEntryReader}; enum LzmaDecoderState { Writing(Box>>), @@ -105,7 +104,7 @@ pub(crate) fn mk_decoder( mut r: RawEntryReader, uncompressed_size: u64, ) -> std::io::Result> { - use byteorder::{LittleEndian, ReadBytesExt}; + // TODO: move into rc-zip // see `appnote.txt` section 5.8 diff --git a/src/reader/sync/entry_reader/mod.rs b/rc-zip-sync/src/entry_reader/mod.rs similarity index 97% rename from src/reader/sync/entry_reader/mod.rs rename to rc-zip-sync/src/entry_reader/mod.rs index ad09c0e..2b0c02d 100644 --- a/src/reader/sync/entry_reader/mod.rs +++ b/rc-zip-sync/src/entry_reader/mod.rs @@ -1,13 +1,3 @@ -use crate::{ - error::*, - format::*, - reader::{ - sync::decoder::{Decoder, StoreDecoder}, - RawEntryReader, - }, - transition, -}; - #[cfg(feature = "deflate")] mod deflate_dec; @@ -25,6 +15,10 @@ mod zstd_dec; use cfg_if::cfg_if; use oval::Buffer; +use rc_zip::{ + DataDescriptorRecord, Error, FormatError, LocalFileHeaderRecord, Method, StoredEntry, + StoredEntryInner, +}; use std::io; use tracing::trace; use winnow::{ @@ -33,11 +27,14 @@ use winnow::{ Parser, Partial, }; +use crate::decoder::{Decoder, RawEntryReader, StoreDecoder}; + struct EntryReadMetrics { uncompressed_size: u64, crc32: 
u32, } +// FIXME: move this state machine to rc-zip #[derive(Default)] enum State { ReadLocalHeader { diff --git a/src/reader/sync/entry_reader/zstd_dec.rs b/rc-zip-sync/src/entry_reader/zstd_dec.rs similarity index 87% rename from src/reader/sync/entry_reader/zstd_dec.rs rename to rc-zip-sync/src/entry_reader/zstd_dec.rs index 712c456..bc0df63 100644 --- a/src/reader/sync/entry_reader/zstd_dec.rs +++ b/rc-zip-sync/src/entry_reader/zstd_dec.rs @@ -2,7 +2,7 @@ use std::io::{BufRead, Read}; use zstd::stream::Decoder as ZstdDecoder; -use crate::reader::{sync::decoder::Decoder, RawEntryReader}; +use crate::decoder::{Decoder, RawEntryReader}; impl Decoder for ZstdDecoder<'static, R> where diff --git a/rc-zip-sync/src/lib.rs b/rc-zip-sync/src/lib.rs new file mode 100644 index 0000000..f5996fb --- /dev/null +++ b/rc-zip-sync/src/lib.rs @@ -0,0 +1,17 @@ +macro_rules! transition { + ($state: expr => ($pattern: pat) $body: expr) => { + $state = if let $pattern = std::mem::take(&mut $state) { + $body + } else { + unreachable!() + }; + }; +} + +mod decoder; +mod entry_reader; +mod read_zip; + +// re-exports +pub use entry_reader::EntryReader; +pub use read_zip::{HasCursor, ReadZip, ReadZipWithSize, SyncArchive, SyncStoredEntry}; diff --git a/src/reader/sync/read_zip.rs b/rc-zip-sync/src/read_zip.rs similarity index 93% rename from src/reader/sync/read_zip.rs rename to rc-zip-sync/src/read_zip.rs index 01db41e..f99c23e 100644 --- a/src/reader/sync/read_zip.rs +++ b/rc-zip-sync/src/read_zip.rs @@ -1,8 +1,9 @@ -use crate::{ - error::Error, - format::Archive, - reader::{sync::EntryReader, ArchiveReader, ArchiveReaderResult}, +use rc_zip::{ + reader::{ArchiveReader, ArchiveReaderResult}, + Archive, Error, StoredEntry, }; + +use crate::EntryReader; use std::{io::Read, ops::Deref}; /// A trait for reading something as a zip archive (blocking I/O model) @@ -42,11 +43,12 @@ where let mut ar = ArchiveReader::new(size); loop { if let Some(offset) = ar.wants_read() { - match ar.read(&mut 
self.cursor_at(offset)) { + match self.cursor_at(offset).read(ar.space()) { Ok(read_bytes) => { if read_bytes == 0 { return Err(Error::IO(std::io::ErrorKind::UnexpectedEof.into())); } + ar.fill(read_bytes); } Err(err) => return Err(Error::IO(err)), } @@ -115,8 +117,8 @@ where /// Attempts to look up an entry by name. This is usually a bad idea, /// as names aren't necessarily normalized in zip archives. pub fn by_name>(&self, name: N) -> Option> { - self.entries - .iter() + self.archive + .entries() .find(|&x| x.name() == name.as_ref()) .map(|entry| SyncStoredEntry { file: self.file, @@ -127,11 +129,11 @@ where pub struct SyncStoredEntry<'a, F> { file: &'a F, - entry: &'a crate::StoredEntry, + entry: &'a StoredEntry, } impl Deref for SyncStoredEntry<'_, F> { - type Target = crate::StoredEntry; + type Target = StoredEntry; fn deref(&self) -> &Self::Target { self.entry diff --git a/tests/integration_tests.rs b/rc-zip-sync/tests/integration_tests.rs similarity index 73% rename from tests/integration_tests.rs rename to rc-zip-sync/tests/integration_tests.rs index 9b2d5f1..8b98bff 100644 --- a/tests/integration_tests.rs +++ b/rc-zip-sync/tests/integration_tests.rs @@ -2,16 +2,10 @@ use chrono::{ offset::{FixedOffset, Utc}, DateTime, TimeZone, Timelike, }; -use rc_zip::{ - prelude::*, - reader::sync::{HasCursor, SyncArchive, SyncStoredEntry}, - Archive, Encoding, -}; - -#[cfg(feature = "tokio")] -use rc_zip::reader::tokio::{AsyncArchive, AsyncReadZip, AsyncStoredEntry, HasAsyncCursor}; +use rc_zip::{Archive, Encoding}; +use rc_zip_sync::{HasCursor, ReadZip, SyncArchive, SyncStoredEntry}; -use std::{fs::File, path::PathBuf}; +use std::{cmp, fs::File, path::PathBuf}; enum ZipSource { File(&'static str), @@ -77,49 +71,6 @@ impl ZipTest { f.check(&archive); } } - - #[cfg(feature = "tokio")] - async fn check_async( - &self, - archive: Result, rc_zip::Error>, - ) { - let case_bytes = self.bytes(); - - if let Some(expected) = &self.error { - let actual = match archive { - 
Err(e) => e, - Ok(_) => panic!("should have failed"), - }; - let expected = format!("{:#?}", expected); - let actual = format!("{:#?}", actual); - assert_eq!(expected, actual); - return; - } - let archive = archive.unwrap(); - - assert_eq!(case_bytes.len() as u64, archive.size()); - - if let Some(expected) = self.comment { - assert_eq!(expected, archive.comment().expect("should have comment")) - } - - if let Some(exp_encoding) = self.expected_encoding { - println!("{}: should be {}", self.name(), exp_encoding); - assert_eq!(archive.encoding(), exp_encoding); - } - - assert_eq!( - self.files.len(), - archive.entries().count(), - "{} should have {} entries files", - self.name(), - self.files.len() - ); - - for f in &self.files { - f.check_async(&archive).await; - } - } } struct ZipTestFile { @@ -187,65 +138,6 @@ impl ZipTestFile { } } } -#[cfg(feature = "tokio")] -impl ZipTestFile { - async fn check_async(&self, archive: &AsyncArchive<'_, F>) { - let entry = archive - .by_name(self.name) - .unwrap_or_else(|| panic!("entry {} should exist", self.name)); - - let archive_inner: &Archive = archive; - let entry_inner = archive_inner.by_name(self.name).unwrap(); - assert_eq!(entry.name(), entry_inner.name()); - - self.check_against_async(entry).await; - } - - async fn check_against_async(&self, entry: AsyncStoredEntry<'_, F>) { - if let Some(expected) = self.modified { - assert_eq!( - expected, - entry.modified(), - "entry {} should have modified = {:?}", - entry.name(), - expected - ) - } - - if let Some(mode) = self.mode { - assert_eq!(entry.mode.0 & 0o777, mode); - } - - // I have honestly yet to see a zip file _entry_ with a comment. 
- assert!(entry.comment().is_none()); - - match entry.contents() { - rc_zip::EntryContents::File => { - let actual_bytes = entry.bytes().await.unwrap(); - - match &self.content { - FileContent::Unchecked => { - // ah well - } - FileContent::Bytes(expected_bytes) => { - // first check length - assert_eq!(actual_bytes.len(), expected_bytes.len()); - assert_eq!(&actual_bytes[..], &expected_bytes[..]) - } - FileContent::File(file_path) => { - let expected_bytes = std::fs::read(zips_dir().join(file_path)).unwrap(); - // first check length - assert_eq!(actual_bytes.len(), expected_bytes.len()); - assert_eq!(&actual_bytes[..], &expected_bytes[..]) - } - } - } - rc_zip::EntryContents::Symlink | rc_zip::EntryContents::Directory => { - assert!(matches!(self.content, FileContent::Unchecked)); - } - } - } -} enum FileContent { Unchecked, @@ -283,8 +175,9 @@ impl ZipTest { fn zips_dir() -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join("tests") - .join("data") + .parent() + .unwrap() + .join("testdata") } fn time_zone(hours: i32) -> FixedOffset { @@ -451,15 +344,6 @@ fn real_world_files() { } } -#[cfg(feature = "tokio")] -#[test_log::test(tokio::test)] -async fn real_world_files_async() { - for case in test_cases() { - tracing::trace!("============ testing {}", case.name()); - case.check_async(case.bytes().read_zip_async().await).await; - } -} - #[test_log::test] fn state_machine() { use rc_zip::reader::{ArchiveReader, ArchiveReaderResult}; @@ -473,20 +357,24 @@ fn state_machine() { if let Some(offset) = zar.wants_read() { let increment = 128usize; let offset = offset as usize; - let mut slice = if offset + increment > bs.len() { + let slice = if offset + increment > bs.len() { &bs[offset..] 
} else { &bs[offset..offset + increment] }; - match zar.read(&mut slice) { - Ok(0) => panic!("EOF!"), - Ok(read_bytes) => { + let len = cmp::min(slice.len(), zar.space().len()); + println!( + "slice len: {}, zar space len: {}", + slice.len(), + zar.space().len() + ); + zar.space()[..len].copy_from_slice(&slice[..len]); + match len { + 0 => panic!("EOF!"), + read_bytes => { println!("at {}, zar read {} bytes", offset, read_bytes); - } - Err(err) => { - println!("at {}, zar encountered an error:", offset); - panic!("{}", err) + zar.fill(read_bytes); } } } diff --git a/rc-zip-tokio/Cargo.toml b/rc-zip-tokio/Cargo.toml new file mode 100644 index 0000000..99e8df9 --- /dev/null +++ b/rc-zip-tokio/Cargo.toml @@ -0,0 +1,38 @@ +[package] +name = "rc-zip-tokio" +version = "2.0.1" +description = "Asynchronous zip reading on top of rc-zip (for tokio I/O traits)" +repository = "https://github.com/fasterthanlime/rc-zip" +license = "Apache-2.0 or MIT" +authors = ["Amos Wenger "] +edition = "2021" +readme = "README.md" + +keywords = ["zip", "unzip"] +categories = ["compression"] + +[lib] +name = "rc_zip_tokio" +path = "src/lib.rs" + +[dependencies] +rc-zip = { version = "2.0.1", path = "../rc-zip" } +positioned-io = { version = "0.3.3" } +tokio = { version = "1.35.1", features = ["fs", "io-util", "rt-multi-thread"] } +futures = { version = "0.3.30" } +pin-project-lite = { version = "0.2.13" } +async-compression = { version = "0.4.6", features = ["tokio"] } +oval = "2.0.0" +crc32fast = "1.3.2" +tracing = "0.1.40" +cfg-if = "1.0.0" +winnow = "0.5.36" + +[features] +default = ["deflate"] +deflate = ["async-compression/deflate"] +deflate64 = ["async-compression/deflate64"] +lzma = ["async-compression/lzma"] +bzip2 = ["async-compression/bzip2"] +zstd = ["async-compression/zstd"] + diff --git a/src/reader/tokio/decoder.rs b/rc-zip-tokio/src/decoder.rs similarity index 80% rename from src/reader/tokio/decoder.rs rename to rc-zip-tokio/src/decoder.rs index dc5994c..0c6c71f 100644 --- 
a/src/reader/tokio/decoder.rs +++ b/rc-zip-tokio/src/decoder.rs @@ -3,7 +3,30 @@ use std::{cmp, io, pin::Pin, task}; use oval::Buffer; use tokio::io::{AsyncBufRead, AsyncRead}; -use crate::reader::RawEntryReader; +/// Only allows reading a fixed number of bytes from a [oval::Buffer], +/// used for reading the raw (compressed) data for a single zip file entry. +/// It also allows moving out the inner buffer afterwards. +pub(crate) struct RawEntryReader { + remaining: u64, + inner: Buffer, +} + +impl RawEntryReader { + pub(crate) fn new(inner: Buffer, entry_size: u64) -> Self { + Self { + inner, + remaining: entry_size, + } + } + + pub(crate) fn into_inner(self) -> Buffer { + self.inner + } + + pub(crate) fn get_mut(&mut self) -> &mut Buffer { + &mut self.inner + } +} pub(crate) trait AsyncDecoder: AsyncRead where diff --git a/src/reader/tokio/entry_reader/deflate_dec.rs b/rc-zip-tokio/src/entry_reader/deflate_dec.rs similarity index 86% rename from src/reader/tokio/entry_reader/deflate_dec.rs rename to rc-zip-tokio/src/entry_reader/deflate_dec.rs index f89c773..e3034e4 100644 --- a/src/reader/tokio/entry_reader/deflate_dec.rs +++ b/rc-zip-tokio/src/entry_reader/deflate_dec.rs @@ -1,7 +1,7 @@ use async_compression::tokio::bufread::DeflateDecoder; use tokio::io::AsyncBufRead; -use crate::reader::{tokio::decoder::AsyncDecoder, RawEntryReader}; +use crate::decoder::{AsyncDecoder, RawEntryReader}; impl AsyncDecoder for DeflateDecoder where diff --git a/src/reader/tokio/entry_reader/mod.rs b/rc-zip-tokio/src/entry_reader/mod.rs similarity index 98% rename from src/reader/tokio/entry_reader/mod.rs rename to rc-zip-tokio/src/entry_reader/mod.rs index ff9404d..2127c82 100644 --- a/src/reader/tokio/entry_reader/mod.rs +++ b/rc-zip-tokio/src/entry_reader/mod.rs @@ -1,18 +1,12 @@ -use crate::{ - error::*, - format::*, - reader::{ - tokio::decoder::{AsyncDecoder, StoreAsyncDecoder}, - RawEntryReader, - }, - transition_async, -}; - #[cfg(feature = "deflate")] mod deflate_dec; use 
cfg_if::cfg_if; use oval::Buffer; +use rc_zip::{ + DataDescriptorRecord, Error, FormatError, LocalFileHeaderRecord, Method, StoredEntry, + StoredEntryInner, +}; use std::{io, pin::Pin, task}; use tokio::io::AsyncRead; use tracing::trace; @@ -22,6 +16,8 @@ use winnow::{ Parser, Partial, }; +use crate::decoder::{AsyncDecoder, RawEntryReader, StoreAsyncDecoder}; + struct EntryReadMetrics { uncompressed_size: u64, crc32: u32, diff --git a/src/reader/macros.rs b/rc-zip-tokio/src/lib.rs similarity index 50% rename from src/reader/macros.rs rename to rc-zip-tokio/src/lib.rs index 70a39df..36af3f0 100644 --- a/src/reader/macros.rs +++ b/rc-zip-tokio/src/lib.rs @@ -1,15 +1,3 @@ -#[macro_export] -macro_rules! transition { - ($state: expr => ($pattern: pat) $body: expr) => { - $state = if let $pattern = std::mem::take(&mut $state) { - $body - } else { - unreachable!() - }; - }; -} - -#[macro_export] macro_rules! transition_async { ($state: expr => ($pattern: pat) $body: expr) => { *$state.as_mut() = if let $pattern = std::mem::take($state.as_mut().get_mut()) { @@ -19,3 +7,13 @@ macro_rules! 
transition_async { }; }; } + +mod decoder; +mod entry_reader; +mod read_zip; + +// re-exports +pub use entry_reader::AsyncEntryReader; +pub use read_zip::{ + AsyncArchive, AsyncReadZip, AsyncReadZipWithSize, AsyncStoredEntry, HasAsyncCursor, +}; diff --git a/src/reader/tokio/read_zip.rs b/rc-zip-tokio/src/read_zip.rs similarity index 95% rename from src/reader/tokio/read_zip.rs rename to rc-zip-tokio/src/read_zip.rs index fc060c9..c16c7cf 100644 --- a/src/reader/tokio/read_zip.rs +++ b/rc-zip-tokio/src/read_zip.rs @@ -4,11 +4,13 @@ use futures::future::BoxFuture; use positioned_io::{RandomAccessFile, ReadAt}; use tokio::io::{AsyncRead, AsyncReadExt, ReadBuf}; -use crate::{ - reader::{tokio::AsyncEntryReader, ArchiveReader, ArchiveReaderResult}, - Archive, Error, +use rc_zip::{ + reader::{ArchiveReader, ArchiveReaderResult}, + Archive, Error, StoredEntry, }; +use crate::AsyncEntryReader; + /// A trait for reading something as a zip archive (blocking I/O model) /// /// See also [ReadZip]. @@ -51,11 +53,12 @@ where let mut ar = ArchiveReader::new(size); loop { if let Some(offset) = ar.wants_read() { - match ar.read_async(&mut self.cursor_at(offset)).await { + match self.cursor_at(offset).read(ar.space()).await { Ok(read_bytes) => { if read_bytes == 0 { return Err(Error::IO(io::ErrorKind::UnexpectedEof.into())); } + ar.fill(read_bytes); } Err(err) => return Err(Error::IO(err)), } @@ -124,8 +127,8 @@ where /// Attempts to look up an entry by name. This is usually a bad idea, /// as names aren't necessarily normalized in zip archives. 
pub fn by_name>(&self, name: N) -> Option> { - self.entries - .iter() + self.archive + .entries() .find(|&x| x.name() == name.as_ref()) .map(|entry| AsyncStoredEntry { file: self.file, @@ -136,11 +139,11 @@ where pub struct AsyncStoredEntry<'a, F> { file: &'a F, - entry: &'a crate::StoredEntry, + entry: &'a StoredEntry, } impl Deref for AsyncStoredEntry<'_, F> { - type Target = crate::StoredEntry; + type Target = StoredEntry; fn deref(&self) -> &Self::Target { self.entry diff --git a/rc-zip/Cargo.toml b/rc-zip/Cargo.toml new file mode 100644 index 0000000..5822b9f --- /dev/null +++ b/rc-zip/Cargo.toml @@ -0,0 +1,31 @@ +[package] +name = "rc-zip" +version = "2.0.1" +description = "An I/O-agnostic implementation of the zip file format" +repository = "https://github.com/fasterthanlime/rc-zip" +license = "Apache-2.0 OR MIT" +authors = ["Amos Wenger "] +edition = "2021" +readme = "../README.md" + +keywords = ["zip", "unzip"] +categories = ["compression"] + +[lib] +name = "rc_zip" +path = "src/lib.rs" + +[dependencies] +winnow = "0.5.36" +pretty-hex = "0.4.1" +oval = "2.0.0" +chrono = "0.4.33" +encoding_rs = "0.8.33" +crc32fast = "1.3.2" +tracing = "0.1.40" +oem_cp = "2.0.0" +thiserror = "1.0.56" +chardetng = "0.1.17" +num_enum = "0.7.2" +byteorder = "1.5.0" +cfg-if = "1.0.0" diff --git a/src/encoding.rs b/rc-zip/src/encoding.rs similarity index 100% rename from src/encoding.rs rename to rc-zip/src/encoding.rs diff --git a/src/error.rs b/rc-zip/src/error.rs similarity index 96% rename from src/error.rs rename to rc-zip/src/error.rs index dafa1a9..99bdc29 100644 --- a/src/error.rs +++ b/rc-zip/src/error.rs @@ -27,13 +27,11 @@ pub enum Error { } impl Error { - #[allow(unused)] - pub(crate) fn method_not_supported(method: Method) -> Self { + pub fn method_not_supported(method: Method) -> Self { Self::Unsupported(UnsupportedError::MethodNotSupported(method)) } - #[allow(unused)] - pub(crate) fn method_not_enabled(method: Method) -> Self { + pub fn method_not_enabled(method: 
Method) -> Self { Self::Unsupported(UnsupportedError::MethodNotEnabled(method)) } } diff --git a/src/format/archive.rs b/rc-zip/src/format/archive.rs similarity index 100% rename from src/format/archive.rs rename to rc-zip/src/format/archive.rs diff --git a/src/format/date_time.rs b/rc-zip/src/format/date_time.rs similarity index 100% rename from src/format/date_time.rs rename to rc-zip/src/format/date_time.rs diff --git a/src/format/directory_header.rs b/rc-zip/src/format/directory_header.rs similarity index 100% rename from src/format/directory_header.rs rename to rc-zip/src/format/directory_header.rs diff --git a/src/format/eocd.rs b/rc-zip/src/format/eocd.rs similarity index 100% rename from src/format/eocd.rs rename to rc-zip/src/format/eocd.rs diff --git a/src/format/extra_field.rs b/rc-zip/src/format/extra_field.rs similarity index 100% rename from src/format/extra_field.rs rename to rc-zip/src/format/extra_field.rs diff --git a/src/format/local.rs b/rc-zip/src/format/local.rs similarity index 100% rename from src/format/local.rs rename to rc-zip/src/format/local.rs diff --git a/src/format/mod.rs b/rc-zip/src/format/mod.rs similarity index 100% rename from src/format/mod.rs rename to rc-zip/src/format/mod.rs diff --git a/src/format/mode.rs b/rc-zip/src/format/mode.rs similarity index 100% rename from src/format/mode.rs rename to rc-zip/src/format/mode.rs diff --git a/src/format/raw.rs b/rc-zip/src/format/raw.rs similarity index 100% rename from src/format/raw.rs rename to rc-zip/src/format/raw.rs diff --git a/src/format/version.rs b/rc-zip/src/format/version.rs similarity index 100% rename from src/format/version.rs rename to rc-zip/src/format/version.rs diff --git a/src/lib.rs b/rc-zip/src/lib.rs similarity index 91% rename from src/lib.rs rename to rc-zip/src/lib.rs index 9199643..25224b6 100644 --- a/src/lib.rs +++ b/rc-zip/src/lib.rs @@ -18,9 +18,11 @@ //! 
mod encoding; + mod error; +pub use error::*; + mod format; -pub mod prelude; -pub mod reader; +pub use format::*; -pub use self::{error::*, format::*}; +pub mod reader; diff --git a/src/reader/archive_reader.rs b/rc-zip/src/reader.rs similarity index 82% rename from src/reader/archive_reader.rs rename to rc-zip/src/reader.rs index a372210..dc14d23 100644 --- a/src/reader/archive_reader.rs +++ b/rc-zip/src/reader.rs @@ -1,6 +1,19 @@ -use crate::{encoding::Encoding, error::*, format::*, reader::buffer::*, transition}; +use crate::{ + encoding::Encoding, Archive, DirectoryHeader, EndOfCentralDirectory, + EndOfCentralDirectory64Locator, EndOfCentralDirectory64Record, EndOfCentralDirectoryRecord, + Error, FormatError, Located, StoredEntry, +}; + +macro_rules! transition { + ($state: expr => ($pattern: pat) $body: expr) => { + $state = if let $pattern = std::mem::take(&mut $state) { + $body + } else { + unreachable!() + }; + }; +} -use std::io::Read; use tracing::trace; use winnow::{ error::ErrMode, @@ -14,7 +27,7 @@ use winnow::{ pub struct ArchiveReader { // Size of the entire zip file size: u64, - state: ArchiveReaderState, + state: State, } pub enum ArchiveReaderResult { @@ -26,10 +39,8 @@ pub enum ArchiveReaderResult { Done(Archive), } -enum ArchiveReaderState { - /// Used while transitioning because ownership rules are tough. - Transitioning, - +#[derive(Default)] +enum State { /// Finding and reading the end of central directory record ReadEocd { buffer: Buffer, haystack_size: u64 }, @@ -55,17 +66,14 @@ enum ArchiveReaderState { /// Done! Done, -} -impl Default for ArchiveReaderState { - fn default() -> Self { - Self::Transitioning - } + #[default] + Transitioning, } -impl ArchiveReaderState { - fn buffer_as_mut(&mut self) -> Option<&mut Buffer> { - use ArchiveReaderState as S; +impl State { + fn get_buffer_mut(&mut self) -> Option<&mut Buffer> { + use State as S; match self { S::ReadEocd { ref mut buffer, .. 
} => Some(buffer), S::ReadEocd64Locator { ref mut buffer, .. } => Some(buffer), @@ -74,6 +82,11 @@ impl ArchiveReaderState { _ => None, } } + + fn expect_buffer_mut(&mut self) -> &mut Buffer { + self.get_buffer_mut() + .expect("called expect_buffer_mut() on invalid state") + } } impl ArchiveReader { @@ -96,7 +109,7 @@ impl ArchiveReader { Self { size, - state: ArchiveReaderState::ReadEocd { + state: State::ReadEocd { buffer: Buffer::with_capacity(Self::DEFAULT_BUFFER_SIZE), haystack_size, }, @@ -112,7 +125,7 @@ impl ArchiveReader { /// Returns `None` if the reader does not need data and [process()](ArchiveReader::process()) /// can be called directly. pub fn wants_read(&self) -> Option { - use ArchiveReaderState as S; + use State as S; match self.state { S::ReadEocd { ref buffer, @@ -140,37 +153,22 @@ impl ArchiveReader { } } - /// Reads some data from `rd` into the reader's internal buffer. - /// - /// Any I/O errors will be returned. - /// - /// If successful, this returns the number of bytes read. On success, - /// [process()](ArchiveReader::process()) should be called next. - pub fn read(&mut self, rd: &mut dyn Read) -> Result { - if let Some(buffer) = self.state.buffer_as_mut() { - buffer.read(rd) - } else { - Ok(0) - } + /// returns a mutable slice with all the available space to + /// write to + #[inline] + pub fn space(&mut self) -> &mut [u8] { + self.state.expect_buffer_mut().space() } - /// Reads some data from `rd` (which implements AsyncRead) into the - /// reader's internal buffer. - /// - /// Any I/O errors will be returned. + /// after having written data to the buffer, use this function + /// to indicate how many bytes were written /// - /// If successful, this returns the number of bytes read. On success, - /// [process()](ArchiveReader::process()) should be called next. 
- #[cfg(feature = "tokio")] - pub async fn read_async( - &mut self, - rd: &mut R, - ) -> Result { - if let Some(buffer) = self.state.buffer_as_mut() { - buffer.read_async(rd).await - } else { - Ok(0) - } + /// if there is not enough available space, this function can call + /// `shift()` to move the remaining data to the beginning of the + /// buffer + #[inline] + pub fn fill(&mut self, count: usize) -> usize { + self.state.expect_buffer_mut().fill(count) } /// Process buffered data @@ -185,7 +183,7 @@ impl ArchiveReader { /// method should ever be called again on this reader. pub fn process(&mut self) -> Result { use ArchiveReaderResult as R; - use ArchiveReaderState as S; + use State as S; match self.state { S::ReadEocd { ref mut buffer, @@ -441,3 +439,81 @@ impl ArchiveReader { } } } + +/// A wrapper around [oval::Buffer] that keeps track of how many bytes we've read since +/// initialization or the last reset. +pub(crate) struct Buffer { + pub(crate) buffer: oval::Buffer, + pub(crate) read_bytes: u64, +} + +impl Buffer { + /// creates a new buffer with the specified capacity + pub(crate) fn with_capacity(size: usize) -> Self { + Self { + buffer: oval::Buffer::with_capacity(size), + read_bytes: 0, + } + } + + /// resets the buffer (so that data() returns an empty slice, + /// and space() returns the full capacity), along with the + /// read bytes counter. 
+ pub(crate) fn reset(&mut self) { + self.read_bytes = 0; + self.buffer.reset(); + } + + /// returns the number of read bytes since the last reset + #[inline] + pub(crate) fn read_bytes(&self) -> u64 { + self.read_bytes + } + + /// returns a slice with all the available data + #[inline] + pub(crate) fn data(&self) -> &[u8] { + self.buffer.data() + } + + /// returns how much data can be read from the buffer + #[inline] + pub(crate) fn available_data(&self) -> usize { + self.buffer.available_data() + } + + /// returns a mutable slice with all the available space to + /// write to + #[inline] + pub(crate) fn space(&mut self) -> &mut [u8] { + self.buffer.space() + } + + /// after having written data to the buffer, use this function + /// to indicate how many bytes were written + /// + /// if there is not enough available space, this function can call + /// `shift()` to move the remaining data to the beginning of the + /// buffer + #[inline] + pub(crate) fn fill(&mut self, count: usize) -> usize { + self.buffer.fill(count) + } + + /// advances the position tracker + /// + /// if the position gets past the buffer's half, + /// this will call `shift()` to move the remaining data + /// to the beginning of the buffer + #[inline] + pub(crate) fn consume(&mut self, size: usize) { + self.buffer.consume(size); + self.read_bytes += size as u64; + } + + /// computes an absolute offset, given an offset relative + /// to the current read position + pub(crate) fn read_offset(&self, offset: u64) -> u64 { + self.read_bytes + offset + } +} diff --git a/src/prelude.rs b/src/prelude.rs deleted file mode 100644 index 21b88d1..0000000 --- a/src/prelude.rs +++ /dev/null @@ -1,4 +0,0 @@ -//! 
Prelude for rc-zip - -#[cfg(feature = "sync")] -pub use crate::reader::sync::{ReadZip, ReadZipWithSize}; diff --git a/src/reader/buffer.rs b/src/reader/buffer.rs deleted file mode 100644 index a132ab9..0000000 --- a/src/reader/buffer.rs +++ /dev/null @@ -1,92 +0,0 @@ -use std::io::Read; - -use tracing::trace; - -/// A wrapper around [oval::Buffer] that keeps track of how many bytes we've read since -/// initialization or the last reset. -pub(crate) struct Buffer { - pub(crate) buffer: oval::Buffer, - pub(crate) read_bytes: u64, -} - -impl Buffer { - /// creates a new buffer with the specified capacity - pub(crate) fn with_capacity(size: usize) -> Self { - Self { - buffer: oval::Buffer::with_capacity(size), - read_bytes: 0, - } - } - - /// resets the buffer (so that data() returns an empty slice, - /// and space() returns the full capacity), along with th e - /// read bytes counter. - pub(crate) fn reset(&mut self) { - self.read_bytes = 0; - self.buffer.reset(); - } - - /// returns the number of read bytes since the last reset - pub(crate) fn read_bytes(&self) -> u64 { - self.read_bytes - } - - /// returns a slice with all the available data - pub(crate) fn data(&self) -> &[u8] { - self.buffer.data() - } - - /// returns how much data can be read from the buffer - pub(crate) fn available_data(&self) -> usize { - self.buffer.available_data() - } - - /// advances the position tracker - /// - /// if the position gets past the buffer's half, - /// this will call `shift()` to move the remaining data - /// to the beginning of the buffer - pub(crate) fn consume(&mut self, count: usize) -> usize { - self.buffer.consume(count) - } - - /// fill that buffer from the given Read - pub(crate) fn read(&mut self, rd: &mut dyn Read) -> Result { - if self.buffer.available_space() == 0 { - trace!("uh oh, buffer has no available space!") - } - - match rd.read(self.buffer.space()) { - Ok(written) => { - self.read_bytes += written as u64; - self.buffer.fill(written); - Ok(written) - } - 
Err(e) => Err(e), - } - } - - /// fill that buffer from the given AsyncRead - #[cfg(feature = "tokio")] - pub(crate) async fn read_async( - &mut self, - rd: &mut (impl tokio::io::AsyncRead + Unpin), - ) -> Result { - if self.buffer.available_space() == 0 { - trace!("uh oh, buffer has no available space!") - } - - match tokio::io::AsyncReadExt::read(rd, self.buffer.space()).await { - Ok(written) => { - self.read_bytes += written as u64; - self.buffer.fill(written); - Ok(written) - } - Err(e) => Err(e), - } - } - - pub(crate) fn read_offset(&self, offset: u64) -> u64 { - self.read_bytes + offset - } -} diff --git a/src/reader/mod.rs b/src/reader/mod.rs deleted file mode 100644 index 4bc5299..0000000 --- a/src/reader/mod.rs +++ /dev/null @@ -1,38 +0,0 @@ -mod buffer; -mod macros; - -mod archive_reader; -use oval::Buffer; - -pub use self::archive_reader::{ArchiveReader, ArchiveReaderResult}; - -#[cfg(feature = "sync")] -pub mod sync; - -#[cfg(feature = "tokio")] -pub mod tokio; - -/// Only allows reading a fixed number of bytes from a [oval::Buffer], -/// used for reading the raw (compressed) data for a single zip file entry. -/// It also allows moving out the inner buffer afterwards. 
-pub(crate) struct RawEntryReader { - remaining: u64, - inner: Buffer, -} - -impl RawEntryReader { - pub(crate) fn new(inner: Buffer, entry_size: u64) -> Self { - Self { - inner, - remaining: entry_size, - } - } - - pub(crate) fn into_inner(self) -> Buffer { - self.inner - } - - pub(crate) fn get_mut(&mut self) -> &mut Buffer { - &mut self.inner - } -} diff --git a/src/reader/sync/mod.rs b/src/reader/sync/mod.rs deleted file mode 100644 index e4d063f..0000000 --- a/src/reader/sync/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -mod decoder; -mod entry_reader; -mod read_zip; - -// re-exports -pub use entry_reader::EntryReader; -pub use read_zip::{HasCursor, ReadZip, ReadZipWithSize, SyncArchive, SyncStoredEntry}; diff --git a/src/reader/tokio/mod.rs b/src/reader/tokio/mod.rs deleted file mode 100644 index 75b4ffb..0000000 --- a/src/reader/tokio/mod.rs +++ /dev/null @@ -1,9 +0,0 @@ -mod decoder; -mod entry_reader; -mod read_zip; - -// re-exports -pub use entry_reader::AsyncEntryReader; -pub use read_zip::{ - AsyncArchive, AsyncReadZip, AsyncReadZipWithSize, AsyncStoredEntry, HasAsyncCursor, -}; diff --git a/tests/data/cp-437.zip b/testdata/cp-437.zip similarity index 100% rename from tests/data/cp-437.zip rename to testdata/cp-437.zip diff --git a/tests/data/crc32-not-streamed.zip b/testdata/crc32-not-streamed.zip similarity index 100% rename from tests/data/crc32-not-streamed.zip rename to testdata/crc32-not-streamed.zip diff --git a/tests/data/dd.zip b/testdata/dd.zip similarity index 100% rename from tests/data/dd.zip rename to testdata/dd.zip diff --git a/tests/data/found-me-bzip2.zip b/testdata/found-me-bzip2.zip similarity index 100% rename from tests/data/found-me-bzip2.zip rename to testdata/found-me-bzip2.zip diff --git a/tests/data/found-me-deflate64.zip b/testdata/found-me-deflate64.zip similarity index 100% rename from tests/data/found-me-deflate64.zip rename to testdata/found-me-deflate64.zip diff --git a/tests/data/found-me-lzma.zip b/testdata/found-me-lzma.zip 
similarity index 100% rename from tests/data/found-me-lzma.zip rename to testdata/found-me-lzma.zip diff --git a/tests/data/found-me-zstd.zip b/testdata/found-me-zstd.zip similarity index 100% rename from tests/data/found-me-zstd.zip rename to testdata/found-me-zstd.zip diff --git a/tests/data/found-me.txt b/testdata/found-me.txt similarity index 100% rename from tests/data/found-me.txt rename to testdata/found-me.txt diff --git a/tests/data/go-no-datadesc-sig.zip b/testdata/go-no-datadesc-sig.zip similarity index 100% rename from tests/data/go-no-datadesc-sig.zip rename to testdata/go-no-datadesc-sig.zip diff --git a/tests/data/go-with-datadesc-sig.zip b/testdata/go-with-datadesc-sig.zip similarity index 100% rename from tests/data/go-with-datadesc-sig.zip rename to testdata/go-with-datadesc-sig.zip diff --git a/tests/data/gophercolor16x16.png b/testdata/gophercolor16x16.png similarity index 100% rename from tests/data/gophercolor16x16.png rename to testdata/gophercolor16x16.png diff --git a/tests/data/readme.notzip b/testdata/readme.notzip similarity index 100% rename from tests/data/readme.notzip rename to testdata/readme.notzip diff --git a/tests/data/readme.trailingzip b/testdata/readme.trailingzip similarity index 100% rename from tests/data/readme.trailingzip rename to testdata/readme.trailingzip diff --git a/tests/data/readme.zip b/testdata/readme.zip similarity index 100% rename from tests/data/readme.zip rename to testdata/readme.zip diff --git a/tests/data/shift-jis.zip b/testdata/shift-jis.zip similarity index 100% rename from tests/data/shift-jis.zip rename to testdata/shift-jis.zip diff --git a/tests/data/symlink.zip b/testdata/symlink.zip similarity index 100% rename from tests/data/symlink.zip rename to testdata/symlink.zip diff --git a/tests/data/test-trailing-junk.zip b/testdata/test-trailing-junk.zip similarity index 100% rename from tests/data/test-trailing-junk.zip rename to testdata/test-trailing-junk.zip diff --git a/tests/data/test.zip 
b/testdata/test.zip similarity index 100% rename from tests/data/test.zip rename to testdata/test.zip diff --git a/tests/data/time-22738.zip b/testdata/time-22738.zip similarity index 100% rename from tests/data/time-22738.zip rename to testdata/time-22738.zip diff --git a/tests/data/time-7zip.zip b/testdata/time-7zip.zip similarity index 100% rename from tests/data/time-7zip.zip rename to testdata/time-7zip.zip diff --git a/tests/data/time-go.zip b/testdata/time-go.zip similarity index 100% rename from tests/data/time-go.zip rename to testdata/time-go.zip diff --git a/tests/data/time-infozip.zip b/testdata/time-infozip.zip similarity index 100% rename from tests/data/time-infozip.zip rename to testdata/time-infozip.zip diff --git a/tests/data/time-osx.zip b/testdata/time-osx.zip similarity index 100% rename from tests/data/time-osx.zip rename to testdata/time-osx.zip diff --git a/tests/data/time-win7.zip b/testdata/time-win7.zip similarity index 100% rename from tests/data/time-win7.zip rename to testdata/time-win7.zip diff --git a/tests/data/time-winrar.zip b/testdata/time-winrar.zip similarity index 100% rename from tests/data/time-winrar.zip rename to testdata/time-winrar.zip diff --git a/tests/data/time-winzip.zip b/testdata/time-winzip.zip similarity index 100% rename from tests/data/time-winzip.zip rename to testdata/time-winzip.zip diff --git a/tests/data/unix.zip b/testdata/unix.zip similarity index 100% rename from tests/data/unix.zip rename to testdata/unix.zip diff --git a/tests/data/utf8-7zip.zip b/testdata/utf8-7zip.zip similarity index 100% rename from tests/data/utf8-7zip.zip rename to testdata/utf8-7zip.zip diff --git a/tests/data/utf8-infozip.zip b/testdata/utf8-infozip.zip similarity index 100% rename from tests/data/utf8-infozip.zip rename to testdata/utf8-infozip.zip diff --git a/tests/data/utf8-osx.zip b/testdata/utf8-osx.zip similarity index 100% rename from tests/data/utf8-osx.zip rename to testdata/utf8-osx.zip diff --git 
a/tests/data/utf8-winrar.zip b/testdata/utf8-winrar.zip similarity index 100% rename from tests/data/utf8-winrar.zip rename to testdata/utf8-winrar.zip diff --git a/tests/data/utf8-winzip.zip b/testdata/utf8-winzip.zip similarity index 100% rename from tests/data/utf8-winzip.zip rename to testdata/utf8-winzip.zip diff --git a/tests/data/winxp.zip b/testdata/winxp.zip similarity index 100% rename from tests/data/winxp.zip rename to testdata/winxp.zip diff --git a/tests/data/zip64-2.zip b/testdata/zip64-2.zip similarity index 100% rename from tests/data/zip64-2.zip rename to testdata/zip64-2.zip diff --git a/tests/data/zip64.zip b/testdata/zip64.zip similarity index 100% rename from tests/data/zip64.zip rename to testdata/zip64.zip From 635dcd8454ecd88f718bc190806d397323e0621c Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 00:05:50 +0100 Subject: [PATCH 14/49] All tests pass --- rc-zip-sync/src/read_zip.rs | 10 ++++++++-- rc-zip/src/reader.rs | 36 +++++++++++++++++++++++++++++++++--- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/rc-zip-sync/src/read_zip.rs b/rc-zip-sync/src/read_zip.rs index f99c23e..5e0522c 100644 --- a/rc-zip-sync/src/read_zip.rs +++ b/rc-zip-sync/src/read_zip.rs @@ -40,11 +40,14 @@ where type File = F; fn read_zip_with_size(&self, size: u64) -> Result, Error> { + tracing::trace!(%size, "read_zip_with_size"); let mut ar = ArchiveReader::new(size); loop { if let Some(offset) = ar.wants_read() { + tracing::trace!(%offset, "read_zip_with_size: wants_read, space len = {}", ar.space().len()); match self.cursor_at(offset).read(ar.space()) { Ok(read_bytes) => { + tracing::trace!(%read_bytes, "read_zip_with_size: read"); if read_bytes == 0 { return Err(Error::IO(std::io::ErrorKind::UnexpectedEof.into())); } @@ -56,12 +59,15 @@ where match ar.process()? 
{ ArchiveReaderResult::Done(archive) => { + tracing::trace!("read_zip_with_size: done"); return Ok(SyncArchive { file: self, archive, - }) + }); + } + ArchiveReaderResult::Continue => { + tracing::trace!("read_zip_with_size: continue"); } - ArchiveReaderResult::Continue => {} } } } diff --git a/rc-zip/src/reader.rs b/rc-zip/src/reader.rs index dc14d23..38fe4af 100644 --- a/rc-zip/src/reader.rs +++ b/rc-zip/src/reader.rs @@ -157,7 +157,15 @@ impl ArchiveReader { /// write to #[inline] pub fn space(&mut self) -> &mut [u8] { - self.state.expect_buffer_mut().space() + let buf = self.state.expect_buffer_mut(); + trace!( + available_space = buf.available_space(), + "space() | available_space" + ); + if buf.available_space() == 0 { + buf.shift(); + } + buf.space() } /// after having written data to the buffer, use this function @@ -190,6 +198,11 @@ impl ArchiveReader { haystack_size, } => { if buffer.read_bytes() < haystack_size { + trace!( + read_bytes = buffer.read_bytes(), + haystack_size, + "ReadEocd | need more data" + ); return Ok(R::Continue); } @@ -440,6 +453,8 @@ impl ArchiveReader { } } +/// FIXME: get rid of this wrapper entirely, we can just use `.available_data` from oval::Buffer ? + /// A wrapper around [oval::Buffer] that keeps track of how many bytes we've read since /// initialization or the last reset. 
pub(crate) struct Buffer { @@ -482,6 +497,12 @@ impl Buffer { self.buffer.available_data() } + /// returns how much free space is available to write to + #[inline] + pub fn available_space(&self) -> usize { + self.buffer.available_space() + } + /// returns a mutable slice with all the available space to /// write to #[inline] @@ -489,6 +510,14 @@ impl Buffer { self.buffer.space() } + /// moves the data to the beginning of the buffer + /// + /// if the position was more than 0, it is now 0 + #[inline] + pub fn shift(&mut self) { + self.buffer.shift() + } + /// after having written data to the buffer, use this function + /// to indicate how many bytes were written + /// + /// if there is not enough available space, this function can call + /// `shift()` to move the remaining data to the beginning of the + /// buffer @@ -497,7 +526,9 @@ impl Buffer { #[inline] pub(crate) fn fill(&mut self, count: usize) -> usize { - self.buffer.fill(count) + let n = self.buffer.fill(count); + self.read_bytes += n as u64; + n } /// advances the position tracker @@ -508,7 +539,6 @@ impl Buffer { #[inline] pub(crate) fn consume(&mut self, size: usize) { self.buffer.consume(size); - self.read_bytes += size as u64; } /// computes an absolute offset, given an offset relative From 7a822253905dede6ba3a2571ec20281718ad263d Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 00:06:41 +0100 Subject: [PATCH 15/49] Can't get rid of the Buffer wrapper, because relative reads --- rc-zip/src/reader.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/rc-zip/src/reader.rs b/rc-zip/src/reader.rs index 38fe4af..9fb822d 100644 --- a/rc-zip/src/reader.rs +++ b/rc-zip/src/reader.rs @@ -453,8 +453,6 @@ impl ArchiveReader { } } -/// FIXME: get rid of this wrapper entirely, we can just use `.available_data` from oval::Buffer ? - /// A wrapper around [oval::Buffer] that keeps track of how many bytes we've read since /// initialization or the last reset. 
pub(crate) struct Buffer { From c6d0980d43ef3b8b15e91081d8b86d9da9bb33ce Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 11:17:36 +0100 Subject: [PATCH 16/49] Move LZMA properties parsing into rc-zip, remove unused dependencies --- Cargo.lock | 3 - Cargo.toml | 2 +- Justfile | 4 ++ rc-zip-sync/Cargo.toml | 1 - rc-zip-sync/src/entry_reader/lzma_dec.rs | 32 +--------- rc-zip-sync/src/entry_reader/mod.rs | 2 +- rc-zip/Cargo.toml | 2 - rc-zip/src/format/local.rs | 80 ++++++++++++++++++++++-- 8 files changed, 83 insertions(+), 43 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 15987e9..763131c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -824,11 +824,9 @@ checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" name = "rc-zip" version = "2.0.1" dependencies = [ - "byteorder", "cfg-if", "chardetng", "chrono", - "crc32fast", "encoding_rs", "num_enum", "oem_cp", @@ -843,7 +841,6 @@ dependencies = [ name = "rc-zip-sync" version = "2.0.1" dependencies = [ - "byteorder", "bzip2", "cfg-if", "chrono", diff --git a/Cargo.toml b/Cargo.toml index c77b87a..4d79c02 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,4 +4,4 @@ members = [ "rc-zip", "rc-zip-sync", "rc-zip-tokio", -] \ No newline at end of file +] diff --git a/Justfile b/Justfile index 03efa18..75f16b4 100644 --- a/Justfile +++ b/Justfile @@ -10,6 +10,10 @@ check: test *args: cargo nextest run {{args}} --all-features +# Report unused dependencies: +udeps: + RUSTC_BOOTSTRAP=1 cargo udeps + # Run all tests with nextest and cargo-llvm-cov ci-test: #!/bin/bash -eux diff --git a/rc-zip-sync/Cargo.toml b/rc-zip-sync/Cargo.toml index ec7fe86..2ef1302 100644 --- a/rc-zip-sync/Cargo.toml +++ b/rc-zip-sync/Cargo.toml @@ -30,7 +30,6 @@ zstd = { version = "0.13.0", optional = true } oval = "2.0.0" crc32fast = "1.3.2" tracing = "0.1.40" -byteorder = "1.5.0" cfg-if = "1.0.0" winnow = "0.5.36" diff --git a/rc-zip-sync/src/entry_reader/lzma_dec.rs b/rc-zip-sync/src/entry_reader/lzma_dec.rs index 
425e0ed..d42f4aa 100644 --- a/rc-zip-sync/src/entry_reader/lzma_dec.rs +++ b/rc-zip-sync/src/entry_reader/lzma_dec.rs @@ -1,6 +1,4 @@ -use byteorder::{LittleEndian, ReadBytesExt}; use lzma_rs::decompress::Stream; -use rc_zip::{Error, UnsupportedError}; use std::io::{Read, Write}; use crate::decoder::{Decoder, RawEntryReader}; @@ -101,37 +99,9 @@ where } pub(crate) fn mk_decoder( - mut r: RawEntryReader, + r: RawEntryReader, uncompressed_size: u64, ) -> std::io::Result> { - // TODO: move into rc-zip - - // see `appnote.txt` section 5.8 - - // major & minor version are each 1 byte - let major = r.read_u8()?; - let minor = r.read_u8()?; - - // properties size is a 2-byte little-endian integer - let properties_size = r.read_u16::()?; - - if (major, minor) != (2, 0) { - return Err( - Error::Unsupported(UnsupportedError::LzmaVersionUnsupported { minor, major }).into(), - ); - } - - const LZMA_PROPERTIES_SIZE: u16 = 5; - if properties_size != LZMA_PROPERTIES_SIZE { - return Err( - Error::Unsupported(UnsupportedError::LzmaPropertiesHeaderWrongSize { - expected: 5, - actual: properties_size, - }) - .into(), - ); - } - let memlimit = 128 * 1024 * 1024; let opts = lzma_rs::decompress::Options { unpacked_size: lzma_rs::decompress::UnpackedSize::UseProvided(Some(uncompressed_size)), diff --git a/rc-zip-sync/src/entry_reader/mod.rs b/rc-zip-sync/src/entry_reader/mod.rs index 2b0c02d..94a663f 100644 --- a/rc-zip-sync/src/entry_reader/mod.rs +++ b/rc-zip-sync/src/entry_reader/mod.rs @@ -282,7 +282,7 @@ where Method::Lzma => { cfg_if! { if #[cfg(feature = "lzma")] { - Box::new(lzma_dec::mk_decoder(raw_r,self.inner.uncompressed_size)?) + Box::new(lzma_dec::mk_decoder(raw_r, self.inner.uncompressed_size)?) 
} else { return Err(Error::method_not_enabled(self.method)); } diff --git a/rc-zip/Cargo.toml b/rc-zip/Cargo.toml index 5822b9f..65284b0 100644 --- a/rc-zip/Cargo.toml +++ b/rc-zip/Cargo.toml @@ -21,11 +21,9 @@ pretty-hex = "0.4.1" oval = "2.0.0" chrono = "0.4.33" encoding_rs = "0.8.33" -crc32fast = "1.3.2" tracing = "0.1.40" oem_cp = "2.0.0" thiserror = "1.0.56" chardetng = "0.1.17" num_enum = "0.7.2" -byteorder = "1.5.0" cfg-if = "1.0.0" diff --git a/rc-zip/src/format/local.rs b/rc-zip/src/format/local.rs index 553b3e1..2c43c43 100644 --- a/rc-zip/src/format/local.rs +++ b/rc-zip/src/format/local.rs @@ -1,7 +1,8 @@ -use crate::format::*; +use crate::{format::*, Error, UnsupportedError}; use winnow::{ - binary::{le_u16, le_u32, le_u64}, + binary::{le_u16, le_u32, le_u64, le_u8}, combinator::opt, + error::{ContextError, ErrMode, ErrorKind, FromExternalError}, seq, token::tag, PResult, Parser, Partial, @@ -15,7 +16,7 @@ pub struct LocalFileHeaderRecord { /// general purpose bit flag pub flags: u16, /// compression method - pub method: u16, + pub method: Method, /// last mod file datetime pub modified: MsdosTimestamp, /// crc-32 @@ -28,6 +29,16 @@ pub struct LocalFileHeaderRecord { pub name: ZipString, // extra field pub extra: ZipBytes, + + // method-specific fields + pub method_specific: MethodSpecific, +} + +#[derive(Debug)] +/// Method-specific properties following the local file header +pub enum MethodSpecific { + None, + Lzma(LzmaProperties), } impl LocalFileHeaderRecord { @@ -38,7 +49,7 @@ impl LocalFileHeaderRecord { let reader_version = Version::parser.parse_next(i)?; let flags = le_u16.parse_next(i)?; - let method = le_u16.parse_next(i)?; + let method = le_u16.parse_next(i).map(Method::from)?; let modified = MsdosTimestamp::parser.parse_next(i)?; let crc32 = le_u32.parse_next(i)?; let compressed_size = le_u32.parse_next(i)?; @@ -50,6 +61,21 @@ impl LocalFileHeaderRecord { let name = ZipString::parser(name_len).parse_next(i)?; let extra = 
ZipBytes::parser(extra_len).parse_next(i)?; + let method_specific = match method { + Method::Lzma => { + let lzma_properties = LzmaProperties::parser.parse_next(i)?; + if let Err(e) = lzma_properties.error_if_unsupported() { + return Err(ErrMode::Cut(ContextError::from_external_error( + i, + ErrorKind::Verify, + e, + ))); + } + MethodSpecific::Lzma(lzma_properties) + } + _ => MethodSpecific::None, + }; + Ok(Self { reader_version, flags, @@ -60,6 +86,7 @@ impl LocalFileHeaderRecord { uncompressed_size, name, extra, + method_specific, }) } @@ -114,3 +141,48 @@ impl DataDescriptorRecord { } } } + +/// 5.8.5 LZMA Properties header +#[derive(Debug)] +pub struct LzmaProperties { + /// major version + pub major: u8, + /// minor version + pub minor: u8, + /// properties size + pub properties_size: u16, +} + +impl LzmaProperties { + pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + seq! {Self { + major: le_u8, + minor: le_u8, + properties_size: le_u16, + }} + .parse_next(i) + } + + pub fn error_if_unsupported(&self) -> Result<(), Error> { + if (self.major, self.minor) != (2, 0) { + return Err(Error::Unsupported( + UnsupportedError::LzmaVersionUnsupported { + minor: self.minor, + major: self.major, + }, + )); + } + + const LZMA_PROPERTIES_SIZE: u16 = 5; + if self.properties_size != LZMA_PROPERTIES_SIZE { + return Err(Error::Unsupported( + UnsupportedError::LzmaPropertiesHeaderWrongSize { + expected: 5, + actual: self.properties_size, + }, + )); + } + + Ok(()) + } +} From 93d3518c6fabc060c66e9868a8e259cc3fba48ed Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 11:28:27 +0100 Subject: [PATCH 17/49] Rename 'reader' API to state machine api --- .github/workflows/test.yml | 2 +- Justfile | 2 +- rc-zip-tokio/src/entry_reader/mod.rs | 2 +- rc-zip/src/{reader.rs => reader/archive.rs} | 47 ++++++--------------- rc-zip/src/reader/entry.rs | 1 + rc-zip/src/reader/mod.rs | 24 +++++++++++ 6 files changed, 42 insertions(+), 36 deletions(-) rename 
rc-zip/src/{reader.rs => reader/archive.rs} (94%) create mode 100644 rc-zip/src/reader/entry.rs create mode 100644 rc-zip/src/reader/mod.rs diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 44ea5b7..d3176aa 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -34,7 +34,7 @@ jobs: cargo doc --all-features --no-deps - name: Run cargo clippy run: | - cargo hack clippy --feature-powerset --group-features deflate,deflate64,lzma,bzip2 + cargo hack clippy --each-feature - name: Run tests and collect coverage run: just ci-test - name: Upload coverage information diff --git a/Justfile b/Justfile index 75f16b4..3a0c6cf 100644 --- a/Justfile +++ b/Justfile @@ -4,7 +4,7 @@ _default: just --list check: - cargo hack clippy --feature-powerset --group-features deflate,deflate64,lzma,bzip2 + cargo hack clippy --each-feature # Run all tests locally test *args: diff --git a/rc-zip-tokio/src/entry_reader/mod.rs b/rc-zip-tokio/src/entry_reader/mod.rs index 2127c82..62a2dd1 100644 --- a/rc-zip-tokio/src/entry_reader/mod.rs +++ b/rc-zip-tokio/src/entry_reader/mod.rs @@ -284,7 +284,7 @@ fn method_to_decoder( if #[cfg(feature = "deflate")] { Box::new(deflate_dec::mk_decoder(raw_r)) } else { - return Err(Error::method_not_enabled(self.method)); + return Err(Error::method_not_enabled(method)); } } } diff --git a/rc-zip/src/reader.rs b/rc-zip/src/reader/archive.rs similarity index 94% rename from rc-zip/src/reader.rs rename to rc-zip/src/reader/archive.rs index 9fb822d..75c98a5 100644 --- a/rc-zip/src/reader.rs +++ b/rc-zip/src/reader/archive.rs @@ -1,19 +1,10 @@ +use super::FsmResult; use crate::{ encoding::Encoding, Archive, DirectoryHeader, EndOfCentralDirectory, EndOfCentralDirectory64Locator, EndOfCentralDirectory64Record, EndOfCentralDirectoryRecord, Error, FormatError, Located, StoredEntry, }; -macro_rules! 
transition { - ($state: expr => ($pattern: pat) $body: expr) => { - $state = if let $pattern = std::mem::take(&mut $state) { - $body - } else { - unreachable!() - }; - }; -} - use tracing::trace; use winnow::{ error::ErrMode, @@ -30,15 +21,6 @@ pub struct ArchiveReader { state: State, } -pub enum ArchiveReaderResult { - /// Indicates that [ArchiveReader][] has work left, and the loop should continue. - Continue, - /// Indicates that [ArchiveReader][] is done reading the central directory, - /// contains an [Archive][]. Calling any method after [process()](ArchiveReader::process()) has returned - /// `Done` will panic. - Done(Archive), -} - #[derive(Default)] enum State { /// Finding and reading the end of central directory record @@ -184,13 +166,12 @@ impl ArchiveReader { /// Errors returned from process() are caused by invalid zip archives, /// unsupported format quirks, or implementation bugs - never I/O errors. /// - /// A result of [ArchiveReaderResult::Continue] indicates one should loop again, + /// A result of [FsmResult::Continue] indicates one should loop again, /// starting with [wants_read()](ArchiveReader::wants_read()). /// - /// A result of [ArchiveReaderResult::Done] contains the [Archive], and indicates that no + /// A result of [FsmResult::Done] contains the [Archive], and indicates that no /// method should ever be called again on this reader. - pub fn process(&mut self) -> Result { - use ArchiveReaderResult as R; + pub fn process(&mut self) -> Result, Error> { use State as S; match self.state { S::ReadEocd { @@ -203,7 +184,7 @@ impl ArchiveReader { haystack_size, "ReadEocd | need more data" ); - return Ok(R::Continue); + return Ok(FsmResult::Continue); } match { @@ -235,14 +216,14 @@ impl ArchiveReader { directory_headers: vec![], } }); - Ok(R::Continue) + Ok(FsmResult::Continue) } else { trace!("ReadEocd | transition to ReadEocd64Locator"); transition!(self.state => (S::ReadEocd { mut buffer, .. 
}) { buffer.reset(); S::ReadEocd64Locator { buffer, eocdr } }); - Ok(R::Continue) + Ok(FsmResult::Continue) } } } @@ -252,7 +233,7 @@ impl ArchiveReader { match EndOfCentralDirectory64Locator::parser.parse_peek(input) { Err(ErrMode::Incomplete(_)) => { // need more data - Ok(R::Continue) + Ok(FsmResult::Continue) } Err(ErrMode::Backtrack(_)) | Err(ErrMode::Cut(_)) => { // we don't have a zip64 end of central directory locator - that's ok! @@ -266,7 +247,7 @@ impl ArchiveReader { directory_headers: vec![], } }); - Ok(R::Continue) + Ok(FsmResult::Continue) } Ok((_, locator)) => { trace!( @@ -281,7 +262,7 @@ impl ArchiveReader { eocdr, } }); - Ok(R::Continue) + Ok(FsmResult::Continue) } } } @@ -290,7 +271,7 @@ impl ArchiveReader { match EndOfCentralDirectory64Record::parser.parse_peek(input) { Err(ErrMode::Incomplete(_)) => { // need more data - Ok(R::Continue) + Ok(FsmResult::Continue) } Err(ErrMode::Backtrack(_)) | Err(ErrMode::Cut(_)) => { // at this point, we really expected to have a zip64 end @@ -310,7 +291,7 @@ impl ArchiveReader { directory_headers: vec![], } }); - Ok(R::Continue) + Ok(FsmResult::Continue) } } } @@ -422,7 +403,7 @@ impl ArchiveReader { } self.state = S::Done; - return Ok(R::Done(Archive { + return Ok(FsmResult::Done(Archive { size: self.size, comment, entries, @@ -445,7 +426,7 @@ impl ArchiveReader { buffer.consume(consumed); // need more data - Ok(R::Continue) + Ok(FsmResult::Continue) } S::Done { .. } => panic!("Called process() on ArchiveReader in Done state"), S::Transitioning => unreachable!(), diff --git a/rc-zip/src/reader/entry.rs b/rc-zip/src/reader/entry.rs new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/rc-zip/src/reader/entry.rs @@ -0,0 +1 @@ + diff --git a/rc-zip/src/reader/mod.rs b/rc-zip/src/reader/mod.rs new file mode 100644 index 0000000..92fea0e --- /dev/null +++ b/rc-zip/src/reader/mod.rs @@ -0,0 +1,24 @@ +macro_rules! 
transition { + ($state: expr => ($pattern: pat) $body: expr) => { + $state = if let $pattern = std::mem::take(&mut $state) { + $body + } else { + unreachable!() + }; + }; +} + +mod archive; +pub use archive::ArchiveReader; + +mod entry; + +/// Indicates whether or not the state machine has completed its work +pub enum FsmResult { + /// Indicates that the state machine still has work to do, and + /// needs either data or a call to process + Continue, + /// Indicates that the state machine has completed its work, and + /// the result is the value provided + Done(T), +} From 2f220132f8f93903ec901e51a6b07e36509839f9 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 11:40:58 +0100 Subject: [PATCH 18/49] Clarify state machine terminology --- rc-zip-sync/src/read_zip.rs | 8 ++++---- rc-zip-sync/tests/integration_tests.rs | 8 ++++---- rc-zip-tokio/src/read_zip.rs | 8 ++++---- rc-zip/src/encoding.rs | 10 ++++++++++ rc-zip/src/error.rs | 2 ++ rc-zip/src/format/mod.rs | 7 +++++++ rc-zip/src/{reader => fsm}/archive.rs | 6 +++--- rc-zip/src/fsm/entry.rs | 12 ++++++++++++ rc-zip/src/{reader => fsm}/mod.rs | 12 +++++++++++- rc-zip/src/lib.rs | 21 +-------------------- rc-zip/src/reader/entry.rs | 1 - 11 files changed, 58 insertions(+), 37 deletions(-) rename rc-zip/src/{reader => fsm}/archive.rs (99%) create mode 100644 rc-zip/src/fsm/entry.rs rename rc-zip/src/{reader => fsm}/mod.rs (53%) delete mode 100644 rc-zip/src/reader/entry.rs diff --git a/rc-zip-sync/src/read_zip.rs b/rc-zip-sync/src/read_zip.rs index 5e0522c..fa21a2e 100644 --- a/rc-zip-sync/src/read_zip.rs +++ b/rc-zip-sync/src/read_zip.rs @@ -1,5 +1,5 @@ use rc_zip::{ - reader::{ArchiveReader, ArchiveReaderResult}, + fsm::{ArchiveFsm, FsmResult}, Archive, Error, StoredEntry, }; @@ -41,7 +41,7 @@ where fn read_zip_with_size(&self, size: u64) -> Result, Error> { tracing::trace!(%size, "read_zip_with_size"); - let mut ar = ArchiveReader::new(size); + let mut ar = ArchiveFsm::new(size); loop { if let 
Some(offset) = ar.wants_read() { tracing::trace!(%offset, "read_zip_with_size: wants_read, space len = {}", ar.space().len()); @@ -58,14 +58,14 @@ where } match ar.process()? { - ArchiveReaderResult::Done(archive) => { + FsmResult::Done(archive) => { tracing::trace!("read_zip_with_size: done"); return Ok(SyncArchive { file: self, archive, }); } - ArchiveReaderResult::Continue => { + FsmResult::Continue => { tracing::trace!("read_zip_with_size: continue"); } } diff --git a/rc-zip-sync/tests/integration_tests.rs b/rc-zip-sync/tests/integration_tests.rs index 8b98bff..4fd962e 100644 --- a/rc-zip-sync/tests/integration_tests.rs +++ b/rc-zip-sync/tests/integration_tests.rs @@ -346,12 +346,12 @@ fn real_world_files() { #[test_log::test] fn state_machine() { - use rc_zip::reader::{ArchiveReader, ArchiveReaderResult}; + use rc_zip::fsm::{ArchiveFsm, FsmResult}; let cases = test_cases(); let case = cases.iter().find(|x| x.name() == "zip64.zip").unwrap(); let bs = case.bytes(); - let mut zar = ArchiveReader::new(bs.len() as u64); + let mut zar = ArchiveFsm::new(bs.len() as u64); let archive = 'read_zip: loop { if let Some(offset) = zar.wants_read() { @@ -381,8 +381,8 @@ fn state_machine() { match zar.process() { Ok(res) => match res { - ArchiveReaderResult::Continue => {} - ArchiveReaderResult::Done(archive) => break 'read_zip archive, + FsmResult::Continue => {} + FsmResult::Done(archive) => break 'read_zip archive, }, Err(err) => { println!("zar processing error: {:#?}", err); diff --git a/rc-zip-tokio/src/read_zip.rs b/rc-zip-tokio/src/read_zip.rs index c16c7cf..438bceb 100644 --- a/rc-zip-tokio/src/read_zip.rs +++ b/rc-zip-tokio/src/read_zip.rs @@ -5,7 +5,7 @@ use positioned_io::{RandomAccessFile, ReadAt}; use tokio::io::{AsyncRead, AsyncReadExt, ReadBuf}; use rc_zip::{ - reader::{ArchiveReader, ArchiveReaderResult}, + fsm::{ArchiveFsm, FsmResult}, Archive, Error, StoredEntry, }; @@ -50,7 +50,7 @@ where type File = F; async fn read_zip_with_size_async(&self, size: u64) 
-> Result, Error> { - let mut ar = ArchiveReader::new(size); + let mut ar = ArchiveFsm::new(size); loop { if let Some(offset) = ar.wants_read() { match self.cursor_at(offset).read(ar.space()).await { @@ -65,13 +65,13 @@ where } match ar.process()? { - ArchiveReaderResult::Done(archive) => { + FsmResult::Done(archive) => { return Ok(AsyncArchive { file: self, archive, }) } - ArchiveReaderResult::Continue => {} + FsmResult::Continue => {} } } } diff --git a/rc-zip/src/encoding.rs b/rc-zip/src/encoding.rs index 823221d..6d126bc 100644 --- a/rc-zip/src/encoding.rs +++ b/rc-zip/src/encoding.rs @@ -1,3 +1,11 @@ +//! zip entry paths may be encoded in a variety of character encodings. +//! +//! Historically, CP-437 was used, but many modern zip files use UTF-8 with an +//! optional UTF-8 flag. +//! +//! Others use the system's local character encoding, and we have no choice but +//! to make an educated guess thanks to the chardet-ng crate. + use std::fmt; /// Encodings supported by this crate @@ -5,12 +13,14 @@ use std::fmt; pub enum Encoding { /// UTF-8 Utf8, + /// [Codepage 437](https://en.wikipedia.org/wiki/Code_page_437), also known as /// OEM-US, PC-8, or DOS Latin US. /// /// This is the fallback if UTF-8 is not specified and no other encoding /// is auto-detected. It was the original encoding of the zip format. Cp437, + /// [Shift JIS](https://en.wikipedia.org/wiki/Shift_JIS), also known as SJIS. /// /// Still in use by some Japanese users as of 2019. diff --git a/rc-zip/src/error.rs b/rc-zip/src/error.rs index 99bdc29..023454e 100644 --- a/rc-zip/src/error.rs +++ b/rc-zip/src/error.rs @@ -1,3 +1,5 @@ +//! All error types used in this crate + use crate::Method; use super::encoding; diff --git a/rc-zip/src/format/mod.rs b/rc-zip/src/format/mod.rs index 709634f..541edc8 100644 --- a/rc-zip/src/format/mod.rs +++ b/rc-zip/src/format/mod.rs @@ -1,3 +1,10 @@ +//! Contain winnow parsers for most elements that make up a ZIP file, like +//! 
the end-of-central-directory record, local file headers, and central +//! directory headers. +//! +//! Everything in there is based off of the appnote, which you can find in the +//! source repository. + pub use crate::encoding::Encoding; mod archive; diff --git a/rc-zip/src/reader/archive.rs b/rc-zip/src/fsm/archive.rs similarity index 99% rename from rc-zip/src/reader/archive.rs rename to rc-zip/src/fsm/archive.rs index 75c98a5..cca0c80 100644 --- a/rc-zip/src/reader/archive.rs +++ b/rc-zip/src/fsm/archive.rs @@ -12,10 +12,10 @@ use winnow::{ Parser, Partial, }; -/// ArchiveReader parses a valid zip archive into an [Archive][]. In particular, this struct finds +/// [ArchiveReader] parses a valid zip archive into an [Archive]. In particular, this struct finds /// an end of central directory record, parses the entire central directory, detects text encoding, /// and normalizes metadata. -pub struct ArchiveReader { +pub struct ArchiveFsm { // Size of the entire zip file size: u64, state: State, @@ -71,7 +71,7 @@ impl State { } } -impl ArchiveReader { +impl ArchiveFsm { /// This should be > 65KiB, because the section at the end of the /// file that we check for end of central directory record is 65KiB. const DEFAULT_BUFFER_SIZE: usize = 256 * 1024; diff --git a/rc-zip/src/fsm/entry.rs b/rc-zip/src/fsm/entry.rs new file mode 100644 index 0000000..d4c908a --- /dev/null +++ b/rc-zip/src/fsm/entry.rs @@ -0,0 +1,12 @@ +#[derive(Default)] +enum State { + /// Done! + Done, + + #[default] + Transition, +} + +pub struct EntryFsm { + state: State, +} diff --git a/rc-zip/src/reader/mod.rs b/rc-zip/src/fsm/mod.rs similarity index 53% rename from rc-zip/src/reader/mod.rs rename to rc-zip/src/fsm/mod.rs index 92fea0e..33a0404 100644 --- a/rc-zip/src/reader/mod.rs +++ b/rc-zip/src/fsm/mod.rs @@ -1,3 +1,12 @@ +//! Parsers are just part of the puzzle when it comes to zip files: finding the +//! central directory is non-trivial and involves seeking around the input: +//! 
[ArchiveFsm] provides a state machine to handle this. +//! +//! Similarly, reading an entry involves reading the local header, then the +//! data (while calculating the CRC32), then the data descriptor, and then +//! checking whether the uncompressed size and CRC32 match the values in the +//! central directory. + macro_rules! transition { ($state: expr => ($pattern: pat) $body: expr) => { $state = if let $pattern = std::mem::take(&mut $state) { @@ -9,9 +18,10 @@ macro_rules! transition { } mod archive; -pub use archive::ArchiveReader; +pub use archive::ArchiveFsm; mod entry; +pub use entry::EntryFsm; /// Indicates whether or not the state machine has completed its work pub enum FsmResult { diff --git a/rc-zip/src/lib.rs b/rc-zip/src/lib.rs index 25224b6..1408bcf 100644 --- a/rc-zip/src/lib.rs +++ b/rc-zip/src/lib.rs @@ -1,22 +1,3 @@ -//! # rc-zip -//! -//! rc-zip is a zip archive library with a focus on compatibility and correctness. -//! -//! ### Reading -//! -//! [ArchiveReader](reader::ArchiveReader) is your first stop. It -//! ensures we are dealing with a valid zip archive, and reads the central -//! directory. It does not perform I/O itself, but rather, it is a state machine -//! that asks for reads at specific offsets. -//! -//! An [Archive] contains a full list of [entries](StoredEntry), -//! which you can then extract. -//! -//! ### Writing -//! -//! Writing archives is not implemented yet. -//! 
- mod encoding; mod error; @@ -25,4 +6,4 @@ pub use error::*; mod format; pub use format::*; -pub mod reader; +pub mod fsm; diff --git a/rc-zip/src/reader/entry.rs b/rc-zip/src/reader/entry.rs deleted file mode 100644 index 8b13789..0000000 --- a/rc-zip/src/reader/entry.rs +++ /dev/null @@ -1 +0,0 @@ - From 2c2d35a8ab821e1eb049e3cdb6cc26075d2eecdf Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 11:45:22 +0100 Subject: [PATCH 19/49] rename format to parse --- rc-zip/src/error.rs | 4 +- rc-zip/src/lib.rs | 4 +- rc-zip/src/parse/archive.rs | 328 +++++++++++++++++++++++++++ rc-zip/src/parse/date_time.rs | 104 +++++++++ rc-zip/src/parse/directory_header.rs | 249 ++++++++++++++++++++ rc-zip/src/parse/eocd.rs | 264 +++++++++++++++++++++ rc-zip/src/parse/extra_field.rs | 291 ++++++++++++++++++++++++ rc-zip/src/parse/local.rs | 189 +++++++++++++++ rc-zip/src/parse/mod.rs | 35 +++ rc-zip/src/parse/mode.rs | 239 +++++++++++++++++++ rc-zip/src/parse/raw.rs | 77 +++++++ rc-zip/src/parse/version.rs | 133 +++++++++++ 12 files changed, 1913 insertions(+), 4 deletions(-) create mode 100644 rc-zip/src/parse/archive.rs create mode 100644 rc-zip/src/parse/date_time.rs create mode 100644 rc-zip/src/parse/directory_header.rs create mode 100644 rc-zip/src/parse/eocd.rs create mode 100644 rc-zip/src/parse/extra_field.rs create mode 100644 rc-zip/src/parse/local.rs create mode 100644 rc-zip/src/parse/mod.rs create mode 100644 rc-zip/src/parse/mode.rs create mode 100644 rc-zip/src/parse/raw.rs create mode 100644 rc-zip/src/parse/version.rs diff --git a/rc-zip/src/error.rs b/rc-zip/src/error.rs index 023454e..dceb7b1 100644 --- a/rc-zip/src/error.rs +++ b/rc-zip/src/error.rs @@ -41,10 +41,10 @@ impl Error { #[derive(Debug, thiserror::Error)] pub enum UnsupportedError { #[error("compression method not supported: {0:?}")] - MethodNotSupported(crate::format::Method), + MethodNotSupported(Method), #[error("compression method supported, but not enabled in this build: {0:?}")] - 
MethodNotEnabled(crate::format::Method), + MethodNotEnabled(Method), #[error("only LZMA2.0 is supported, found LZMA{minor}.{major}")] LzmaVersionUnsupported { minor: u8, major: u8 }, diff --git a/rc-zip/src/lib.rs b/rc-zip/src/lib.rs index 1408bcf..7193e91 100644 --- a/rc-zip/src/lib.rs +++ b/rc-zip/src/lib.rs @@ -3,7 +3,7 @@ mod encoding; mod error; pub use error::*; -mod format; -pub use format::*; +mod parse; +pub use parse::*; pub mod fsm; diff --git a/rc-zip/src/parse/archive.rs b/rc-zip/src/parse/archive.rs new file mode 100644 index 0000000..421b72a --- /dev/null +++ b/rc-zip/src/parse/archive.rs @@ -0,0 +1,328 @@ +use chrono::{DateTime, Utc}; +use num_enum::{FromPrimitive, IntoPrimitive}; + +use crate::{Encoding, ExtraField, Mode, Version}; + +/// An Archive contains general information about a zip files, +/// along with a list of [entries][StoredEntry]. +/// +/// It is obtained via an [ArchiveReader](crate::reader::ArchiveReader), or via a higher-level API +/// like the [ReadZip](crate::reader::sync::ReadZip) trait. +pub struct Archive { + pub(crate) size: u64, + pub(crate) encoding: Encoding, + pub(crate) entries: Vec, + pub(crate) comment: Option, +} + +impl Archive { + /// The size of .zip file that was read, in bytes. + pub fn size(&self) -> u64 { + self.size + } + + /// Iterate over all files in this zip, read from the central directory. + pub fn entries(&self) -> impl Iterator { + self.entries.iter() + } + + /// Attempts to look up an entry by name. This is usually a bad idea, + /// as names aren't necessarily normalized in zip archives. + pub fn by_name>(&self, name: N) -> Option<&StoredEntry> { + self.entries.iter().find(|&x| x.name() == name.as_ref()) + } + + /// Returns the detected character encoding for text fields + /// (names, comments) inside this zip archive. + pub fn encoding(&self) -> Encoding { + self.encoding + } + + /// Returns the comment for this archive, if any. 
When reading + /// a zip file with an empty comment field, this will return None. + pub fn comment(&self) -> Option<&String> { + self.comment.as_ref() + } +} + +/// Describes a zip archive entry (a file, a directory, a symlink) +/// +/// `Entry` contains normalized metadata fields, that can be set when +/// writing a zip archive. Additional metadata, along with the information +/// required to extract an entry, are available in [StoredEntry][] instead. +#[derive(Clone)] +pub struct Entry { + /// Name of the file + /// Must be a relative path, not start with a drive letter (e.g. C:), + /// and must use forward slashes instead of back slashes + pub name: String, + + /// Compression method + /// + /// See [Method][] for more details. + pub method: Method, + + /// Comment is any arbitrary user-defined string shorter than 64KiB + pub comment: Option, + + /// Modified timestamp + pub modified: chrono::DateTime, + + /// Created timestamp + pub created: Option>, + + /// Accessed timestamp + pub accessed: Option>, +} + +/// An entry as stored into an Archive. Contains additional metadata and offset information. +/// +/// Whereas [Entry][] is archive-independent, [StoredEntry][] contains information that is tied to +/// a specific archive. +/// +/// When reading archives, one deals with a list of [StoredEntry][], whereas when writing one, one +/// typically only specifies an [Entry][] and provides the entry's contents: fields like the CRC32 +/// hash, uncompressed size, and compressed size are derived automatically from the input. +#[derive(Clone)] +pub struct StoredEntry { + /// Archive-independent information + /// + /// This contains the entry's name, timestamps, comment, compression method. + pub entry: Entry, + + /// Offset of the local file header in the zip file + /// + /// ```text + /// [optional non-zip data] + /// [local file header 1] <------ header_offset points here + /// [encryption header 1] + /// [file data 1] + /// [data descriptor 1] + /// ... 
+ /// [central directory] + /// [optional zip64 end of central directory info] + /// [end of central directory record] + /// ``` + pub header_offset: u64, + + /// External attributes (zip) + pub external_attrs: u32, + + /// Version of zip supported by the tool that crated this archive. + pub creator_version: Version, + + /// Version of zip needed to extract this archive. + pub reader_version: Version, + + /// General purpose bit flag + /// + /// In the zip format, the most noteworthy flag (bit 11) is for UTF-8 names. + /// Other flags can indicate: encryption (unsupported), various compression + /// settings (depending on the [Method] used). + /// + /// For LZMA, general-purpose bit 1 denotes the EOS marker. + pub flags: u16, + + /// Unix user ID + /// + /// Only present if a Unix extra field or New Unix extra field was found. + pub uid: Option, + + /// Unix group ID + /// + /// Only present if a Unix extra field or New Unix extra field was found. + pub gid: Option, + + /// File mode + pub mode: Mode, + + /// Any extra fields recognized while parsing the file. + /// + /// Most of these should be normalized and accessible as other fields, + /// but they are also made available here raw. + pub extra_fields: Vec, + + pub inner: StoredEntryInner, +} + +#[derive(Clone, Copy, Debug)] +pub struct StoredEntryInner { + /// CRC-32 hash as found in the central directory. + /// + /// Note that this may be zero, and the actual CRC32 might be in the local header, or (more + /// commonly) in the data descriptor instead. + pub crc32: u32, + + /// Size in bytes, after compression + pub compressed_size: u64, + + /// Size in bytes, before compression + /// + /// This will be zero for directories. + pub uncompressed_size: u64, + + /// True if this entry was read from a zip64 archive + pub is_zip64: bool, +} + +impl StoredEntry { + /// Returns the entry's name. See also + /// [sanitized_name()](StoredEntry::sanitized_name), which returns a + /// sanitized version of the name. 
+ /// + /// This should be a relative path, separated by `/`. However, there are zip + /// files in the wild with all sorts of evil variants, so, be conservative + /// in what you accept. + pub fn name(&self) -> &str { + self.entry.name.as_ref() + } + + /// Returns a sanitized version of the entry's name, if it + /// seems safe. In particular, if this method feels like the + /// entry name is trying to do a zip slip (cf. + /// ), it'll return + /// None. + /// + /// Other than that, it will strip any leading slashes on non-Windows OSes. + pub fn sanitized_name(&self) -> Option<&str> { + let name = self.name(); + + // refuse entries with traversed/absolute path to mitigate zip slip + if name.contains("..") { + return None; + } + + #[cfg(windows)] + { + if name.contains(":\\") || name.starts_with("\\") { + return None; + } + Some(name) + } + + #[cfg(not(windows))] + { + // strip absolute prefix on entries pointing to root path + let mut entry_chars = name.chars(); + let mut name = name; + while name.starts_with('/') { + entry_chars.next(); + name = entry_chars.as_str() + } + Some(name) + } + } + + /// The entry's comment, if any. + /// + /// When reading a zip file, an empty comment results in None. + pub fn comment(&self) -> Option<&str> { + self.entry.comment.as_ref().map(|x| x.as_ref()) + } + + /// The compression method used for this entry + #[inline(always)] + pub fn method(&self) -> Method { + self.entry.method + } + + /// This entry's "last modified" timestamp - with caveats + /// + /// Due to the history of the ZIP file format, this may be inaccurate. It may be offset + /// by a few hours, if there is no extended timestamp information. It may have a resolution + /// as low as two seconds, if only MSDOS timestamps are present. It may default to the Unix + /// epoch, if something went really wrong. + /// + /// If you're reading this after the year 2038, or after the year 2108, godspeed. 
+ #[inline(always)] + pub fn modified(&self) -> DateTime { + self.entry.modified + } + + /// This entry's "created" timestamp, if available. + /// + /// See [StoredEntry::modified()] for caveats. + #[inline(always)] + pub fn created(&self) -> Option<&DateTime> { + self.entry.created.as_ref() + } + + /// This entry's "last accessed" timestamp, if available. + /// + /// See [StoredEntry::modified()] for caveats. + #[inline(always)] + pub fn accessed(&self) -> Option<&DateTime> { + self.entry.accessed.as_ref() + } +} + +/// The contents of an entry: a directory, a file, or a symbolic link. +#[derive(Debug)] +pub enum EntryContents { + Directory, + File, + Symlink, +} + +impl StoredEntry { + pub fn contents(&self) -> EntryContents { + if self.mode.has(Mode::SYMLINK) { + EntryContents::Symlink + } else if self.mode.has(Mode::DIR) { + EntryContents::Directory + } else { + EntryContents::File + } + } +} + +/// Compression method used for a file entry. +/// +/// In archives that follow [ISO/IEC 21320-1:2015](https://www.iso.org/standard/60101.html), only +/// [Store][Method::Store] and [Deflate][Method::Deflate] should be used. +/// +/// However, in the wild, it is not too uncommon to encounter [Bzip2][Method::Bzip2], +/// [Lzma][Method::Lzma] or others. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, IntoPrimitive, FromPrimitive)] +#[repr(u16)] +pub enum Method { + /// No compression is applied + Store = 0, + + /// [DEFLATE (RFC 1951)](https://www.ietf.org/rfc/rfc1951.txt) + Deflate = 8, + + /// [DEFLATE64](https://deflate64.com/) + Deflate64 = 9, + + /// [BZIP-2](https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf) + Bzip2 = 12, + + /// [LZMA](https://github.com/jljusten/LZMA-SDK/blob/master/DOC/lzma-specification.txt) + Lzma = 14, + + /// [zstd](https://datatracker.ietf.org/doc/html/rfc8878) + Zstd = 93, + + /// [MP3](https://www.iso.org/obp/ui/#iso:std:iso-iec:11172:-3:ed-1:v1:en) + Mp3 = 94, + + /// [XZ](https://tukaani.org/xz/xz-file-format.txt) + Xz = 95, + + /// [JPEG](https://jpeg.org/jpeg/) + Jpeg = 96, + + /// [WavPack](https://www.wavpack.com/) + WavPack = 97, + + /// [PPMd](https://en.wikipedia.org/wiki/Prediction_by_partial_matching) + Ppmd = 98, + + /// AE-x encryption marker (see Appendix E of appnote) + Aex = 99, + + /// A compression method that isn't recognized by this crate. + #[num_enum(catch_all)] + Unrecognized(u16), +} diff --git a/rc-zip/src/parse/date_time.rs b/rc-zip/src/parse/date_time.rs new file mode 100644 index 0000000..baeee9a --- /dev/null +++ b/rc-zip/src/parse/date_time.rs @@ -0,0 +1,104 @@ +use chrono::{ + offset::{LocalResult, TimeZone, Utc}, + DateTime, Timelike, +}; +use std::fmt; +use winnow::{ + binary::{le_u16, le_u64}, + seq, PResult, Parser, Partial, +}; + +/// A timestamp in MS-DOS format +/// +/// Represents dates from year 1980 to 2180, with 2 second precision. 
+#[derive(Clone, Copy, Eq, PartialEq)] +pub struct MsdosTimestamp { + pub time: u16, + pub date: u16, +} + +impl fmt::Debug for MsdosTimestamp { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.to_datetime() { + Some(dt) => write!(f, "MsdosTimestamp({})", dt), + None => write!(f, "MsdosTimestamp(?)"), + } + } +} + +impl MsdosTimestamp { + pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + seq! {Self { + time: le_u16, + date: le_u16, + }} + .parse_next(i) + } + + /// Attempts to convert to a chrono UTC date time + pub fn to_datetime(&self) -> Option> { + // see https://docs.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-dosdatetimetofiletime + let date = match { + // bits 0-4: day of the month (1-31) + let d = (self.date & 0b1_1111) as u32; + // bits 5-8: month (1 = january, 2 = february and so on) + let m = ((self.date >> 5) & 0b1111) as u32; + // bits 9-15: year offset from 1980 + let y = ((self.date >> 9) + 1980) as i32; + Utc.with_ymd_and_hms(y, m, d, 0, 0, 0) + } { + LocalResult::Single(date) => date, + _ => return None, + }; + + // bits 0-4: second divided by 2 + let s = (self.time & 0b1_1111) as u32 * 2; + // bits 5-10: minute (0-59) + let m = (self.time >> 5 & 0b11_1111) as u32; + // bits 11-15: hour (0-23 on a 24-hour clock) + let h = (self.time >> 11) as u32; + date.with_hour(h)?.with_minute(m)?.with_second(s) + } +} + +/// A timestamp in NTFS format. 
#[derive(Clone, Copy, Eq, PartialEq)]
pub struct NtfsTimestamp {
    // Number of 100-nanosecond intervals since 1601-01-01 (Windows FILETIME)
    pub timestamp: u64,
}

impl fmt::Debug for NtfsTimestamp {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.to_datetime() {
            Some(dt) => write!(f, "NtfsTimestamp({})", dt),
            None => write!(f, "NtfsTimestamp(?)"),
        }
    }
}

impl NtfsTimestamp {
    /// Parse an NTFS timestamp from a byte slice
    pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult<Self> {
        le_u64.map(|timestamp| Self { timestamp }).parse_next(i)
    }

    /// Attempts to convert to a chrono UTC date time
    pub fn to_datetime(&self) -> Option<DateTime<Utc>> {
        // windows timestamp resolution: 100ns ticks
        let ticks_per_second = 10_000_000;
        let secs = (self.timestamp / ticks_per_second) as i64;
        let nsecs = ((self.timestamp % ticks_per_second) * 100) as u32;
        // windows epoch is 1601-01-01, shift to the unix epoch before building
        let epoch = Utc.with_ymd_and_hms(1601, 1, 1, 0, 0, 0).single()?;
        match Utc.timestamp_opt(epoch.timestamp() + secs, nsecs) {
            LocalResult::Single(date) => Some(date),
            _ => None,
        }
    }
}

/// Returns the unix epoch (1970-01-01 00:00:00 UTC), used as a placeholder
/// when an entry carries no usable timestamp.
pub(crate) fn zero_datetime() -> chrono::DateTime<chrono::offset::Utc> {
    chrono::DateTime::from_naive_utc_and_offset(
        chrono::naive::NaiveDateTime::from_timestamp_opt(0, 0).unwrap(),
        chrono::offset::Utc,
    )
}
+ pub reader_version: Version, + // general purpose bit flag + pub flags: u16, + // compression method + pub method: u16, + // last mod file datetime + pub modified: MsdosTimestamp, + // crc32 + pub crc32: u32, + // compressed size + pub compressed_size: u32, + // uncompressed size + pub uncompressed_size: u32, + // disk number start + pub disk_nbr_start: u16, + // internal file attributes + pub internal_attrs: u16, + // external file attributes + pub external_attrs: u32, + // relative offset of local header + pub header_offset: u32, + + // name + pub name: ZipString, + // extra + pub extra: ZipBytes, // comment + pub comment: ZipString, +} + +impl DirectoryHeader { + const SIGNATURE: &'static str = "PK\x01\x02"; + + pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + _ = tag(Self::SIGNATURE).parse_next(i)?; + let creator_version = Version::parser.parse_next(i)?; + let reader_version = Version::parser.parse_next(i)?; + let flags = le_u16.parse_next(i)?; + let method = le_u16.parse_next(i)?; + let modified = MsdosTimestamp::parser.parse_next(i)?; + let crc32 = le_u32.parse_next(i)?; + let compressed_size = le_u32.parse_next(i)?; + let uncompressed_size = le_u32.parse_next(i)?; + let name_len = le_u16.parse_next(i)?; + let extra_len = le_u16.parse_next(i)?; + let comment_len = le_u16.parse_next(i)?; + let disk_nbr_start = le_u16.parse_next(i)?; + let internal_attrs = le_u16.parse_next(i)?; + let external_attrs = le_u32.parse_next(i)?; + let header_offset = le_u32.parse_next(i)?; + + let name = ZipString::parser(name_len).parse_next(i)?; + let extra = ZipBytes::parser(extra_len).parse_next(i)?; + let comment = ZipString::parser(comment_len).parse_next(i)?; + + Ok(Self { + creator_version, + reader_version, + flags, + method, + modified, + crc32, + compressed_size, + uncompressed_size, + disk_nbr_start, + internal_attrs, + external_attrs, + header_offset, + name, + extra, + comment, + }) + } +} + +impl DirectoryHeader { + pub fn is_non_utf8(&self) -> bool { + let 
(valid1, require1) = detect_utf8(&self.name.0[..]); + let (valid2, require2) = detect_utf8(&self.comment.0[..]); + if !valid1 || !valid2 { + // definitely not utf-8 + return true; + } + + if !require1 && !require2 { + // name and comment only use single-byte runes that overlap with UTF-8 + return false; + } + + // Might be UTF-8, might be some other encoding; preserve existing flag. + // Some ZIP writers use UTF-8 encoding without setting the UTF-8 flag. + // Since it is impossible to always distinguish valid UTF-8 from some + // other encoding (e.g., GBK or Shift-JIS), we trust the flag. + self.flags & 0x800 == 0 + } + + pub fn as_stored_entry( + &self, + is_zip64: bool, + encoding: Encoding, + global_offset: u64, + ) -> Result { + let mut comment: Option = None; + if let Some(comment_field) = self.comment.clone().into_option() { + comment = Some(encoding.decode(&comment_field.0)?); + } + + let name = encoding.decode(&self.name.0)?; + + let mut compressed_size = self.compressed_size as u64; + let mut uncompressed_size = self.uncompressed_size as u64; + let mut header_offset = self.header_offset as u64 + global_offset; + + let mut modified: Option> = None; + let mut created: Option> = None; + let mut accessed: Option> = None; + + let mut uid: Option = None; + let mut gid: Option = None; + + let mut extra_fields: Vec = Vec::new(); + + let settings = ExtraFieldSettings { + needs_compressed_size: self.compressed_size == !0u32, + needs_uncompressed_size: self.uncompressed_size == !0u32, + needs_header_offset: self.header_offset == !0u32, + }; + + let mut slice = Partial::new(&self.extra.0[..]); + while !slice.is_empty() { + match ExtraField::mk_parser(settings).parse_next(&mut slice) { + Ok(ef) => { + match &ef { + ExtraField::Zip64(z64) => { + if let Some(n) = z64.uncompressed_size { + uncompressed_size = n; + } + if let Some(n) = z64.compressed_size { + compressed_size = n; + } + if let Some(n) = z64.header_offset { + header_offset = n; + } + } + 
ExtraField::Timestamp(ts) => { + modified = Utc.timestamp_opt(ts.mtime as i64, 0).single(); + } + ExtraField::Ntfs(nf) => { + for attr in &nf.attrs { + // note: other attributes are unsupported + if let NtfsAttr::Attr1(attr) = attr { + modified = attr.mtime.to_datetime(); + created = attr.ctime.to_datetime(); + accessed = attr.atime.to_datetime(); + } + } + } + ExtraField::Unix(uf) => { + modified = Utc.timestamp_opt(uf.mtime as i64, 0).single(); + if uid.is_none() { + uid = Some(uf.uid as u32); + } + if gid.is_none() { + gid = Some(uf.gid as u32); + } + } + ExtraField::NewUnix(uf) => { + uid = Some(uf.uid as u32); + gid = Some(uf.uid as u32); + } + _ => {} + }; + extra_fields.push(ef); + } + Err(e) => { + trace!("extra field error: {:#?}", e); + return Err(FormatError::InvalidExtraField.into()); + } + } + } + + let modified = match modified { + Some(m) => Some(m), + None => self.modified.to_datetime(), + }; + + let mut mode: Mode = match self.creator_version.host_system() { + HostSystem::Unix | HostSystem::Osx => UnixMode(self.external_attrs >> 16).into(), + HostSystem::WindowsNtfs | HostSystem::Vfat | HostSystem::MsDos => { + MsdosMode(self.external_attrs).into() + } + _ => Mode(0), + }; + if name.ends_with('/') { + // believe it or not, this is straight from the APPNOTE + mode |= Mode::DIR + }; + + Ok(StoredEntry { + entry: Entry { + name, + method: self.method.into(), + comment, + modified: modified.unwrap_or_else(zero_datetime), + created, + accessed, + }, + + creator_version: self.creator_version, + reader_version: self.reader_version, + flags: self.flags, + + inner: StoredEntryInner { + crc32: self.crc32, + compressed_size, + uncompressed_size, + is_zip64, + }, + header_offset, + + uid, + gid, + mode, + + extra_fields, + + external_attrs: self.external_attrs, + }) + } +} diff --git a/rc-zip/src/parse/eocd.rs b/rc-zip/src/parse/eocd.rs new file mode 100644 index 0000000..65e747d --- /dev/null +++ b/rc-zip/src/parse/eocd.rs @@ -0,0 +1,264 @@ +use 
tracing::trace; +use winnow::{ + binary::{le_u16, le_u32, le_u64, length_take}, + seq, + token::tag, + PResult, Parser, Partial, +}; + +use crate::{Error, FormatError, ZipString}; + +/// 4.3.16 End of central directory record: +#[derive(Debug)] +pub struct EndOfCentralDirectoryRecord { + /// number of this disk + pub disk_nbr: u16, + /// number of the disk with the start of the central directory + pub dir_disk_nbr: u16, + /// total number of entries in the central directory on this disk + pub dir_records_this_disk: u16, + /// total number of entries in the central directory + pub directory_records: u16, + // size of the central directory + pub directory_size: u32, + /// offset of start of central directory with respect to the starting disk number + pub directory_offset: u32, + /// .ZIP file comment + pub comment: ZipString, +} + +impl EndOfCentralDirectoryRecord { + /// Does not include comment size & comment data + const MIN_LENGTH: usize = 20; + const SIGNATURE: &'static str = "PK\x05\x06"; + + pub fn find_in_block(b: &[u8]) -> Option> { + for i in (0..(b.len() - Self::MIN_LENGTH + 1)).rev() { + let mut input = Partial::new(&b[i..]); + if let Ok(directory) = Self::parser.parse_next(&mut input) { + return Some(Located { + offset: i as u64, + inner: directory, + }); + } + } + None + } + + pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + let _ = tag(Self::SIGNATURE).parse_next(i)?; + seq! 
{Self { + disk_nbr: le_u16, + dir_disk_nbr: le_u16, + dir_records_this_disk: le_u16, + directory_records: le_u16, + directory_size: le_u32, + directory_offset: le_u32, + comment: length_take(le_u16).map(ZipString::from), + }} + .parse_next(i) + } +} + +/// 4.3.15 Zip64 end of central directory locator +#[derive(Debug)] +pub struct EndOfCentralDirectory64Locator { + /// number of the disk with the start of the zip64 end of central directory + pub dir_disk_number: u32, + /// relative offset of the zip64 end of central directory record + pub directory_offset: u64, + /// total number of disks + pub total_disks: u32, +} + +impl EndOfCentralDirectory64Locator { + pub const LENGTH: usize = 20; + const SIGNATURE: &'static str = "PK\x06\x07"; + + pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + _ = tag(Self::SIGNATURE).parse_next(i)?; + seq! {Self { + dir_disk_number: le_u32, + directory_offset: le_u64, + total_disks: le_u32, + }} + .parse_next(i) + } +} + +/// 4.3.14 Zip64 end of central directory record +#[derive(Debug)] +pub struct EndOfCentralDirectory64Record { + /// size of zip64 end of central directory record + pub record_size: u64, + /// version made by + pub creator_version: u16, + /// version needed to extract + pub reader_version: u16, + /// number of this disk + pub disk_nbr: u32, + /// number of the disk with the start of the central directory + pub dir_disk_nbr: u32, + // total number of entries in the central directory on this disk + pub dir_records_this_disk: u64, + // total number of entries in the central directory + pub directory_records: u64, + // size of the central directory + pub directory_size: u64, + // offset of the start of central directory with respect to the + // starting disk number + pub directory_offset: u64, +} + +impl EndOfCentralDirectory64Record { + const SIGNATURE: &'static str = "PK\x06\x06"; + + pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + _ = tag(Self::SIGNATURE).parse_next(i)?; + seq! 
{Self { + record_size: le_u64, + creator_version: le_u16, + reader_version: le_u16, + disk_nbr: le_u32, + dir_disk_nbr: le_u32, + dir_records_this_disk: le_u64, + directory_records: le_u64, + directory_size: le_u64, + directory_offset: le_u64, + }} + .parse_next(i) + } +} + +#[derive(Debug)] +pub struct Located { + pub offset: u64, + pub inner: T, +} + +impl std::ops::Deref for Located { + type Target = T; + fn deref(&self) -> &Self::Target { + &self.inner + } +} + +impl std::ops::DerefMut for Located { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } +} + +/// Coalesces zip and zip64 "end of central directory" record info +pub struct EndOfCentralDirectory { + pub dir: Located, + pub dir64: Option>, + pub global_offset: i64, +} + +impl EndOfCentralDirectory { + pub fn new( + size: u64, + dir: Located, + dir64: Option>, + ) -> Result { + let mut res = Self { + dir, + dir64, + global_offset: 0, + }; + + // + // Pure .zip files look like this: + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // <------directory_size-----> + // [ Data 1 ][ Data 2 ][ Central directory ][ ??? ] + // ^ ^ ^ + // 0 directory_offset directory_end_offset + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // + // But there exist some valid zip archives with padding at the beginning, like so: + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // <--global_offset-> <------directory_size-----> + // [ Padding ][ Data 1 ][ Data 2 ][ Central directory ][ ??? ] + // ^ ^ ^ ^ + // 0 global_offset computed_directory_offset directory_end_offset + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // + // (e.g. 
https://www.icculus.org/mojosetup/ installers are ELF binaries with a .zip file appended) + // + // `directory_end_offfset` is found by scanning the file (so it accounts for padding), but + // `directory_offset` is found by reading a data structure (so it does not account for padding). + // If we just trusted `directory_offset`, we'd be reading the central directory at the wrong place: + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // <------directory_size-----> + // [ Padding ][ Data 1 ][ Data 2 ][ Central directory ][ ??? ] + // ^ ^ ^ + // 0 directory_offset - woops! directory_end_offset + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + let computed_directory_offset = res.located_directory_offset() - res.directory_size(); + + // did we find a valid offset? + if (0..size).contains(&computed_directory_offset) { + // that's different from the recorded one? + if computed_directory_offset != res.directory_offset() { + // then assume the whole file is offset + res.global_offset = + computed_directory_offset as i64 - res.directory_offset() as i64; + res.set_directory_offset(computed_directory_offset); + } + } + + // make sure directory_offset points to somewhere in our file + trace!( + "directory offset = {}, valid range = 0..{}", + res.directory_offset(), + size + ); + if !(0..size).contains(&res.directory_offset()) { + return Err(FormatError::DirectoryOffsetPointsOutsideFile.into()); + } + + Ok(res) + } + + pub fn located_directory_offset(&self) -> u64 { + match self.dir64.as_ref() { + Some(d64) => d64.offset, + None => self.dir.offset, + } + } + + pub fn directory_offset(&self) -> u64 { + match self.dir64.as_ref() { + Some(d64) => d64.directory_offset, + None => self.dir.directory_offset as u64, + } + } + + pub fn directory_size(&self) -> u64 { + match self.dir64.as_ref() { + Some(d64) => d64.directory_size, + None => self.dir.directory_size as u64, + } + } + + pub fn 
set_directory_offset(&mut self, offset: u64) { + match self.dir64.as_mut() { + Some(d64) => d64.directory_offset = offset, + None => self.dir.directory_offset = offset as u32, + }; + } + + pub fn directory_records(&self) -> u64 { + match self.dir64.as_ref() { + Some(d64) => d64.directory_records, + None => self.dir.directory_records as u64, + } + } + + pub fn comment(&self) -> &ZipString { + &self.dir.comment + } +} diff --git a/rc-zip/src/parse/extra_field.rs b/rc-zip/src/parse/extra_field.rs new file mode 100644 index 0000000..daf31f1 --- /dev/null +++ b/rc-zip/src/parse/extra_field.rs @@ -0,0 +1,291 @@ +use tracing::trace; +use winnow::{ + binary::{le_u16, le_u32, le_u64, le_u8, length_take}, + combinator::{cond, opt, preceded, repeat_till}, + error::{ErrMode, ErrorKind, ParserError, StrContext}, + seq, + token::{tag, take}, + PResult, Parser, Partial, +}; + +use crate::{NtfsTimestamp, ZipBytes}; + +/// 4.4.28 extra field: (Variable) +pub(crate) struct ExtraFieldRecord<'a> { + pub(crate) tag: u16, + pub(crate) payload: &'a [u8], +} + +impl<'a> ExtraFieldRecord<'a> { + pub(crate) fn parser(i: &mut Partial<&'a [u8]>) -> PResult { + seq! {Self { + tag: le_u16, + payload: length_take(le_u16), + }} + .parse_next(i) + } +} + +// Useful because zip64 extended information extra field has fixed order *but* +// optional fields. From the appnote: +// +// If one of the size or offset fields in the Local or Central directory record +// is too small to hold the required data, a Zip64 extended information record +// is created. The order of the fields in the zip64 extended information record +// is fixed, but the fields MUST only appear if the corresponding Local or +// Central directory record field is set to 0xFFFF or 0xFFFFFFFF. 
#[derive(Debug, Clone, Copy)]
pub(crate) struct ExtraFieldSettings {
    // each flag is true when the corresponding 32-bit central directory
    // field was saturated (0xFFFFFFFF) and the real value lives in the
    // zip64 extended information record
    pub(crate) needs_uncompressed_size: bool,
    pub(crate) needs_compressed_size: bool,
    pub(crate) needs_header_offset: bool,
}

/// Information stored in the central directory header `extra` field
///
/// This typically contains timestamps, file sizes and offsets, file mode, uid/gid, etc.
///
/// See `extrafld.txt` in this crate's source distribution.
#[derive(Clone)]
pub enum ExtraField {
    /// Zip64 extended information extra field
    Zip64(ExtraZip64Field),
    /// Extended timestamp
    Timestamp(ExtraTimestampField),
    /// UNIX & Info-Zip UNIX
    Unix(ExtraUnixField),
    /// New UNIX extra field
    NewUnix(ExtraNewUnixField),
    /// NTFS (Win9x/WinNT FileTimes)
    Ntfs(ExtraNtfsField),
    /// Unknown extra field, with tag
    Unknown { tag: u16 },
}
+ } + _ => None, + } + .unwrap_or(EF::Unknown { tag: rec.tag }); + + Ok(variant) + } + } +} + +/// 4.5.3 -Zip64 Extended Information Extra Field (0x0001) +#[derive(Clone, Default)] +pub struct ExtraZip64Field { + pub uncompressed_size: Option, + pub compressed_size: Option, + pub header_offset: Option, +} + +impl ExtraZip64Field { + const TAG: u16 = 0x0001; + + pub(crate) fn mk_parser( + settings: ExtraFieldSettings, + ) -> impl FnMut(&mut Partial<&'_ [u8]>) -> PResult { + move |i| { + // N.B: we ignore "disk start number" + seq! {Self { + uncompressed_size: cond(settings.needs_uncompressed_size, le_u64), + compressed_size: cond(settings.needs_compressed_size, le_u64), + header_offset: cond(settings.needs_header_offset, le_u64), + }} + .parse_next(i) + } + } +} + +/// Extended timestamp extra field +#[derive(Clone)] +pub struct ExtraTimestampField { + /// number of seconds since epoch + pub mtime: u32, +} + +impl ExtraTimestampField { + const TAG: u16 = 0x5455; + + fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + preceded( + // 1 byte of flags, if bit 0 is set, modification time is present + le_u8.verify(|x| x & 0b1 != 0), + seq! {Self { mtime: le_u32 }}, + ) + .parse_next(i) + } +} + +/// 4.5.7 -UNIX Extra Field (0x000d): +#[derive(Clone)] +pub struct ExtraUnixField { + /// file last access time + pub atime: u32, + /// file last modification time + pub mtime: u32, + /// file user id + pub uid: u16, + /// file group id + pub gid: u16, + /// variable length data field + pub data: ZipBytes, +} + +impl ExtraUnixField { + const TAG: u16 = 0x000d; + const TAG_INFOZIP: u16 = 0x5855; + + fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + let t_size = le_u16.parse_next(i)? - 12; + seq! {Self { + atime: le_u32, + mtime: le_u32, + uid: le_u16, + gid: le_u16, + data: ZipBytes::parser(t_size), + }} + .parse_next(i) + } +} + +/// Info-ZIP New Unix Extra Field: +/// ==================================== +/// +/// Currently stores Unix UIDs/GIDs up to 32 bits. 
+/// (Last Revision 20080509) +/// +/// ```text +/// Value Size Description +/// ----- ---- ----------- +/// 0x7875 Short tag for this extra block type ("ux") +/// TSize Short total data size for this block +/// Version 1 byte version of this extra field, currently 1 +/// UIDSize 1 byte Size of UID field +/// UID Variable UID for this entry +/// GIDSize 1 byte Size of GID field +/// GID Variable GID for this entry +/// ``` +#[derive(Clone)] +pub struct ExtraNewUnixField { + pub uid: u64, + pub gid: u64, +} + +impl ExtraNewUnixField { + const TAG: u16 = 0x7875; + + fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + let _ = tag("\x01").parse_next(i)?; + seq! {Self { + uid: Self::parse_variable_length_integer, + gid: Self::parse_variable_length_integer, + }} + .parse_next(i) + } + + fn parse_variable_length_integer(i: &mut Partial<&'_ [u8]>) -> PResult { + let slice = length_take(le_u8).parse_next(i)?; + if let Some(u) = match slice.len() { + 1 => Some(le_u8.parse_peek(slice)?.1 as u64), + 2 => Some(le_u16.parse_peek(slice)?.1 as u64), + 4 => Some(le_u32.parse_peek(slice)?.1 as u64), + 8 => Some(le_u64.parse_peek(slice)?.1), + _ => None, + } { + Ok(u) + } else { + Err(ErrMode::from_error_kind(i, ErrorKind::Alt)) + } + } +} + +/// 4.5.5 -NTFS Extra Field (0x000a): +#[derive(Clone)] +pub struct ExtraNtfsField { + pub attrs: Vec, +} + +impl ExtraNtfsField { + const TAG: u16 = 0x000a; + + fn parse(i: &mut Partial<&'_ [u8]>) -> PResult { + let _ = take(4_usize).parse_next(i)?; // reserved (unused) + seq! {Self { + // from the winnow docs: + // Parsers like repeat do not know when an eof is from insufficient + // data or the end of the stream, causing them to always report + // Incomplete. 
+ // using repeat_till with eof combinator to work around this: + attrs: repeat_till(0.., NtfsAttr::parse, winnow::combinator::eof).map(|x| x.0), + }} + .parse_next(i) + } +} + +/// NTFS attribute for zip entries (mostly timestamps) +#[derive(Clone)] +pub enum NtfsAttr { + Attr1(NtfsAttr1), + Unknown { tag: u16 }, +} + +impl NtfsAttr { + fn parse(i: &mut Partial<&'_ [u8]>) -> PResult { + let tag = le_u16.parse_next(i)?; + trace!("parsing NTFS attribute, tag {:04x}", tag); + let payload = length_take(le_u16).parse_next(i)?; + + match tag { + 0x0001 => NtfsAttr1::parser + .parse_peek(Partial::new(payload)) + .map(|(_, attr)| NtfsAttr::Attr1(attr)), + _ => Ok(NtfsAttr::Unknown { tag }), + } + } +} + +#[derive(Clone)] +pub struct NtfsAttr1 { + pub mtime: NtfsTimestamp, + pub atime: NtfsTimestamp, + pub ctime: NtfsTimestamp, +} + +impl NtfsAttr1 { + fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + trace!("parsing NTFS attr 1, input len is {}", i.len()); + seq! {Self { + mtime: NtfsTimestamp::parser, + atime: NtfsTimestamp::parser, + ctime: NtfsTimestamp::parser, + }} + .parse_next(i) + } +} diff --git a/rc-zip/src/parse/local.rs b/rc-zip/src/parse/local.rs new file mode 100644 index 0000000..8409176 --- /dev/null +++ b/rc-zip/src/parse/local.rs @@ -0,0 +1,189 @@ +use crate::{Error, Method, MsdosTimestamp, UnsupportedError, Version, ZipBytes, ZipString}; + +use winnow::{ + binary::{le_u16, le_u32, le_u64, le_u8}, + combinator::opt, + error::{ContextError, ErrMode, ErrorKind, FromExternalError}, + seq, + token::tag, + PResult, Parser, Partial, +}; + +#[derive(Debug)] +/// 4.3.7 Local file header +pub struct LocalFileHeaderRecord { + /// version needed to extract + pub reader_version: Version, + /// general purpose bit flag + pub flags: u16, + /// compression method + pub method: Method, + /// last mod file datetime + pub modified: MsdosTimestamp, + /// crc-32 + pub crc32: u32, + /// compressed size + pub compressed_size: u32, + /// uncompressed size + pub 
#[derive(Debug)]
/// Method-specific properties following the local file header
pub enum MethodSpecific {
    /// No additional properties for this compression method
    None,
    /// LZMA properties header (appnote 5.8.5), read when the method is LZMA
    Lzma(LzmaProperties),
}
+ self.flags & 0b1000 != 0 + } +} + +/// 4.3.9 Data descriptor: +#[derive(Debug)] +pub struct DataDescriptorRecord { + /// CRC32 checksum + pub crc32: u32, + /// Compressed size + pub compressed_size: u64, + /// Uncompressed size + pub uncompressed_size: u64, +} + +impl DataDescriptorRecord { + const SIGNATURE: &'static str = "PK\x07\x08"; + + pub fn mk_parser(is_zip64: bool) -> impl FnMut(&mut Partial<&'_ [u8]>) -> PResult { + move |i| { + // From appnote.txt: + // + // 4.3.9.3 Although not originally assigned a signature, the value + // 0x08074b50 has commonly been adopted as a signature value for the + // data descriptor record. Implementers SHOULD be aware that ZIP files + // MAY be encountered with or without this signature marking data + // descriptors and SHOULD account for either case when reading ZIP files + // to ensure compatibility. + let _ = opt(tag(Self::SIGNATURE)).parse_next(i)?; + + if is_zip64 { + seq! {Self { + crc32: le_u32, + compressed_size: le_u64, + uncompressed_size: le_u64, + }} + .parse_next(i) + } else { + seq! {Self { + crc32: le_u32, + compressed_size: le_u32.map(|x| x as u64), + uncompressed_size: le_u32.map(|x| x as u64), + }} + .parse_next(i) + } + } + } +} + +/// 5.8.5 LZMA Properties header +#[derive(Debug)] +pub struct LzmaProperties { + /// major version + pub major: u8, + /// minor version + pub minor: u8, + /// properties size + pub properties_size: u16, +} + +impl LzmaProperties { + pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + seq! 
{Self { + major: le_u8, + minor: le_u8, + properties_size: le_u16, + }} + .parse_next(i) + } + + pub fn error_if_unsupported(&self) -> Result<(), Error> { + if (self.major, self.minor) != (2, 0) { + return Err(Error::Unsupported( + UnsupportedError::LzmaVersionUnsupported { + minor: self.minor, + major: self.major, + }, + )); + } + + const LZMA_PROPERTIES_SIZE: u16 = 5; + if self.properties_size != LZMA_PROPERTIES_SIZE { + return Err(Error::Unsupported( + UnsupportedError::LzmaPropertiesHeaderWrongSize { + expected: 5, + actual: self.properties_size, + }, + )); + } + + Ok(()) + } +} diff --git a/rc-zip/src/parse/mod.rs b/rc-zip/src/parse/mod.rs new file mode 100644 index 0000000..cd09c61 --- /dev/null +++ b/rc-zip/src/parse/mod.rs @@ -0,0 +1,35 @@ +//! Contain winnow parsers for most elements that make up a ZIP file, like the +//! end-of-central-directory record, local file headers, and central directory +//! headers. +//! +//! All parsers here are based off of the PKWARE appnote.txt, which you can find +//! in the source repository. + +pub use crate::encoding::Encoding; + +mod archive; +pub use archive::*; + +mod extra_field; +pub use extra_field::*; + +mod mode; +pub use mode::*; + +mod version; +pub use version::*; + +mod date_time; +pub use date_time::*; + +mod directory_header; +pub use directory_header::*; + +mod eocd; +pub use eocd::*; + +mod local; +pub use local::*; + +mod raw; +pub use raw::*; diff --git a/rc-zip/src/parse/mode.rs b/rc-zip/src/parse/mode.rs new file mode 100644 index 0000000..1baff51 --- /dev/null +++ b/rc-zip/src/parse/mode.rs @@ -0,0 +1,239 @@ +use std::fmt; + +/// Mode represents a file's mode and permission bits. +/// The bits have the same definition on all systems, +/// but not all bits apply to all systems. +/// +/// It is modelled after Go's `os.FileMode`. 
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
// The low 9 bits hold standard Unix rwx permission bits (see Display and
// From<UnixMode>); the high bits are the file-type/attribute flags below.
pub struct Mode(pub u32);

impl Mode {
    /// d: is a directory
    pub const DIR: Self = Self(1 << 31);
    /// a: append-only
    pub const APPEND: Self = Self(1 << 30);
    /// l: exclusive use
    pub const EXCLUSIVE: Self = Self(1 << 29);
    /// T: temporary file; Plan 9 only
    pub const TEMPORARY: Self = Self(1 << 28);
    /// L: symbolic link
    pub const SYMLINK: Self = Self(1 << 27);
    /// D: device file
    pub const DEVICE: Self = Self(1 << 26);
    /// p: named pipe (FIFO)
    pub const NAMED_PIPE: Self = Self(1 << 25);
    /// S: Unix domain socket
    pub const SOCKET: Self = Self(1 << 24);
    /// u: setuid
    pub const SETUID: Self = Self(1 << 23);
    /// g: setgid
    pub const SETGID: Self = Self(1 << 22);
    /// c: Unix character device, when DEVICE is set
    pub const CHAR_DEVICE: Self = Self(1 << 21);
    /// t: sticky
    pub const STICKY: Self = Self(1 << 20);
    /// ?: non-regular file; nothing else is known
    pub const IRREGULAR: Self = Self(1 << 19);
}
rwx = "rwxrwxrwx"; + for (i, c) in rwx.char_indices() { + if self.has(Mode(1 << (9 - 1 - i))) { + write!(f, "{}", c)?; + } else { + write!(f, "-")?; + } + } + + Ok(()) + } +} + +impl From for Mode { + fn from(m: UnixMode) -> Self { + let mut mode = Mode(m.0 & 0o777); + + match m & UnixMode::IFMT { + UnixMode::IFBLK => mode |= Mode::DEVICE, + UnixMode::IFCHR => mode |= Mode::DEVICE & Mode::CHAR_DEVICE, + UnixMode::IFDIR => mode |= Mode::DIR, + UnixMode::IFIFO => mode |= Mode::NAMED_PIPE, + UnixMode::IFLNK => mode |= Mode::SYMLINK, + UnixMode::IFREG => { /* nothing to do */ } + UnixMode::IFSOCK => mode |= Mode::SOCKET, + _ => {} + } + + if m.has(UnixMode::ISGID) { + mode |= Mode::SETGID + } + if m.has(UnixMode::ISUID) { + mode |= Mode::SETUID + } + if m.has(UnixMode::ISVTX) { + mode |= Mode::STICKY + } + + mode + } +} + +impl From for Mode { + fn from(m: MsdosMode) -> Self { + let mut mode = if m.has(MsdosMode::DIR) { + Mode::DIR | Mode(0o777) + } else { + Mode(0o666) + }; + if m.has(MsdosMode::READ_ONLY) { + mode &= Mode(0o222); + } + + mode + } +} + +impl From for Mode { + fn from(u: u32) -> Self { + Mode(u) + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct UnixMode(pub u32); + +impl UnixMode { + pub const IFMT: Self = Self(0xf000); + pub const IFSOCK: Self = Self(0xc000); + pub const IFLNK: Self = Self(0xa000); + pub const IFREG: Self = Self(0x8000); + pub const IFBLK: Self = Self(0x6000); + pub const IFDIR: Self = Self(0x4000); + pub const IFCHR: Self = Self(0x2000); + pub const IFIFO: Self = Self(0x1000); + pub const ISUID: Self = Self(0x800); + pub const ISGID: Self = Self(0x400); + pub const ISVTX: Self = Self(0x200); +} + +impl From for UnixMode { + fn from(u: u32) -> Self { + UnixMode(u) + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct MsdosMode(pub u32); + +impl MsdosMode { + pub const DIR: Self = Self(0x10); + pub const READ_ONLY: Self = Self(0x01); +} + +impl From for MsdosMode { + fn from(u: u32) -> Self { + 
MsdosMode(u) + } +} + +macro_rules! derive_bitops { + ($T: ty) => { + impl std::ops::BitOr for $T { + type Output = Self; + + fn bitor(self, rhs: Self) -> Self { + Self(self.0 | rhs.0) + } + } + + impl std::ops::BitOrAssign for $T { + fn bitor_assign(&mut self, rhs: Self) { + self.0 |= rhs.0; + } + } + + impl std::ops::BitAnd for $T { + type Output = Self; + + fn bitand(self, rhs: Self) -> Self { + Self(self.0 & rhs.0) + } + } + + impl std::ops::BitAndAssign for $T { + fn bitand_assign(&mut self, rhs: Self) { + self.0 &= rhs.0; + } + } + + impl $T { + pub fn has(&self, rhs: Self) -> bool { + self.0 & rhs.0 != 0 + } + } + }; +} + +derive_bitops!(Mode); +derive_bitops!(UnixMode); +derive_bitops!(MsdosMode); diff --git a/rc-zip/src/parse/raw.rs b/rc-zip/src/parse/raw.rs new file mode 100644 index 0000000..fb978ab --- /dev/null +++ b/rc-zip/src/parse/raw.rs @@ -0,0 +1,77 @@ +use pretty_hex::PrettyHex; +use std::fmt; +use winnow::{stream::ToUsize, token::take, PResult, Parser, Partial}; + +/// A raw zip string, with no specific encoding. +/// +/// This is used while parsing a zip archive's central directory, +/// before we know what encoding is used. +#[derive(Clone)] +pub struct ZipString(pub Vec); + +impl<'a> From<&'a [u8]> for ZipString { + fn from(slice: &'a [u8]) -> Self { + Self(slice.into()) + } +} + +impl fmt::Debug for ZipString { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match std::str::from_utf8(&self.0) { + Ok(s) => write!(f, "{:?}", s), + Err(_) => write!(f, "[non-utf8 string: {}]", self.0.hex_dump()), + } + } +} + +impl ZipString { + pub(crate) fn parser(count: C) -> impl FnMut(&mut Partial<&'_ [u8]>) -> PResult + where + C: ToUsize, + { + let count = count.to_usize(); + move |i| (take(count).map(|slice: &[u8]| Self(slice.into()))).parse_next(i) + } + + pub(crate) fn into_option(self) -> Option { + if !self.0.is_empty() { + Some(self) + } else { + None + } + } +} + +/// A raw u8 slice, with no specific structure. 
+/// +/// This is used while parsing a zip archive, when we want +/// to retain an owned slice to be parsed later. +#[derive(Clone)] +pub struct ZipBytes(pub Vec); + +impl fmt::Debug for ZipBytes { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + const MAX_SHOWN_SIZE: usize = 10; + let data = &self.0[..]; + let (slice, extra) = if data.len() > MAX_SHOWN_SIZE { + (&self.0[..MAX_SHOWN_SIZE], Some(data.len() - MAX_SHOWN_SIZE)) + } else { + (&self.0[..], None) + }; + write!(f, "{}", slice.hex_dump())?; + if let Some(extra) = extra { + write!(f, " (+ {} bytes)", extra)?; + } + Ok(()) + } +} + +impl ZipBytes { + pub(crate) fn parser(count: C) -> impl FnMut(&mut Partial<&'_ [u8]>) -> PResult + where + C: ToUsize, + { + let count = count.to_usize(); + move |i| (take(count).map(|slice: &[u8]| Self(slice.into()))).parse_next(i) + } +} diff --git a/rc-zip/src/parse/version.rs b/rc-zip/src/parse/version.rs new file mode 100644 index 0000000..1b9ac8f --- /dev/null +++ b/rc-zip/src/parse/version.rs @@ -0,0 +1,133 @@ +use std::fmt; +use winnow::{binary::le_u16, PResult, Parser, Partial}; + +/// A zip version (either created by, or required when reading an archive). +/// +/// Versions determine which features are supported by a tool, and +/// which features are required when reading a file. +/// +/// For more information, see the [.ZIP Application Note](https://support.pkware.com/display/PKZIP/APPNOTE), section 4.4.2. +#[derive(Clone, Copy, PartialEq, Eq, Hash)] +pub struct Version(pub u16); + +impl fmt::Debug for Version { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "{:?} v{}.{}", + self.host_system(), + self.major(), + self.minor() + ) + } +} + +impl Version { + /// Parse a version from a byte slice + pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + le_u16.map(Self).parse_next(i) + } + + /// Identifies the host system on which the zip attributes are compatible. 
+ pub fn host_system(&self) -> HostSystem { + match self.host() { + 0 => HostSystem::MsDos, + 1 => HostSystem::Amiga, + 2 => HostSystem::OpenVms, + 3 => HostSystem::Unix, + 4 => HostSystem::VmCms, + 5 => HostSystem::AtariSt, + 6 => HostSystem::Os2Hpfs, + 7 => HostSystem::Macintosh, + 8 => HostSystem::ZSystem, + 9 => HostSystem::CpM, + 10 => HostSystem::WindowsNtfs, + 11 => HostSystem::Mvs, + 12 => HostSystem::Vse, + 13 => HostSystem::AcornRisc, + 14 => HostSystem::Vfat, + 15 => HostSystem::AlternateMvs, + 16 => HostSystem::BeOs, + 17 => HostSystem::Tandem, + 18 => HostSystem::Os400, + 19 => HostSystem::Osx, + n => HostSystem::Unknown(n), + } + } + + /// Integer host system + pub fn host(&self) -> u8 { + (self.0 >> 8) as u8 + } + + /// Integer version, e.g. 45 for Zip version 4.5 + pub fn version(&self) -> u8 { + (self.0 & 0xff) as u8 + } + + /// ZIP specification major version + /// + /// See APPNOTE, section 4.4.2.1 + pub fn major(&self) -> u32 { + self.version() as u32 / 10 + } + + /// ZIP specification minor version + /// + /// See APPNOTE, section 4.4.2.1 + pub fn minor(&self) -> u32 { + self.version() as u32 % 10 + } +} + +/// System on which an archive was created, as encoded into a version u16. 
+/// +/// See APPNOTE, section 4.4.2.2 +#[derive(Debug)] +pub enum HostSystem { + /// MS-DOS and OS/2 (FAT / VFAT / FAT32 file systems) + MsDos, + /// Amiga + Amiga, + /// OpenVMS + OpenVms, + /// UNIX + Unix, + /// VM/CMS + VmCms, + /// Atari ST + AtariSt, + /// OS/2 H.P.F.S + Os2Hpfs, + /// Macintosh (see `Osx`) + Macintosh, + /// Z-System + ZSystem, + /// CP/M + CpM, + /// Windows NTFS + WindowsNtfs, + /// MVS (OS/390 - Z/OS) + Mvs, + /// VSE + Vse, + /// Acorn Risc + AcornRisc, + /// VFAT + Vfat, + /// alternate MVS + AlternateMvs, + /// BeOS + BeOs, + /// Tandem + Tandem, + /// OS/400 + Os400, + /// OS X (Darwin) + Osx, + /// Unknown host system + /// + /// Values 20 through 255 are currently unused, as of + /// APPNOTE.TXT 6.3.6 (April 26, 2019) + Unknown(u8), +} From 455b5b9083cdefb34209c636d1d4cde2041850ef Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 12:13:04 +0100 Subject: [PATCH 20/49] Document every single exported symbol --- Cargo.lock | 1 + rc-zip-sync/src/entry_reader/mod.rs | 4 +- rc-zip-sync/src/lib.rs | 6 ++- rc-zip-sync/src/read_zip.rs | 26 ++++++----- rc-zip-sync/tests/integration_tests.rs | 22 +++------- rc-zip-tokio/src/entry_reader/mod.rs | 2 +- rc-zip-tokio/src/lib.rs | 6 ++- rc-zip-tokio/src/read_zip.rs | 21 +++++---- rc-zip/Cargo.toml | 1 + rc-zip/src/error.rs | 48 ++++++++++++++++---- rc-zip/src/fsm/archive.rs | 26 +++++------ rc-zip/src/fsm/entry.rs | 32 ++++++++++++++ rc-zip/src/fsm/mod.rs | 13 +++--- rc-zip/src/lib.rs | 13 ++++++ rc-zip/src/parse/archive.rs | 10 +++++ rc-zip/src/parse/date_time.rs | 5 +++ rc-zip/src/parse/directory_header.rs | 50 ++++++++++++++------- rc-zip/src/parse/eocd.rs | 61 ++++++++++++++++++++------ rc-zip/src/parse/extra_field.rs | 28 +++++++++++- rc-zip/src/parse/local.rs | 28 ++++++++++-- rc-zip/src/parse/mode.rs | 27 ++++++++++++ 21 files changed, 325 insertions(+), 105 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 763131c..e363f87 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ 
-827,6 +827,7 @@ dependencies = [ "cfg-if", "chardetng", "chrono", + "crc32fast", "encoding_rs", "num_enum", "oem_cp", diff --git a/rc-zip-sync/src/entry_reader/mod.rs b/rc-zip-sync/src/entry_reader/mod.rs index 94a663f..2e5b57f 100644 --- a/rc-zip-sync/src/entry_reader/mod.rs +++ b/rc-zip-sync/src/entry_reader/mod.rs @@ -61,7 +61,7 @@ enum State { Transitioning, } -pub struct EntryReader +pub(crate) struct EntryReader where R: io::Read, { @@ -240,7 +240,7 @@ where { const DEFAULT_BUFFER_SIZE: usize = 256 * 1024; - pub fn new(entry: &StoredEntry, get_reader: F) -> Self + pub(crate) fn new(entry: &StoredEntry, get_reader: F) -> Self where F: Fn(u64) -> R, { diff --git a/rc-zip-sync/src/lib.rs b/rc-zip-sync/src/lib.rs index f5996fb..66e4a06 100644 --- a/rc-zip-sync/src/lib.rs +++ b/rc-zip-sync/src/lib.rs @@ -1,3 +1,8 @@ +//! A library for reading zip files synchronously using std I/O traits, +//! built on top of [rc-zip](https://crates.io/crates/rc-zip). + +#![warn(missing_docs)] + macro_rules! transition { ($state: expr => ($pattern: pat) $body: expr) => { $state = if let $pattern = std::mem::take(&mut $state) { @@ -13,5 +18,4 @@ mod entry_reader; mod read_zip; // re-exports -pub use entry_reader::EntryReader; pub use read_zip::{HasCursor, ReadZip, ReadZipWithSize, SyncArchive, SyncStoredEntry}; diff --git a/rc-zip-sync/src/read_zip.rs b/rc-zip-sync/src/read_zip.rs index fa21a2e..5044980 100644 --- a/rc-zip-sync/src/read_zip.rs +++ b/rc-zip-sync/src/read_zip.rs @@ -3,13 +3,14 @@ use rc_zip::{ Archive, Error, StoredEntry, }; -use crate::EntryReader; +use crate::entry_reader::EntryReader; use std::{io::Read, ops::Deref}; /// A trait for reading something as a zip archive (blocking I/O model) /// /// See also [ReadZip]. pub trait ReadZipWithSize { + /// The type of the file to read from. type File: HasCursor; /// Reads self as a zip archive. @@ -24,6 +25,7 @@ pub trait ReadZipWithSize { /// /// See also [ReadZipWithSize]. 
pub trait ReadZip { + /// The type of the file to read from. type File: HasCursor; /// Reads self as a zip archive. @@ -41,23 +43,23 @@ where fn read_zip_with_size(&self, size: u64) -> Result, Error> { tracing::trace!(%size, "read_zip_with_size"); - let mut ar = ArchiveFsm::new(size); + let mut fsm = ArchiveFsm::new(size); loop { - if let Some(offset) = ar.wants_read() { - tracing::trace!(%offset, "read_zip_with_size: wants_read, space len = {}", ar.space().len()); - match self.cursor_at(offset).read(ar.space()) { + if let Some(offset) = fsm.wants_read() { + tracing::trace!(%offset, "read_zip_with_size: wants_read, space len = {}", fsm.space().len()); + match self.cursor_at(offset).read(fsm.space()) { Ok(read_bytes) => { tracing::trace!(%read_bytes, "read_zip_with_size: read"); if read_bytes == 0 { return Err(Error::IO(std::io::ErrorKind::UnexpectedEof.into())); } - ar.fill(read_bytes); + fsm.fill(read_bytes); } Err(err) => return Err(Error::IO(err)), } } - match ar.process()? { + fsm = match fsm.process()? { FsmResult::Done(archive) => { tracing::trace!("read_zip_with_size: done"); return Ok(SyncArchive { @@ -65,9 +67,7 @@ where archive, }); } - FsmResult::Continue => { - tracing::trace!("read_zip_with_size: continue"); - } + FsmResult::Continue(fsm) => fsm, } } } @@ -89,6 +89,7 @@ impl ReadZip for Vec { } } +/// A zip archive, read synchronously from a file or other I/O resource. pub struct SyncArchive<'a, F> where F: HasCursor, @@ -133,6 +134,7 @@ where } } +/// A zip entry, read synchronously from a file or other I/O resource. pub struct SyncStoredEntry<'a, F> { file: &'a F, entry: &'a StoredEntry, @@ -151,8 +153,7 @@ where F: HasCursor, { /// Returns a reader for the entry. 
- pub fn reader(&self) -> EntryReader<::Cursor<'a>> { - tracing::trace!("Creating EntryReader"); + pub fn reader(&self) -> impl Read + 'a { EntryReader::new(self.entry, |offset| self.file.cursor_at(offset)) } @@ -166,6 +167,7 @@ where /// A sliceable I/O resource: we can ask for a [Read] at a given offset. pub trait HasCursor { + /// The type of [Read] returned by [cursor_at]. type Cursor<'a>: Read + 'a where Self: 'a; diff --git a/rc-zip-sync/tests/integration_tests.rs b/rc-zip-sync/tests/integration_tests.rs index 4fd962e..f34dda1 100644 --- a/rc-zip-sync/tests/integration_tests.rs +++ b/rc-zip-sync/tests/integration_tests.rs @@ -55,7 +55,6 @@ impl ZipTest { } if let Some(exp_encoding) = self.expected_encoding { - println!("{}: should be {}", self.name(), exp_encoding); assert_eq!(archive.encoding(), exp_encoding); } @@ -351,10 +350,10 @@ fn state_machine() { let cases = test_cases(); let case = cases.iter().find(|x| x.name() == "zip64.zip").unwrap(); let bs = case.bytes(); - let mut zar = ArchiveFsm::new(bs.len() as u64); + let mut fsm = ArchiveFsm::new(bs.len() as u64); let archive = 'read_zip: loop { - if let Some(offset) = zar.wants_read() { + if let Some(offset) = fsm.wants_read() { let increment = 128usize; let offset = offset as usize; let slice = if offset + increment > bs.len() { @@ -363,29 +362,22 @@ fn state_machine() { &bs[offset..offset + increment] }; - let len = cmp::min(slice.len(), zar.space().len()); - println!( - "slice len: {}, zar space len: {}", - slice.len(), - zar.space().len() - ); - zar.space()[..len].copy_from_slice(&slice[..len]); + let len = cmp::min(slice.len(), fsm.space().len()); + fsm.space()[..len].copy_from_slice(&slice[..len]); match len { 0 => panic!("EOF!"), read_bytes => { - println!("at {}, zar read {} bytes", offset, read_bytes); - zar.fill(read_bytes); + fsm.fill(read_bytes); } } } - match zar.process() { + fsm = match fsm.process() { Ok(res) => match res { - FsmResult::Continue => {} + FsmResult::Continue(fsm) => fsm, 
FsmResult::Done(archive) => break 'read_zip archive, }, Err(err) => { - println!("zar processing error: {:#?}", err); panic!("{}", err) } } diff --git a/rc-zip-tokio/src/entry_reader/mod.rs b/rc-zip-tokio/src/entry_reader/mod.rs index 62a2dd1..8d83861 100644 --- a/rc-zip-tokio/src/entry_reader/mod.rs +++ b/rc-zip-tokio/src/entry_reader/mod.rs @@ -54,7 +54,7 @@ pin_project_lite::pin_project! { } pin_project_lite::pin_project! { - pub struct AsyncEntryReader + pub(crate) struct AsyncEntryReader where R: AsyncRead, { diff --git a/rc-zip-tokio/src/lib.rs b/rc-zip-tokio/src/lib.rs index 36af3f0..39b8413 100644 --- a/rc-zip-tokio/src/lib.rs +++ b/rc-zip-tokio/src/lib.rs @@ -1,3 +1,8 @@ +//! A library for reading zip files asynchronously using tokio I/O traits, +//! based on top of [rc-zip](https://crates.io/crates/rc-zip). + +#![warn(missing_docs)] + macro_rules! transition_async { ($state: expr => ($pattern: pat) $body: expr) => { *$state.as_mut() = if let $pattern = std::mem::take($state.as_mut().get_mut()) { @@ -13,7 +18,6 @@ mod entry_reader; mod read_zip; // re-exports -pub use entry_reader::AsyncEntryReader; pub use read_zip::{ AsyncArchive, AsyncReadZip, AsyncReadZipWithSize, AsyncStoredEntry, HasAsyncCursor, }; diff --git a/rc-zip-tokio/src/read_zip.rs b/rc-zip-tokio/src/read_zip.rs index 438bceb..986f3e0 100644 --- a/rc-zip-tokio/src/read_zip.rs +++ b/rc-zip-tokio/src/read_zip.rs @@ -9,12 +9,13 @@ use rc_zip::{ Archive, Error, StoredEntry, }; -use crate::AsyncEntryReader; +use crate::entry_reader::AsyncEntryReader; /// A trait for reading something as a zip archive (blocking I/O model) /// /// See also [ReadZip]. pub trait AsyncReadZipWithSize { + /// The type of the file to read from. type File: HasAsyncCursor; /// Reads self as a zip archive. @@ -33,6 +34,7 @@ pub trait AsyncReadZipWithSize { /// /// See also [ReadZipWithSize]. pub trait AsyncReadZip { + /// The type of the file to read from. type File: HasAsyncCursor; /// Reads self as a zip archive. 
@@ -50,28 +52,28 @@ where type File = F; async fn read_zip_with_size_async(&self, size: u64) -> Result, Error> { - let mut ar = ArchiveFsm::new(size); + let mut fsm = ArchiveFsm::new(size); loop { - if let Some(offset) = ar.wants_read() { - match self.cursor_at(offset).read(ar.space()).await { + if let Some(offset) = fsm.wants_read() { + match self.cursor_at(offset).read(fsm.space()).await { Ok(read_bytes) => { if read_bytes == 0 { return Err(Error::IO(io::ErrorKind::UnexpectedEof.into())); } - ar.fill(read_bytes); + fsm.fill(read_bytes); } Err(err) => return Err(Error::IO(err)), } } - match ar.process()? { + fsm = match fsm.process()? { FsmResult::Done(archive) => { return Ok(AsyncArchive { file: self, archive, }) } - FsmResult::Continue => {} + FsmResult::Continue(fsm) => fsm, } } } @@ -93,6 +95,7 @@ impl AsyncReadZip for Vec { } } +/// A zip archive, read asynchronously from a file or other I/O resource. pub struct AsyncArchive<'a, F> where F: HasAsyncCursor, @@ -137,6 +140,7 @@ where } } +/// A single entry in a zip archive, read asynchronously from a file or other I/O resource. pub struct AsyncStoredEntry<'a, F> { file: &'a F, entry: &'a StoredEntry, @@ -155,7 +159,7 @@ where F: HasAsyncCursor, { /// Returns a reader for the entry. - pub fn reader(&self) -> AsyncEntryReader<::Cursor<'a>> { + pub fn reader(&self) -> impl AsyncRead + Unpin + '_ { tracing::trace!("Creating EntryReader"); AsyncEntryReader::new(self.entry, |offset| self.file.cursor_at(offset)) } @@ -170,6 +174,7 @@ where /// A sliceable I/O resource: we can ask for a [Read] at a given offset. pub trait HasAsyncCursor { + /// The type returned by [cursor_at]. 
type Cursor<'a>: AsyncRead + Unpin + 'a where Self: 'a; diff --git a/rc-zip/Cargo.toml b/rc-zip/Cargo.toml index 65284b0..d963900 100644 --- a/rc-zip/Cargo.toml +++ b/rc-zip/Cargo.toml @@ -27,3 +27,4 @@ thiserror = "1.0.56" chardetng = "0.1.17" num_enum = "0.7.2" cfg-if = "1.0.0" +crc32fast = "1.3.2" diff --git a/rc-zip/src/error.rs b/rc-zip/src/error.rs index dceb7b1..2c7272d 100644 --- a/rc-zip/src/error.rs +++ b/rc-zip/src/error.rs @@ -29,27 +29,45 @@ pub enum Error { } impl Error { + /// Create a new error indicating that the given method is not supported. pub fn method_not_supported(method: Method) -> Self { Self::Unsupported(UnsupportedError::MethodNotSupported(method)) } + /// Create a new error indicating that the given method is not enabled. pub fn method_not_enabled(method: Method) -> Self { Self::Unsupported(UnsupportedError::MethodNotEnabled(method)) } } +/// Some part of the zip format is not supported by this crate. #[derive(Debug, thiserror::Error)] pub enum UnsupportedError { + /// The compression method is not supported. #[error("compression method not supported: {0:?}")] MethodNotSupported(Method), + /// The compression method is supported, but not enabled in this build. #[error("compression method supported, but not enabled in this build: {0:?}")] MethodNotEnabled(Method), + /// The zip file uses a version of LZMA that is not supported. #[error("only LZMA2.0 is supported, found LZMA{minor}.{major}")] - LzmaVersionUnsupported { minor: u8, major: u8 }, + LzmaVersionUnsupported { + /// major version read from LZMA properties header, cf. appnote 5.8.8 + major: u8, + /// minor version read from LZMA properties header, cf. appnote 5.8.8 + minor: u8, + }, + + /// The LZMA properties header is not the expected size. 
#[error("LZMA properties header wrong size: expected {expected} bytes, got {actual} bytes")] - LzmaPropertiesHeaderWrongSize { expected: u16, actual: u16 }, + LzmaPropertiesHeaderWrongSize { + /// expected size in bytes + expected: u16, + /// actual size in bytes, read from a u16, cf. appnote 5.8.8 + actual: u16, + }, } /// Specific zip format errors, mostly due to invalid zip archives but that could also stem from @@ -80,7 +98,12 @@ pub enum FormatError { /// a certain number of files, but we weren't able to read the same number of central directory /// headers. #[error("invalid central record: expected to read {expected} files, got {actual}")] - InvalidCentralRecord { expected: u16, actual: u16 }, + InvalidCentralRecord { + /// expected number of files + expected: u16, + /// actual number of files + actual: u16, + }, /// An extra field (that we support) was not decoded correctly. /// @@ -94,7 +117,9 @@ pub enum FormatError { /// claimed_records_count * minimum_entry_size, we know it's not a valid zip file. #[error("impossible number of files: claims to have {claimed_records_count}, but zip size is {zip_size}")] ImpossibleNumberOfFiles { + /// number of files claimed in the end of central directory record claimed_records_count: u64, + /// total size of the zip file zip_size: u64, }, @@ -108,14 +133,21 @@ pub enum FormatError { /// The uncompressed size didn't match #[error("uncompressed size didn't match: expected {expected}, got {actual}")] - WrongSize { expected: u64, actual: u64 }, + WrongSize { + /// expected size in bytes (from the local header, data descriptor, etc.) + expected: u64, + /// actual size in bytes (from decompressing the entry) + actual: u64, + }, /// The CRC-32 checksum didn't match. 
#[error("checksum didn't match: expected {expected:x?}, got {actual:x?}")] - WrongChecksum { expected: u32, actual: u32 }, - - #[error("lzma properties larger than max")] - LzmaPropertiesLargerThanMax, + WrongChecksum { + /// expected checksum (from the data descriptor, etc.) + expected: u32, + /// actual checksum (from decompressing the entry) + actual: u32, + }, } impl From for std::io::Error { diff --git a/rc-zip/src/fsm/archive.rs b/rc-zip/src/fsm/archive.rs index cca0c80..60c003a 100644 --- a/rc-zip/src/fsm/archive.rs +++ b/rc-zip/src/fsm/archive.rs @@ -46,9 +46,6 @@ enum State { directory_headers: Vec, }, - /// Done! - Done, - #[default] Transitioning, } @@ -130,7 +127,6 @@ impl ArchiveFsm { ref eocd, .. } => Some(buffer.read_offset(eocd.directory_offset())), - S::Done { .. } => panic!("Called wants_read() on ArchiveReader in Done state"), S::Transitioning => unreachable!(), } } @@ -171,7 +167,7 @@ impl ArchiveFsm { /// /// A result of [FsmResult::Done] contains the [Archive], and indicates that no /// method should ever be called again on this reader. - pub fn process(&mut self) -> Result, Error> { + pub fn process(mut self) -> Result, Error> { use State as S; match self.state { S::ReadEocd { @@ -184,7 +180,7 @@ impl ArchiveFsm { haystack_size, "ReadEocd | need more data" ); - return Ok(FsmResult::Continue); + return Ok(FsmResult::Continue(self)); } match { @@ -216,14 +212,14 @@ impl ArchiveFsm { directory_headers: vec![], } }); - Ok(FsmResult::Continue) + Ok(FsmResult::Continue(self)) } else { trace!("ReadEocd | transition to ReadEocd64Locator"); transition!(self.state => (S::ReadEocd { mut buffer, .. 
}) { buffer.reset(); S::ReadEocd64Locator { buffer, eocdr } }); - Ok(FsmResult::Continue) + Ok(FsmResult::Continue(self)) } } } @@ -233,7 +229,7 @@ impl ArchiveFsm { match EndOfCentralDirectory64Locator::parser.parse_peek(input) { Err(ErrMode::Incomplete(_)) => { // need more data - Ok(FsmResult::Continue) + Ok(FsmResult::Continue(self)) } Err(ErrMode::Backtrack(_)) | Err(ErrMode::Cut(_)) => { // we don't have a zip64 end of central directory locator - that's ok! @@ -247,7 +243,7 @@ impl ArchiveFsm { directory_headers: vec![], } }); - Ok(FsmResult::Continue) + Ok(FsmResult::Continue(self)) } Ok((_, locator)) => { trace!( @@ -262,7 +258,7 @@ impl ArchiveFsm { eocdr, } }); - Ok(FsmResult::Continue) + Ok(FsmResult::Continue(self)) } } } @@ -271,7 +267,7 @@ impl ArchiveFsm { match EndOfCentralDirectory64Record::parser.parse_peek(input) { Err(ErrMode::Incomplete(_)) => { // need more data - Ok(FsmResult::Continue) + Ok(FsmResult::Continue(self)) } Err(ErrMode::Backtrack(_)) | Err(ErrMode::Cut(_)) => { // at this point, we really expected to have a zip64 end @@ -291,7 +287,7 @@ impl ArchiveFsm { directory_headers: vec![], } }); - Ok(FsmResult::Continue) + Ok(FsmResult::Continue(self)) } } } @@ -402,7 +398,6 @@ impl ArchiveFsm { comment = Some(encoding.decode(&eocd.comment().0)?); } - self.state = S::Done; return Ok(FsmResult::Done(Archive { size: self.size, comment, @@ -426,9 +421,8 @@ impl ArchiveFsm { buffer.consume(consumed); // need more data - Ok(FsmResult::Continue) + Ok(FsmResult::Continue(self)) } - S::Done { .. 
} => panic!("Called process() on ArchiveReader in Done state"), S::Transitioning => unreachable!(), } } diff --git a/rc-zip/src/fsm/entry.rs b/rc-zip/src/fsm/entry.rs index d4c908a..01794af 100644 --- a/rc-zip/src/fsm/entry.rs +++ b/rc-zip/src/fsm/entry.rs @@ -1,5 +1,36 @@ +// FIXME: remove +#![allow(unused)] + +use oval::Buffer; + +use crate::{DataDescriptorRecord, LocalFileHeaderRecord}; + +struct EntryReadMetrics { + uncompressed_size: u64, + crc32: u32, +} + #[derive(Default)] enum State { + ReadLocalHeader { + buffer: Buffer, + }, + ReadData { + hasher: crc32fast::Hasher, + uncompressed_size: u64, + header: LocalFileHeaderRecord, + }, + ReadDataDescriptor { + metrics: EntryReadMetrics, + header: LocalFileHeaderRecord, + buffer: Buffer, + }, + Validate { + metrics: EntryReadMetrics, + header: LocalFileHeaderRecord, + descriptor: Option, + }, + /// Done! Done, @@ -7,6 +38,7 @@ enum State { Transition, } +/// A state machine that can parse a zip entry pub struct EntryFsm { state: State, } diff --git a/rc-zip/src/fsm/mod.rs b/rc-zip/src/fsm/mod.rs index 33a0404..41709d2 100644 --- a/rc-zip/src/fsm/mod.rs +++ b/rc-zip/src/fsm/mod.rs @@ -24,11 +24,10 @@ mod entry; pub use entry::EntryFsm; /// Indicates whether or not the state machine has completed its work -pub enum FsmResult { - /// Indicates that the state machine still has work to do, and - /// needs either data or a call to process - Continue, - /// Indicates that the state machine has completed its work, and - /// the result is the value provided - Done(T), +pub enum FsmResult { + /// The I/O loop needs to continue, the state machine is given back. + Continue(M), + + /// The state machine is done, and the result is returned. + Done(R), } diff --git a/rc-zip/src/lib.rs b/rc-zip/src/lib.rs index 7193e91..16f68d3 100644 --- a/rc-zip/src/lib.rs +++ b/rc-zip/src/lib.rs @@ -1,3 +1,16 @@ +#![warn(missing_docs)] + +//! 
rc-zip is a [sans-io](https://sans-io.readthedocs.io/how-to-sans-io.html) library for reading zip files. +//! +//! It's made up of a bunch of types representing the various parts of a zip +//! file, winnow parsers that can turn byte buffers into those types, and +//! state machines that can use those parsers to read zip files from a stream. +//! +//! [rc-zip-sync](https://crates.io/crates/rc-zip-sync) and +//! [rc-zip-tokio](https://crates.io/crates/rc-zip-tokio) build on top of this +//! to provide a higher-level API for reading zip files, from sync and async +//! code respectively. + mod encoding; mod error; diff --git a/rc-zip/src/parse/archive.rs b/rc-zip/src/parse/archive.rs index 421b72a..0d38843 100644 --- a/rc-zip/src/parse/archive.rs +++ b/rc-zip/src/parse/archive.rs @@ -142,9 +142,13 @@ pub struct StoredEntry { /// but they are also made available here raw. pub extra_fields: Vec, + /// These fields are cheap to clone and needed for entry readers, + /// hence them being in a separate struct pub inner: StoredEntryInner, } +/// Fields required to read an entry properly, typically cloned into owned entry +/// readers. #[derive(Clone, Copy, Debug)] pub struct StoredEntryInner { /// CRC-32 hash as found in the central directory. @@ -259,12 +263,18 @@ impl StoredEntry { /// The contents of an entry: a directory, a file, or a symbolic link. #[derive(Debug)] pub enum EntryContents { + /// The entry is a directory Directory, + + /// The entry is a file File, + + /// The entry is a symbolic link Symlink, } impl StoredEntry { + /// Determine [EntryContents] of this entry based on its mode. pub fn contents(&self) -> EntryContents { if self.mode.has(Mode::SYMLINK) { EntryContents::Symlink diff --git a/rc-zip/src/parse/date_time.rs b/rc-zip/src/parse/date_time.rs index baeee9a..3bcddc8 100644 --- a/rc-zip/src/parse/date_time.rs +++ b/rc-zip/src/parse/date_time.rs @@ -13,7 +13,10 @@ use winnow::{ /// Represents dates from year 1980 to 2180, with 2 second precision. 
#[derive(Clone, Copy, Eq, PartialEq)] pub struct MsdosTimestamp { + /// Time in 2-second intervals pub time: u16, + + /// Date in MS-DOS format, cf. https://docs.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-dosdatetimetofiletime pub date: u16, } @@ -27,6 +30,7 @@ impl fmt::Debug for MsdosTimestamp { } impl MsdosTimestamp { + /// Parser for MS-DOS timestamps pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { seq! {Self { time: le_u16, @@ -64,6 +68,7 @@ impl MsdosTimestamp { /// A timestamp in NTFS format. #[derive(Clone, Copy, Eq, PartialEq)] pub struct NtfsTimestamp { + /// Timestamp in 100ns intervals since 1601-01-01 00:00:00 UTC pub timestamp: u64, } diff --git a/rc-zip/src/parse/directory_header.rs b/rc-zip/src/parse/directory_header.rs index 15798be..eb51cc9 100644 --- a/rc-zip/src/parse/directory_header.rs +++ b/rc-zip/src/parse/directory_header.rs @@ -15,41 +15,56 @@ use crate::{ /// 4.3.12 Central directory structure: File header pub struct DirectoryHeader { - // version made by + /// version made by pub creator_version: Version, - // version needed to extract + + /// version needed to extract pub reader_version: Version, - // general purpose bit flag + + /// general purpose bit flag pub flags: u16, - // compression method + + /// compression method pub method: u16, - // last mod file datetime + + /// last mod file datetime pub modified: MsdosTimestamp, - // crc32 + + /// crc32 hash pub crc32: u32, - // compressed size + + /// compressed size pub compressed_size: u32, - // uncompressed size + + /// uncompressed size pub uncompressed_size: u32, - // disk number start + + /// disk number start pub disk_nbr_start: u16, - // internal file attributes + + /// internal file attributes pub internal_attrs: u16, - // external file attributes + + /// external file attributes pub external_attrs: u32, - // relative offset of local header + + /// relative offset of local header pub header_offset: u32, - // name - pub name: ZipString, - // extra - pub 
extra: ZipBytes, // comment + /// name + pub name: ZipString, // FIXME: should this be Cow? + + /// extra + pub extra: ZipBytes, // FIXME: should this be Cow<[u8]>? + + /// comment pub comment: ZipString, } impl DirectoryHeader { const SIGNATURE: &'static str = "PK\x01\x02"; + /// Parser for the central directory file header pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { _ = tag(Self::SIGNATURE).parse_next(i)?; let creator_version = Version::parser.parse_next(i)?; @@ -93,6 +108,7 @@ impl DirectoryHeader { } impl DirectoryHeader { + /// Returns true if the name or comment is not valid UTF-8 pub fn is_non_utf8(&self) -> bool { let (valid1, require1) = detect_utf8(&self.name.0[..]); let (valid2, require2) = detect_utf8(&self.comment.0[..]); @@ -113,6 +129,8 @@ impl DirectoryHeader { self.flags & 0x800 == 0 } + /// Converts the directory header into a stored entry: this involves + /// parsing the extra fields and converting the timestamps. pub fn as_stored_entry( &self, is_zip64: bool, diff --git a/rc-zip/src/parse/eocd.rs b/rc-zip/src/parse/eocd.rs index 65e747d..35edce1 100644 --- a/rc-zip/src/parse/eocd.rs +++ b/rc-zip/src/parse/eocd.rs @@ -13,16 +13,22 @@ use crate::{Error, FormatError, ZipString}; pub struct EndOfCentralDirectoryRecord { /// number of this disk pub disk_nbr: u16, + /// number of the disk with the start of the central directory pub dir_disk_nbr: u16, + /// total number of entries in the central directory on this disk pub dir_records_this_disk: u16, + /// total number of entries in the central directory pub directory_records: u16, - // size of the central directory + + /// size of the central directory pub directory_size: u32, + /// offset of start of central directory with respect to the starting disk number pub directory_offset: u32, + /// .ZIP file comment pub comment: ZipString, } @@ -32,6 +38,7 @@ impl EndOfCentralDirectoryRecord { const MIN_LENGTH: usize = 20; const SIGNATURE: &'static str = "PK\x05\x06"; + /// Find the end of central 
directory record in a block of data pub fn find_in_block(b: &[u8]) -> Option> { for i in (0..(b.len() - Self::MIN_LENGTH + 1)).rev() { let mut input = Partial::new(&b[i..]); @@ -45,6 +52,7 @@ impl EndOfCentralDirectoryRecord { None } + /// Parser for the end of central directory record pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { let _ = tag(Self::SIGNATURE).parse_next(i)?; seq! {Self { @@ -72,9 +80,11 @@ pub struct EndOfCentralDirectory64Locator { } impl EndOfCentralDirectory64Locator { + /// Length of the locator pub const LENGTH: usize = 20; const SIGNATURE: &'static str = "PK\x06\x07"; + /// Parser for the zip64 end of central directory locator pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { _ = tag(Self::SIGNATURE).parse_next(i)?; seq! {Self { @@ -91,28 +101,37 @@ impl EndOfCentralDirectory64Locator { pub struct EndOfCentralDirectory64Record { /// size of zip64 end of central directory record pub record_size: u64, + /// version made by pub creator_version: u16, + /// version needed to extract pub reader_version: u16, + /// number of this disk pub disk_nbr: u32, + /// number of the disk with the start of the central directory pub dir_disk_nbr: u32, - // total number of entries in the central directory on this disk + + /// total number of entries in the central directory on this disk pub dir_records_this_disk: u64, - // total number of entries in the central directory + + /// total number of entries in the central directory pub directory_records: u64, - // size of the central directory + + /// size of the central directory pub directory_size: u64, - // offset of the start of central directory with respect to the - // starting disk number + + /// offset of the start of central directory with respect to the + /// starting disk number pub directory_offset: u64, } impl EndOfCentralDirectory64Record { const SIGNATURE: &'static str = "PK\x06\x06"; + /// Parser for the zip64 end of central directory record pub fn parser(i: &mut Partial<&'_ [u8]>) -> 
PResult { _ = tag(Self::SIGNATURE).parse_next(i)?; seq! {Self { @@ -130,9 +149,13 @@ impl EndOfCentralDirectory64Record { } } +/// A zip structure and its location in the input file #[derive(Debug)] pub struct Located { + /// Absolute by offset from the start of the file pub offset: u64, + + /// The structure itself pub inner: T, } @@ -151,13 +174,19 @@ impl std::ops::DerefMut for Located { /// Coalesces zip and zip64 "end of central directory" record info pub struct EndOfCentralDirectory { + /// The end of central directory record pub dir: Located, + + /// The zip64 end of central directory record pub dir64: Option>, + + /// Zip files may be prepended by arbitrary data, this is how much + /// data is at the beginning of the file that isn't part of the zip pub global_offset: i64, } impl EndOfCentralDirectory { - pub fn new( + pub(crate) fn new( size: u64, dir: Located, dir64: Option>, @@ -223,42 +252,48 @@ impl EndOfCentralDirectory { Ok(res) } - pub fn located_directory_offset(&self) -> u64 { + #[inline] + pub(crate) fn located_directory_offset(&self) -> u64 { match self.dir64.as_ref() { Some(d64) => d64.offset, None => self.dir.offset, } } - pub fn directory_offset(&self) -> u64 { + #[inline] + pub(crate) fn directory_offset(&self) -> u64 { match self.dir64.as_ref() { Some(d64) => d64.directory_offset, None => self.dir.directory_offset as u64, } } - pub fn directory_size(&self) -> u64 { + #[inline] + pub(crate) fn directory_size(&self) -> u64 { match self.dir64.as_ref() { Some(d64) => d64.directory_size, None => self.dir.directory_size as u64, } } - pub fn set_directory_offset(&mut self, offset: u64) { + #[inline] + pub(crate) fn set_directory_offset(&mut self, offset: u64) { match self.dir64.as_mut() { Some(d64) => d64.directory_offset = offset, None => self.dir.directory_offset = offset as u32, }; } - pub fn directory_records(&self) -> u64 { + #[inline] + pub(crate) fn directory_records(&self) -> u64 { match self.dir64.as_ref() { Some(d64) => 
d64.directory_records, None => self.dir.directory_records as u64, } } - pub fn comment(&self) -> &ZipString { + #[inline] + pub(crate) fn comment(&self) -> &ZipString { &self.dir.comment } } diff --git a/rc-zip/src/parse/extra_field.rs b/rc-zip/src/parse/extra_field.rs index daf31f1..224eef9 100644 --- a/rc-zip/src/parse/extra_field.rs +++ b/rc-zip/src/parse/extra_field.rs @@ -59,7 +59,10 @@ pub enum ExtraField { /// NTFS (Win9x/WinNT FileTimes) Ntfs(ExtraNtfsField), /// Unknown extra field, with tag - Unknown { tag: u16 }, + Unknown { + /// tag of the extra field + tag: u16, + }, } impl ExtraField { @@ -100,8 +103,13 @@ impl ExtraField { /// 4.5.3 -Zip64 Extended Information Extra Field (0x0001) #[derive(Clone, Default)] pub struct ExtraZip64Field { + /// 64-bit uncompressed size pub uncompressed_size: Option, + + /// 64-bit compressed size pub compressed_size: Option, + + /// 64-bit header offset pub header_offset: Option, } @@ -194,7 +202,10 @@ impl ExtraUnixField { /// ``` #[derive(Clone)] pub struct ExtraNewUnixField { + /// file user id pub uid: u64, + + /// file group id pub gid: u64, } @@ -229,6 +240,7 @@ impl ExtraNewUnixField { /// 4.5.5 -NTFS Extra Field (0x000a): #[derive(Clone)] pub struct ExtraNtfsField { + /// NTFS attributes pub attrs: Vec, } @@ -252,8 +264,14 @@ impl ExtraNtfsField { /// NTFS attribute for zip entries (mostly timestamps) #[derive(Clone)] pub enum NtfsAttr { + /// NTFS attribute 1, which contains modified/accessed/created timestamps Attr1(NtfsAttr1), - Unknown { tag: u16 }, + + /// Unknown NTFS attribute + Unknown { + /// tag of the attribute + tag: u16, + }, } impl NtfsAttr { @@ -271,10 +289,16 @@ impl NtfsAttr { } } +/// NTFS attribute 1, which contains modified/accessed/created timestamps #[derive(Clone)] pub struct NtfsAttr1 { + /// modified time pub mtime: NtfsTimestamp, + + /// accessed time pub atime: NtfsTimestamp, + + /// created time pub ctime: NtfsTimestamp, } diff --git a/rc-zip/src/parse/local.rs 
b/rc-zip/src/parse/local.rs index 8409176..bca3bfe 100644 --- a/rc-zip/src/parse/local.rs +++ b/rc-zip/src/parse/local.rs @@ -14,37 +14,50 @@ use winnow::{ pub struct LocalFileHeaderRecord { /// version needed to extract pub reader_version: Version, + /// general purpose bit flag pub flags: u16, + /// compression method pub method: Method, + /// last mod file datetime pub modified: MsdosTimestamp, + /// crc-32 pub crc32: u32, + /// compressed size pub compressed_size: u32, + /// uncompressed size pub uncompressed_size: u32, - // file name + + /// file name pub name: ZipString, - // extra field + + /// extra field pub extra: ZipBytes, - // method-specific fields + /// method-specific fields pub method_specific: MethodSpecific, } #[derive(Debug)] /// Method-specific properties following the local file header pub enum MethodSpecific { + /// No method-specific properties None, + + /// LZMA properties Lzma(LzmaProperties), } impl LocalFileHeaderRecord { + /// The signature for a local file header pub const SIGNATURE: &'static str = "PK\x03\x04"; + /// Parser for the local file header pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { let _ = tag(Self::SIGNATURE).parse_next(i)?; @@ -91,6 +104,8 @@ impl LocalFileHeaderRecord { }) } + /// Check for the presence of the bit flag that indicates a data descriptor + /// is present after the file data. pub fn has_data_descriptor(&self) -> bool { // 4.3.9.1 This descriptor MUST exist if bit 3 of the general // purpose bit flag is set (see below). @@ -112,6 +127,7 @@ pub struct DataDescriptorRecord { impl DataDescriptorRecord { const SIGNATURE: &'static str = "PK\x07\x08"; + /// Create a parser for the data descriptor record. pub fn mk_parser(is_zip64: bool) -> impl FnMut(&mut Partial<&'_ [u8]>) -> PResult { move |i| { // From appnote.txt: @@ -155,7 +171,12 @@ pub struct LzmaProperties { } impl LzmaProperties { + /// Parser for the LZMA properties header. 
pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { + // Note: the actual properties (5 bytes, contains dictionary size, + // and various other settings) is not actually read, because lzma-rs + // reads those properties itself. + seq! {Self { major: le_u8, minor: le_u8, @@ -164,6 +185,7 @@ impl LzmaProperties { .parse_next(i) } + /// Check if the LZMA version is supported. pub fn error_if_unsupported(&self) -> Result<(), Error> { if (self.major, self.minor) != (2, 0) { return Err(Error::Unsupported( diff --git a/rc-zip/src/parse/mode.rs b/rc-zip/src/parse/mode.rs index 1baff51..9185eec 100644 --- a/rc-zip/src/parse/mode.rs +++ b/rc-zip/src/parse/mode.rs @@ -159,20 +159,42 @@ impl From for Mode { } } +/// UnixMode represents the file mode and permission bits for Unix systems. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub struct UnixMode(pub u32); impl UnixMode { + /// bit mask for the file type bit fields pub const IFMT: Self = Self(0xf000); + + /// the file is a socket pub const IFSOCK: Self = Self(0xc000); + + /// the file is a symbolic link pub const IFLNK: Self = Self(0xa000); + + /// the file is a regular file pub const IFREG: Self = Self(0x8000); + + /// the file is a block device pub const IFBLK: Self = Self(0x6000); + + /// the file is a directory pub const IFDIR: Self = Self(0x4000); + + /// the file is a character device pub const IFCHR: Self = Self(0x2000); + + /// the file is a FIFO pub const IFIFO: Self = Self(0x1000); + + /// the file is set-user-ID pub const ISUID: Self = Self(0x800); + + /// the file is set-group-ID pub const ISGID: Self = Self(0x400); + + /// the file is sticky pub const ISVTX: Self = Self(0x200); } @@ -182,11 +204,15 @@ impl From for UnixMode { } } +/// MsdosMode represents the file mode and permission bits for MS-DOS #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub struct MsdosMode(pub u32); impl MsdosMode { + /// the file is a directory pub const DIR: Self = Self(0x10); + + /// the file is read-only pub const 
READ_ONLY: Self = Self(0x01); } @@ -227,6 +253,7 @@ macro_rules! derive_bitops { } impl $T { + /// Check if the mode has the given bits set. pub fn has(&self, rhs: Self) -> bool { self.0 & rhs.0 != 0 } From 444a1f2f012139e639b8ebcd86f321c465dbbc0b Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 12:23:03 +0100 Subject: [PATCH 21/49] Take care of docs --- rc-zip-sync/examples/jean.rs | 18 +++++++++--------- rc-zip-sync/src/entry_reader/mod.rs | 4 ++-- rc-zip-sync/src/lib.rs | 1 + rc-zip-sync/src/read_zip.rs | 3 ++- rc-zip-sync/tests/integration_tests.rs | 14 +++++++++----- rc-zip-tokio/src/entry_reader/mod.rs | 4 ++-- rc-zip-tokio/src/lib.rs | 1 + rc-zip-tokio/src/read_zip.rs | 3 ++- rc-zip/src/encoding.rs | 8 +++++--- rc-zip/src/error.rs | 2 +- rc-zip/src/fsm/archive.rs | 9 ++++++--- rc-zip/src/fsm/entry.rs | 2 +- rc-zip/src/fsm/mod.rs | 2 ++ rc-zip/src/lib.rs | 11 +++-------- rc-zip/src/parse/archive.rs | 5 ++++- rc-zip/src/parse/directory_header.rs | 11 ++++++++--- rc-zip/src/parse/eocd.rs | 5 ++++- rc-zip/src/parse/extra_field.rs | 2 +- rc-zip/src/parse/local.rs | 5 ++++- rc-zip/src/parse/mod.rs | 4 ++-- 20 files changed, 69 insertions(+), 45 deletions(-) diff --git a/rc-zip-sync/examples/jean.rs b/rc-zip-sync/examples/jean.rs index df76da1..bf613d4 100644 --- a/rc-zip-sync/examples/jean.rs +++ b/rc-zip-sync/examples/jean.rs @@ -1,7 +1,7 @@ use cfg_if::cfg_if; use clap::{Parser, Subcommand}; use humansize::{format_size, BINARY}; -use rc_zip::EntryContents; +use rc_zip::parse::{Archive, EntryContents, Method, Version}; use rc_zip_sync::ReadZip; use std::{ @@ -75,7 +75,7 @@ fn main() { } fn do_main(cli: Cli) -> Result<(), Box> { - fn info(archive: &rc_zip::Archive) { + fn info(archive: &Archive) { if let Some(comment) = archive.comment() { println!("Comment:\n{}", comment); } @@ -84,9 +84,9 @@ fn do_main(cli: Cli) -> Result<(), Box> { println!("Found Zip64 end of central directory locator") } - let mut creator_versions = HashSet::::new(); - let mut 
reader_versions = HashSet::::new(); - let mut methods = HashSet::::new(); + let mut creator_versions = HashSet::::new(); + let mut reader_versions = HashSet::::new(); + let mut methods = HashSet::::new(); let mut compressed_size: u64 = 0; let mut uncompressed_size: u64 = 0; let mut num_dirs = 0; @@ -97,13 +97,13 @@ fn do_main(cli: Cli) -> Result<(), Box> { creator_versions.insert(entry.creator_version); reader_versions.insert(entry.reader_version); match entry.contents() { - rc_zip::EntryContents::Symlink => { + EntryContents::Symlink => { num_symlinks += 1; } - rc_zip::EntryContents::Directory => { + EntryContents::Directory => { num_dirs += 1; } - rc_zip::EntryContents::File => { + EntryContents::File => { methods.insert(entry.method()); num_files += 1; compressed_size += entry.inner.compressed_size; @@ -160,7 +160,7 @@ fn do_main(cli: Cli) -> Result<(), Box> { gid = Optional(entry.gid), ); - if let rc_zip::EntryContents::Symlink = entry.contents() { + if let EntryContents::Symlink = entry.contents() { let mut target = String::new(); entry.reader().read_to_string(&mut target).unwrap(); print!("\t{target}", target = target); diff --git a/rc-zip-sync/src/entry_reader/mod.rs b/rc-zip-sync/src/entry_reader/mod.rs index 2e5b57f..56613ef 100644 --- a/rc-zip-sync/src/entry_reader/mod.rs +++ b/rc-zip-sync/src/entry_reader/mod.rs @@ -16,8 +16,8 @@ mod zstd_dec; use cfg_if::cfg_if; use oval::Buffer; use rc_zip::{ - DataDescriptorRecord, Error, FormatError, LocalFileHeaderRecord, Method, StoredEntry, - StoredEntryInner, + error::{Error, FormatError}, + parse::{DataDescriptorRecord, LocalFileHeaderRecord, Method, StoredEntry, StoredEntryInner}, }; use std::io; use tracing::trace; diff --git a/rc-zip-sync/src/lib.rs b/rc-zip-sync/src/lib.rs index 66e4a06..230886e 100644 --- a/rc-zip-sync/src/lib.rs +++ b/rc-zip-sync/src/lib.rs @@ -18,4 +18,5 @@ mod entry_reader; mod read_zip; // re-exports +pub use rc_zip; pub use read_zip::{HasCursor, ReadZip, ReadZipWithSize, SyncArchive, 
SyncStoredEntry}; diff --git a/rc-zip-sync/src/read_zip.rs b/rc-zip-sync/src/read_zip.rs index 5044980..8f8aef3 100644 --- a/rc-zip-sync/src/read_zip.rs +++ b/rc-zip-sync/src/read_zip.rs @@ -1,6 +1,7 @@ use rc_zip::{ + error::Error, fsm::{ArchiveFsm, FsmResult}, - Archive, Error, StoredEntry, + parse::{Archive, StoredEntry}, }; use crate::entry_reader::EntryReader; diff --git a/rc-zip-sync/tests/integration_tests.rs b/rc-zip-sync/tests/integration_tests.rs index f34dda1..2a30d04 100644 --- a/rc-zip-sync/tests/integration_tests.rs +++ b/rc-zip-sync/tests/integration_tests.rs @@ -2,7 +2,11 @@ use chrono::{ offset::{FixedOffset, Utc}, DateTime, TimeZone, Timelike, }; -use rc_zip::{Archive, Encoding}; +use rc_zip::{ + encoding::Encoding, + error::Error, + parse::{Archive, EntryContents}, +}; use rc_zip_sync::{HasCursor, ReadZip, SyncArchive, SyncStoredEntry}; use std::{cmp, fs::File, path::PathBuf}; @@ -17,7 +21,7 @@ struct ZipTest { expected_encoding: Option, comment: Option<&'static str>, files: Vec, - error: Option, + error: Option, } impl Default for ZipTest { @@ -33,7 +37,7 @@ impl Default for ZipTest { } impl ZipTest { - fn check(&self, archive: Result, rc_zip::Error>) { + fn check(&self, archive: Result, Error>) { let case_bytes = self.bytes(); if let Some(expected) = &self.error { @@ -111,7 +115,7 @@ impl ZipTestFile { assert!(entry.comment().is_none()); match entry.contents() { - rc_zip::EntryContents::File => { + EntryContents::File => { let actual_bytes = entry.bytes().unwrap(); match &self.content { @@ -131,7 +135,7 @@ impl ZipTestFile { } } } - rc_zip::EntryContents::Symlink | rc_zip::EntryContents::Directory => { + EntryContents::Symlink | EntryContents::Directory => { assert!(matches!(self.content, FileContent::Unchecked)); } } diff --git a/rc-zip-tokio/src/entry_reader/mod.rs b/rc-zip-tokio/src/entry_reader/mod.rs index 8d83861..88e6d3f 100644 --- a/rc-zip-tokio/src/entry_reader/mod.rs +++ b/rc-zip-tokio/src/entry_reader/mod.rs @@ -4,8 +4,8 @@ mod 
deflate_dec; use cfg_if::cfg_if; use oval::Buffer; use rc_zip::{ - DataDescriptorRecord, Error, FormatError, LocalFileHeaderRecord, Method, StoredEntry, - StoredEntryInner, + error::{Error, FormatError}, + parse::{DataDescriptorRecord, LocalFileHeaderRecord, Method, StoredEntry, StoredEntryInner}, }; use std::{io, pin::Pin, task}; use tokio::io::AsyncRead; diff --git a/rc-zip-tokio/src/lib.rs b/rc-zip-tokio/src/lib.rs index 39b8413..16d8128 100644 --- a/rc-zip-tokio/src/lib.rs +++ b/rc-zip-tokio/src/lib.rs @@ -18,6 +18,7 @@ mod entry_reader; mod read_zip; // re-exports +pub use rc_zip; pub use read_zip::{ AsyncArchive, AsyncReadZip, AsyncReadZipWithSize, AsyncStoredEntry, HasAsyncCursor, }; diff --git a/rc-zip-tokio/src/read_zip.rs b/rc-zip-tokio/src/read_zip.rs index 986f3e0..f886795 100644 --- a/rc-zip-tokio/src/read_zip.rs +++ b/rc-zip-tokio/src/read_zip.rs @@ -5,8 +5,9 @@ use positioned_io::{RandomAccessFile, ReadAt}; use tokio::io::{AsyncRead, AsyncReadExt, ReadBuf}; use rc_zip::{ + error::Error, fsm::{ArchiveFsm, FsmResult}, - Archive, Error, StoredEntry, + parse::{Archive, StoredEntry}, }; use crate::entry_reader::AsyncEntryReader; diff --git a/rc-zip/src/encoding.rs b/rc-zip/src/encoding.rs index 6d126bc..1d2c1ec 100644 --- a/rc-zip/src/encoding.rs +++ b/rc-zip/src/encoding.rs @@ -1,6 +1,7 @@ -//! zip entry paths may be encoded in a variety of character encodings. +//! Character encodings used in ZIP files. //! -//! Historically, CP-437 was used, but many modern zip files use UTF-8 with an +//! ZIP entry paths may be encoded in a variety of character encodings: +//! historically, CP-437 was used, but many modern zip files use UTF-8 with an //! optional UTF-8 flag. //! //! 
Others use the system's local character encoding, and we have no choice but @@ -11,7 +12,7 @@ use std::fmt; /// Encodings supported by this crate #[derive(Clone, Copy, PartialEq, Eq, Debug)] pub enum Encoding { - /// UTF-8 + /// [UTF-8](https://en.wikipedia.org/wiki/UTF-8), opt-in for ZIP files. Utf8, /// [Codepage 437](https://en.wikipedia.org/wiki/Code_page_437), also known as @@ -52,6 +53,7 @@ pub enum DecodingError { #[error("text too large to be converted")] StringTooLarge, + /// Text is not valid in the given encoding. #[error("encoding error: {0}")] EncodingError(&'static str), } diff --git a/rc-zip/src/error.rs b/rc-zip/src/error.rs index 2c7272d..699407e 100644 --- a/rc-zip/src/error.rs +++ b/rc-zip/src/error.rs @@ -1,6 +1,6 @@ //! All error types used in this crate -use crate::Method; +use crate::parse::Method; use super::encoding; diff --git a/rc-zip/src/fsm/archive.rs b/rc-zip/src/fsm/archive.rs index 60c003a..5690e2a 100644 --- a/rc-zip/src/fsm/archive.rs +++ b/rc-zip/src/fsm/archive.rs @@ -1,8 +1,11 @@ use super::FsmResult; use crate::{ - encoding::Encoding, Archive, DirectoryHeader, EndOfCentralDirectory, - EndOfCentralDirectory64Locator, EndOfCentralDirectory64Record, EndOfCentralDirectoryRecord, - Error, FormatError, Located, StoredEntry, + encoding::Encoding, + error::{Error, FormatError}, + parse::{ + Archive, DirectoryHeader, EndOfCentralDirectory, EndOfCentralDirectory64Locator, + EndOfCentralDirectory64Record, EndOfCentralDirectoryRecord, Located, StoredEntry, + }, }; use tracing::trace; diff --git a/rc-zip/src/fsm/entry.rs b/rc-zip/src/fsm/entry.rs index 01794af..6a1d693 100644 --- a/rc-zip/src/fsm/entry.rs +++ b/rc-zip/src/fsm/entry.rs @@ -3,7 +3,7 @@ use oval::Buffer; -use crate::{DataDescriptorRecord, LocalFileHeaderRecord}; +use crate::parse::{DataDescriptorRecord, LocalFileHeaderRecord}; struct EntryReadMetrics { uncompressed_size: u64, diff --git a/rc-zip/src/fsm/mod.rs b/rc-zip/src/fsm/mod.rs index 41709d2..d43c324 100644 --- 
a/rc-zip/src/fsm/mod.rs +++ b/rc-zip/src/fsm/mod.rs @@ -1,3 +1,5 @@ +//! State machines built atop parsers, ready to bring your own I/O with. +//! //! Parsers are just part of the puzzle when it comes to zip files: finding the //! central directory is non-trivial and involves seeking around the input: //! [ArchiveFsm] provides a state machine to handle this. diff --git a/rc-zip/src/lib.rs b/rc-zip/src/lib.rs index 16f68d3..7346817 100644 --- a/rc-zip/src/lib.rs +++ b/rc-zip/src/lib.rs @@ -11,12 +11,7 @@ //! to provide a higher-level API for reading zip files, from sync and async //! code respectively. -mod encoding; - -mod error; -pub use error::*; - -mod parse; -pub use parse::*; - +pub mod encoding; +pub mod error; pub mod fsm; +pub mod parse; diff --git a/rc-zip/src/parse/archive.rs b/rc-zip/src/parse/archive.rs index 0d38843..354189e 100644 --- a/rc-zip/src/parse/archive.rs +++ b/rc-zip/src/parse/archive.rs @@ -1,7 +1,10 @@ use chrono::{DateTime, Utc}; use num_enum::{FromPrimitive, IntoPrimitive}; -use crate::{Encoding, ExtraField, Mode, Version}; +use crate::{ + encoding::Encoding, + parse::{ExtraField, Mode, Version}, +}; /// An Archive contains general information about a zip files, /// along with a list of [entries][StoredEntry]. 
diff --git a/rc-zip/src/parse/directory_header.rs b/rc-zip/src/parse/directory_header.rs index eb51cc9..db38717 100644 --- a/rc-zip/src/parse/directory_header.rs +++ b/rc-zip/src/parse/directory_header.rs @@ -8,9 +8,14 @@ use winnow::{ }; use crate::{ - encoding::detect_utf8, zero_datetime, Encoding, Entry, Error, ExtraField, ExtraFieldSettings, - FormatError, HostSystem, Mode, MsdosMode, MsdosTimestamp, NtfsAttr, StoredEntry, - StoredEntryInner, UnixMode, Version, ZipBytes, ZipString, + encoding::detect_utf8, + encoding::Encoding, + error::{Error, FormatError}, + parse::{ + zero_datetime, Entry, ExtraField, ExtraFieldSettings, HostSystem, Mode, MsdosMode, + MsdosTimestamp, NtfsAttr, StoredEntry, StoredEntryInner, UnixMode, Version, ZipBytes, + ZipString, + }, }; /// 4.3.12 Central directory structure: File header diff --git a/rc-zip/src/parse/eocd.rs b/rc-zip/src/parse/eocd.rs index 35edce1..386b091 100644 --- a/rc-zip/src/parse/eocd.rs +++ b/rc-zip/src/parse/eocd.rs @@ -6,7 +6,10 @@ use winnow::{ PResult, Parser, Partial, }; -use crate::{Error, FormatError, ZipString}; +use crate::{ + error::{Error, FormatError}, + parse::ZipString, +}; /// 4.3.16 End of central directory record: #[derive(Debug)] diff --git a/rc-zip/src/parse/extra_field.rs b/rc-zip/src/parse/extra_field.rs index 224eef9..9b3693b 100644 --- a/rc-zip/src/parse/extra_field.rs +++ b/rc-zip/src/parse/extra_field.rs @@ -8,7 +8,7 @@ use winnow::{ PResult, Parser, Partial, }; -use crate::{NtfsTimestamp, ZipBytes}; +use crate::parse::{NtfsTimestamp, ZipBytes}; /// 4.4.28 extra field: (Variable) pub(crate) struct ExtraFieldRecord<'a> { diff --git a/rc-zip/src/parse/local.rs b/rc-zip/src/parse/local.rs index bca3bfe..fc73ef6 100644 --- a/rc-zip/src/parse/local.rs +++ b/rc-zip/src/parse/local.rs @@ -1,4 +1,7 @@ -use crate::{Error, Method, MsdosTimestamp, UnsupportedError, Version, ZipBytes, ZipString}; +use crate::{ + error::{Error, UnsupportedError}, + parse::{Method, MsdosTimestamp, Version, ZipBytes, 
ZipString}, +}; use winnow::{ binary::{le_u16, le_u32, le_u64, le_u8}, diff --git a/rc-zip/src/parse/mod.rs b/rc-zip/src/parse/mod.rs index cd09c61..962c24e 100644 --- a/rc-zip/src/parse/mod.rs +++ b/rc-zip/src/parse/mod.rs @@ -1,3 +1,5 @@ +//! Parsers and types for the various elements that make up a ZIP file. +//! //! Contain winnow parsers for most elements that make up a ZIP file, like the //! end-of-central-directory record, local file headers, and central directory //! headers. @@ -5,8 +7,6 @@ //! All parsers here are based off of the PKWARE appnote.txt, which you can find //! in the source repository. -pub use crate::encoding::Encoding; - mod archive; pub use archive::*; From 955a9e89586f612e9134c83a01ab921cd77ce695 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 13:26:47 +0100 Subject: [PATCH 22/49] Simplify ArchiveFsm --- rc-zip/src/fsm/archive.rs | 152 +++++++++++++++----------------------- rc-zip/src/fsm/entry.rs | 20 +++-- 2 files changed, 76 insertions(+), 96 deletions(-) diff --git a/rc-zip/src/fsm/archive.rs b/rc-zip/src/fsm/archive.rs index 5690e2a..34c69da 100644 --- a/rc-zip/src/fsm/archive.rs +++ b/rc-zip/src/fsm/archive.rs @@ -18,33 +18,44 @@ use winnow::{ /// [ArchiveReader] parses a valid zip archive into an [Archive]. In particular, this struct finds /// an end of central directory record, parses the entire central directory, detects text encoding, /// and normalizes metadata. +/// +/// Look at the integration tests or [rc-zip-sync](https://crates.io/crates/rc-zip-sync) +/// for examples on how to use this struct. 
pub struct ArchiveFsm { - // Size of the entire zip file + /// Size of the entire zip file size: u64, + + /// Current stage: finding the eocd, reading the eocd, reading the eocd64 + /// locator, reading the eocd64, or reading the central directory state: State, + + /// Buffer for reading data from the file + buffer: Buffer, } #[derive(Default)] enum State { /// Finding and reading the end of central directory record - ReadEocd { buffer: Buffer, haystack_size: u64 }, + ReadEocd { + /// size of the haystack in which we're looking for the end of central + /// directory record. + /// this may be less than 65KiB if the file is smaller than that. + haystack_size: u64, + }, /// Reading the zip64 end of central directory record. ReadEocd64Locator { - buffer: Buffer, eocdr: Located, }, /// Reading the zip64 end of central directory record. ReadEocd64 { - buffer: Buffer, eocdr64_offset: u64, eocdr: Located, }, /// Reading all headers from the central directory ReadCentralDirectory { - buffer: Buffer, eocd: EndOfCentralDirectory, directory_headers: Vec, }, @@ -53,24 +64,6 @@ enum State { Transitioning, } -impl State { - fn get_buffer_mut(&mut self) -> Option<&mut Buffer> { - use State as S; - match self { - S::ReadEocd { ref mut buffer, .. } => Some(buffer), - S::ReadEocd64Locator { ref mut buffer, .. } => Some(buffer), - S::ReadEocd64 { ref mut buffer, .. } => Some(buffer), - S::ReadCentralDirectory { ref mut buffer, .. } => Some(buffer), - _ => None, - } - } - - fn expect_buffer_mut(&mut self) -> &mut Buffer { - self.get_buffer_mut() - .expect("called expect_buffer_mut() on invalid state") - } -} - impl ArchiveFsm { /// This should be > 65KiB, because the section at the end of the /// file that we check for end of central directory record is 65KiB. 
@@ -91,10 +84,8 @@ impl ArchiveFsm { Self { size, - state: State::ReadEocd { - buffer: Buffer::with_capacity(Self::DEFAULT_BUFFER_SIZE), - haystack_size, - }, + buffer: Buffer::with_capacity(Self::DEFAULT_BUFFER_SIZE), + state: State::ReadEocd { haystack_size }, } } @@ -109,27 +100,17 @@ impl ArchiveFsm { pub fn wants_read(&self) -> Option { use State as S; match self.state { - S::ReadEocd { - ref buffer, - haystack_size, - } => Some(buffer.read_offset(self.size - haystack_size)), - S::ReadEocd64Locator { - ref buffer, - ref eocdr, - } => { + S::ReadEocd { haystack_size } => { + Some(self.buffer.read_offset(self.size - haystack_size)) + } + S::ReadEocd64Locator { ref eocdr } => { let length = EndOfCentralDirectory64Locator::LENGTH as u64; - Some(buffer.read_offset(eocdr.offset - length)) + Some(self.buffer.read_offset(eocdr.offset - length)) + } + S::ReadEocd64 { eocdr64_offset, .. } => Some(self.buffer.read_offset(eocdr64_offset)), + S::ReadCentralDirectory { ref eocd, .. } => { + Some(self.buffer.read_offset(eocd.directory_offset())) } - S::ReadEocd64 { - ref buffer, - eocdr64_offset, - .. - } => Some(buffer.read_offset(eocdr64_offset)), - S::ReadCentralDirectory { - ref buffer, - ref eocd, - .. 
- } => Some(buffer.read_offset(eocd.directory_offset())), S::Transitioning => unreachable!(), } } @@ -138,15 +119,14 @@ impl ArchiveFsm { /// write to #[inline] pub fn space(&mut self) -> &mut [u8] { - let buf = self.state.expect_buffer_mut(); trace!( - available_space = buf.available_space(), + available_space = self.buffer.available_space(), "space() | available_space" ); - if buf.available_space() == 0 { - buf.shift(); + if self.buffer.available_space() == 0 { + self.buffer.shift(); } - buf.space() + self.buffer.space() } /// after having written data to the buffer, use this function @@ -157,7 +137,7 @@ impl ArchiveFsm { /// buffer #[inline] pub fn fill(&mut self, count: usize) -> usize { - self.state.expect_buffer_mut().fill(count) + self.buffer.fill(count) } /// Process buffered data @@ -173,21 +153,14 @@ impl ArchiveFsm { pub fn process(mut self) -> Result, Error> { use State as S; match self.state { - S::ReadEocd { - ref mut buffer, - haystack_size, - } => { - if buffer.read_bytes() < haystack_size { - trace!( - read_bytes = buffer.read_bytes(), - haystack_size, - "ReadEocd | need more data" - ); + S::ReadEocd { haystack_size } => { + if self.buffer.read_bytes() < haystack_size { + // read the entire haystack before we can continue return Ok(FsmResult::Continue(self)); } match { - let haystack = &buffer.data()[..haystack_size as usize]; + let haystack = &self.buffer.data()[..haystack_size as usize]; EndOfCentralDirectoryRecord::find_in_block(haystack) } { None => Err(FormatError::DirectoryEndSignatureNotFound.into()), @@ -197,7 +170,7 @@ impl ArchiveFsm { size = self.size, "ReadEocd | found end of central directory record" ); - buffer.reset(); + self.buffer.reset(); eocdr.offset += self.size - haystack_size; if eocdr.offset < EndOfCentralDirectory64Locator::LENGTH as u64 { @@ -207,10 +180,8 @@ impl ArchiveFsm { eocd64locator_length = EndOfCentralDirectory64Locator::LENGTH, "no room for an EOCD64 locator, definitely not a zip64 file" ); - 
transition!(self.state => (S::ReadEocd { mut buffer, .. }) { - buffer.reset(); + transition!(self.state => (S::ReadEocd { .. }) { S::ReadCentralDirectory { - buffer, eocd: EndOfCentralDirectory::new(self.size, eocdr, None)?, directory_headers: vec![], } @@ -218,17 +189,17 @@ impl ArchiveFsm { Ok(FsmResult::Continue(self)) } else { trace!("ReadEocd | transition to ReadEocd64Locator"); - transition!(self.state => (S::ReadEocd { mut buffer, .. }) { - buffer.reset(); - S::ReadEocd64Locator { buffer, eocdr } + self.buffer.reset(); + transition!(self.state => (S::ReadEocd { .. }) { + S::ReadEocd64Locator { eocdr } }); Ok(FsmResult::Continue(self)) } } } } - S::ReadEocd64Locator { ref mut buffer, .. } => { - let input = Partial::new(buffer.data()); + S::ReadEocd64Locator { .. } => { + let input = Partial::new(self.buffer.data()); match EndOfCentralDirectory64Locator::parser.parse_peek(input) { Err(ErrMode::Incomplete(_)) => { // need more data @@ -237,11 +208,13 @@ impl ArchiveFsm { Err(ErrMode::Backtrack(_)) | Err(ErrMode::Cut(_)) => { // we don't have a zip64 end of central directory locator - that's ok! 
trace!("ReadEocd64Locator | no zip64 end of central directory locator"); - trace!("ReadEocd64Locator | data we got: {:02x?}", buffer.data()); - transition!(self.state => (S::ReadEocd64Locator { mut buffer, eocdr }) { - buffer.reset(); + trace!( + "ReadEocd64Locator | data we got: {:02x?}", + self.buffer.data() + ); + self.buffer.reset(); + transition!(self.state => (S::ReadEocd64Locator { eocdr }) { S::ReadCentralDirectory { - buffer, eocd: EndOfCentralDirectory::new(self.size, eocdr, None)?, directory_headers: vec![], } @@ -253,10 +226,9 @@ impl ArchiveFsm { ?locator, "ReadEocd64Locator | found zip64 end of central directory locator" ); - transition!(self.state => (S::ReadEocd64Locator { mut buffer, eocdr }) { - buffer.reset(); + self.buffer.reset(); + transition!(self.state => (S::ReadEocd64Locator { eocdr }) { S::ReadEocd64 { - buffer, eocdr64_offset: locator.directory_offset, eocdr, } @@ -265,8 +237,8 @@ impl ArchiveFsm { } } } - S::ReadEocd64 { ref mut buffer, .. } => { - let input = Partial::new(buffer.data()); + S::ReadEocd64 { .. 
} => { + let input = Partial::new(self.buffer.data()); match EndOfCentralDirectory64Record::parser.parse_peek(input) { Err(ErrMode::Incomplete(_)) => { // need more data @@ -279,10 +251,9 @@ impl ArchiveFsm { Err(FormatError::Directory64EndRecordInvalid.into()) } Ok((_, eocdr64)) => { - transition!(self.state => (S::ReadEocd64 { mut buffer, eocdr, eocdr64_offset }) { - buffer.reset(); + self.buffer.reset(); + transition!(self.state => (S::ReadEocd64 { eocdr, eocdr64_offset }) { S::ReadCentralDirectory { - buffer, eocd: EndOfCentralDirectory::new(self.size, eocdr, Some(Located { offset: eocdr64_offset, inner: eocdr64 @@ -295,17 +266,16 @@ impl ArchiveFsm { } } S::ReadCentralDirectory { - ref mut buffer, ref eocd, ref mut directory_headers, } => { trace!( "ReadCentralDirectory | process(), available: {}", - buffer.available_data() + self.buffer.available_data() ); - let mut input = Partial::new(buffer.data()); + let mut input = Partial::new(self.buffer.data()); trace!( - initial_offset = input.as_bytes().offset_from(&buffer.data()), + initial_offset = input.as_bytes().offset_from(&self.buffer.data()), initial_len = input.len(), "initial offset & len" ); @@ -314,7 +284,7 @@ impl ArchiveFsm { Ok(dh) => { trace!( input_empty_now = input.is_empty(), - offset = input.as_bytes().offset_from(&buffer.data()), + offset = input.as_bytes().offset_from(&self.buffer.data()), len = input.len(), "ReadCentralDirectory | parsed directory header" ); @@ -419,9 +389,9 @@ impl ArchiveFsm { } } } - let consumed = input.as_bytes().offset_from(&buffer.data()); + let consumed = input.as_bytes().offset_from(&self.buffer.data()); tracing::trace!(%consumed, "ReadCentralDirectory total consumed"); - buffer.consume(consumed); + self.buffer.consume(consumed); // need more data Ok(FsmResult::Continue(self)) diff --git a/rc-zip/src/fsm/entry.rs b/rc-zip/src/fsm/entry.rs index 6a1d693..1d995d7 100644 --- a/rc-zip/src/fsm/entry.rs +++ b/rc-zip/src/fsm/entry.rs @@ -3,7 +3,12 @@ use oval::Buffer; -use 
crate::parse::{DataDescriptorRecord, LocalFileHeaderRecord}; +use crate::{ + error::Error, + parse::{DataDescriptorRecord, LocalFileHeaderRecord, Method, StoredEntryInner}, +}; + +use super::FsmResult; struct EntryReadMetrics { uncompressed_size: u64, @@ -21,9 +26,9 @@ enum State { header: LocalFileHeaderRecord, }, ReadDataDescriptor { + buffer: Buffer, metrics: EntryReadMetrics, header: LocalFileHeaderRecord, - buffer: Buffer, }, Validate { metrics: EntryReadMetrics, @@ -31,9 +36,6 @@ enum State { descriptor: Option, }, - /// Done! - Done, - #[default] Transition, } @@ -41,4 +43,12 @@ enum State { /// A state machine that can parse a zip entry pub struct EntryFsm { state: State, + entry: StoredEntryInner, + method: Method, +} + +impl EntryFsm { + fn process(mut self, outbuf: &mut [u8]) -> Result, Error> { + todo!() + } } From 3c0f483f54fb49980abdd1db95c24a6e8ba80c34 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 16:23:21 +0100 Subject: [PATCH 23/49] Fix docs --- Justfile | 3 ++ rc-zip-sync/src/read_zip.rs | 18 +++---- .../src/{read_zip.rs => async_read_zip.rs} | 23 ++++---- rc-zip-tokio/src/lib.rs | 4 +- rc-zip/src/fsm/archive.rs | 54 +++++++++---------- rc-zip/src/parse/archive.rs | 11 ++-- rc-zip/src/parse/date_time.rs | 2 +- 7 files changed, 56 insertions(+), 59 deletions(-) rename rc-zip-tokio/src/{read_zip.rs => async_read_zip.rs} (91%) diff --git a/Justfile b/Justfile index 3a0c6cf..1e869da 100644 --- a/Justfile +++ b/Justfile @@ -6,6 +6,9 @@ _default: check: cargo hack clippy --each-feature +docs: + RUSTDOCFLAGS="-D warnings" cargo doc --all-features --no-deps + # Run all tests locally test *args: cargo nextest run {{args}} --all-features diff --git a/rc-zip-sync/src/read_zip.rs b/rc-zip-sync/src/read_zip.rs index 8f8aef3..6747d06 100644 --- a/rc-zip-sync/src/read_zip.rs +++ b/rc-zip-sync/src/read_zip.rs @@ -7,7 +7,7 @@ use rc_zip::{ use crate::entry_reader::EntryReader; use std::{io::Read, ops::Deref}; -/// A trait for reading something 
as a zip archive (blocking I/O model) +/// A trait for reading something as a zip archive /// /// See also [ReadZip]. pub trait ReadZipWithSize { @@ -15,14 +15,11 @@ pub trait ReadZipWithSize { type File: HasCursor; /// Reads self as a zip archive. - /// - /// This functions blocks until the entire archive has been read. - /// It is not compatible with non-blocking or async I/O. fn read_zip_with_size(&self, size: u64) -> Result, Error>; } -/// A trait for reading something as a zip archive (blocking I/O model), -/// when we can tell size from self. +/// A trait for reading something as a zip archive when we can tell size from +/// self. /// /// See also [ReadZipWithSize]. pub trait ReadZip { @@ -30,9 +27,6 @@ pub trait ReadZip { type File: HasCursor; /// Reads self as a zip archive. - /// - /// This functions blocks until the entire archive has been read. - /// It is not compatible with non-blocking or async I/O. fn read_zip(&self) -> Result, Error>; } @@ -91,6 +85,10 @@ impl ReadZip for Vec { } /// A zip archive, read synchronously from a file or other I/O resource. +/// +/// This only contains metadata for the archive and its entries. Separate +/// readers can be created for arbitraries entries on-demand using +/// [SyncStoredEntry::reader]. pub struct SyncArchive<'a, F> where F: HasCursor, @@ -168,7 +166,7 @@ where /// A sliceable I/O resource: we can ask for a [Read] at a given offset. pub trait HasCursor { - /// The type of [Read] returned by [cursor_at]. + /// The type of [Read] returned by [HasCursor::cursor_at]. 
type Cursor<'a>: Read + 'a where Self: 'a; diff --git a/rc-zip-tokio/src/read_zip.rs b/rc-zip-tokio/src/async_read_zip.rs similarity index 91% rename from rc-zip-tokio/src/read_zip.rs rename to rc-zip-tokio/src/async_read_zip.rs index f886795..f096b19 100644 --- a/rc-zip-tokio/src/read_zip.rs +++ b/rc-zip-tokio/src/async_read_zip.rs @@ -12,17 +12,14 @@ use rc_zip::{ use crate::entry_reader::AsyncEntryReader; -/// A trait for reading something as a zip archive (blocking I/O model) +/// A trait for reading something as a zip archive. /// -/// See also [ReadZip]. +/// See also [AsyncReadZip]. pub trait AsyncReadZipWithSize { /// The type of the file to read from. type File: HasAsyncCursor; /// Reads self as a zip archive. - /// - /// This functions blocks until the entire archive has been read. - /// It is not compatible with non-blocking or async I/O. #[allow(async_fn_in_trait)] async fn read_zip_with_size_async( &self, @@ -30,18 +27,16 @@ pub trait AsyncReadZipWithSize { ) -> Result, Error>; } -/// A trait for reading something as a zip archive (blocking I/O model), -/// when we can tell size from self. +/// A zip archive, read asynchronously from a file or other I/O resource. /// -/// See also [ReadZipWithSize]. +/// This only contains metadata for the archive and its entries. Separate +/// readers can be created for arbitraries entries on-demand using +/// [AsyncStoredEntry::reader]. pub trait AsyncReadZip { /// The type of the file to read from. type File: HasAsyncCursor; /// Reads self as a zip archive. - /// - /// This functions blocks until the entire archive has been read. - /// It is not compatible with non-blocking or async I/O. #[allow(async_fn_in_trait)] async fn read_zip_async(&self) -> Result, Error>; } @@ -173,14 +168,14 @@ where } } -/// A sliceable I/O resource: we can ask for a [Read] at a given offset. +/// A sliceable I/O resource: we can ask for an [AsyncRead] at a given offset. pub trait HasAsyncCursor { - /// The type returned by [cursor_at]. 
+ /// The type returned by [HasAsyncCursor::cursor_at]. type Cursor<'a>: AsyncRead + Unpin + 'a where Self: 'a; - /// Returns a [Read] at the given offset. + /// Returns an [AsyncRead] at the given offset. fn cursor_at(&self, offset: u64) -> Self::Cursor<'_>; } diff --git a/rc-zip-tokio/src/lib.rs b/rc-zip-tokio/src/lib.rs index 16d8128..7224a8d 100644 --- a/rc-zip-tokio/src/lib.rs +++ b/rc-zip-tokio/src/lib.rs @@ -15,10 +15,10 @@ macro_rules! transition_async { mod decoder; mod entry_reader; -mod read_zip; +mod async_read_zip; // re-exports pub use rc_zip; -pub use read_zip::{ +pub use async_read_zip::{ AsyncArchive, AsyncReadZip, AsyncReadZipWithSize, AsyncStoredEntry, HasAsyncCursor, }; diff --git a/rc-zip/src/fsm/archive.rs b/rc-zip/src/fsm/archive.rs index 34c69da..1bb6989 100644 --- a/rc-zip/src/fsm/archive.rs +++ b/rc-zip/src/fsm/archive.rs @@ -15,12 +15,21 @@ use winnow::{ Parser, Partial, }; -/// [ArchiveReader] parses a valid zip archive into an [Archive]. In particular, this struct finds +/// [ArchiveFsm] parses a valid zip archive into an [Archive]. In particular, this struct finds /// an end of central directory record, parses the entire central directory, detects text encoding, /// and normalizes metadata. /// -/// Look at the integration tests or [rc-zip-sync](https://crates.io/crates/rc-zip-sync) -/// for examples on how to use this struct. +/// The loop is as follows: +/// +/// * Call [ArchiveFsm::wants_read] to check if more data is needed. +/// * If it returns `Some(offset)`, read the file at that offset +/// into [ArchiveFsm::space] and then call [ArchiveFsm::fill] with +/// the number of bytes read. +/// * Call [ArchiveFsm::process] to process the data. +/// * If it returns [FsmResult::Continue], loop back to the first step. +/// +/// Look at the integration tests or +/// [rc-zip-sync](https://crates.io/crates/rc-zip-sync) for concrete examples. 
pub struct ArchiveFsm { /// Size of the entire zip file size: u64, @@ -70,10 +79,6 @@ impl ArchiveFsm { const DEFAULT_BUFFER_SIZE: usize = 256 * 1024; /// Create a new archive reader with a specified file size. - /// - /// Actual reading of the file is performed by calling - /// [wants_read()](ArchiveReader::wants_read()), [read()](ArchiveReader::read()) and - /// [process()](ArchiveReader::process()) in a loop. pub fn new(size: u64) -> Self { let haystack_size: u64 = 65 * 1024; let haystack_size = if size < haystack_size { @@ -89,14 +94,9 @@ impl ArchiveFsm { } } - /// Returns whether or not this reader needs more data to continue. - /// - /// Returns `Some(offset)` if this reader needs to read some data from `offset`. - /// In this case, [read()](ArchiveReader::read()) should be called with a [Read] - /// at the correct offset. - /// - /// Returns `None` if the reader does not need data and [process()](ArchiveReader::process()) - /// can be called directly. + /// If this returns `Some(offset)`, the caller should read data from + /// `offset` into [ArchiveFsm::space] — without forgetting to call + /// [ArchiveFsm::fill] with the number of bytes written. pub fn wants_read(&self) -> Option { use State as S; match self.state { @@ -115,8 +115,9 @@ impl ArchiveFsm { } } - /// returns a mutable slice with all the available space to - /// write to + /// Returns a mutable slice with all the available space to write to + /// + /// After writing to this, call [Self::fill] with the number of bytes written. 
#[inline] pub fn space(&mut self) -> &mut [u8] { trace!( @@ -129,12 +130,8 @@ impl ArchiveFsm { self.buffer.space() } - /// after having written data to the buffer, use this function - /// to indicate how many bytes were written - /// - /// if there is not enough available space, this function can call - /// `shift()` to move the remaining data to the beginning of the - /// buffer + /// After having written data to [Self::space], call this to indicate how + /// many bytes were written. #[inline] pub fn fill(&mut self, count: usize) -> usize { self.buffer.fill(count) @@ -142,14 +139,15 @@ impl ArchiveFsm { /// Process buffered data /// - /// Errors returned from process() are caused by invalid zip archives, + /// Errors returned from this function are caused by invalid zip archives, /// unsupported format quirks, or implementation bugs - never I/O errors. /// - /// A result of [FsmResult::Continue] indicates one should loop again, - /// starting with [wants_read()](ArchiveReader::wants_read()). + /// A result of [FsmResult::Continue] gives back ownership of the state + /// machine and indicates the I/O loop should continue, starting with + /// [ArchiveFsm::wants_read]. /// - /// A result of [FsmResult::Done] contains the [Archive], and indicates that no - /// method should ever be called again on this reader. + /// A result of [FsmResult::Done] consumes the state machine and returns + /// a fully-parsed [Archive]. pub fn process(mut self) -> Result, Error> { use State as S; match self.state { diff --git a/rc-zip/src/parse/archive.rs b/rc-zip/src/parse/archive.rs index 354189e..4b464eb 100644 --- a/rc-zip/src/parse/archive.rs +++ b/rc-zip/src/parse/archive.rs @@ -6,11 +6,14 @@ use crate::{ parse::{ExtraField, Mode, Version}, }; -/// An Archive contains general information about a zip files, -/// along with a list of [entries][StoredEntry]. +/// An Archive contains general information about a zip files, along with a list +/// of [entries][StoredEntry]. 
/// -/// It is obtained via an [ArchiveReader](crate::reader::ArchiveReader), or via a higher-level API -/// like the [ReadZip](crate::reader::sync::ReadZip) trait. +/// It is obtained through a state machine like +/// [ArchiveFsm](crate::fsm::ArchiveFsm), although end-users tend to use +/// higher-levelr interfaces like +/// [rc-zip-sync](https://crates.io/crates/rc-zip-sync) or +/// [rc-zip-tokio](https://crates.io/crates/rc-zip-tokio). pub struct Archive { pub(crate) size: u64, pub(crate) encoding: Encoding, diff --git a/rc-zip/src/parse/date_time.rs b/rc-zip/src/parse/date_time.rs index 3bcddc8..2ebdd87 100644 --- a/rc-zip/src/parse/date_time.rs +++ b/rc-zip/src/parse/date_time.rs @@ -16,7 +16,7 @@ pub struct MsdosTimestamp { /// Time in 2-second intervals pub time: u16, - /// Date in MS-DOS format, cf. https://docs.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-dosdatetimetofiletime + /// Date in MS-DOS format, cf. pub date: u16, } From f1160de940085c4ab8744a33220a4a0756feff87 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 16:28:48 +0100 Subject: [PATCH 24/49] Add missing READMEs --- rc-zip-sync/README.md | 6 ++++++ rc-zip-sync/src/lib.rs | 4 ++++ rc-zip-tokio/README.md | 7 +++++++ rc-zip-tokio/src/lib.rs | 8 ++++++-- rc-zip/README.md | 7 +++++++ rc-zip/src/lib.rs | 9 +++++---- 6 files changed, 35 insertions(+), 6 deletions(-) create mode 100644 rc-zip-sync/README.md create mode 100644 rc-zip-tokio/README.md create mode 100644 rc-zip/README.md diff --git a/rc-zip-sync/README.md b/rc-zip-sync/README.md new file mode 100644 index 0000000..64aeaef --- /dev/null +++ b/rc-zip-sync/README.md @@ -0,0 +1,6 @@ +# rc-zip-sync + +This crate implements zip archive reading using std (synchronous) I/O traits, +like `std::io::Read`. + +See also [rc-zip-tokio](https://crates.io/crates/rc-zip-tokio). 
\ No newline at end of file diff --git a/rc-zip-sync/src/lib.rs b/rc-zip-sync/src/lib.rs index 230886e..4d963c5 100644 --- a/rc-zip-sync/src/lib.rs +++ b/rc-zip-sync/src/lib.rs @@ -1,5 +1,9 @@ //! A library for reading zip files synchronously using std I/O traits, //! built on top of [rc-zip](https://crates.io/crates/rc-zip). +//! +//! See also: +//! +//! * [rc-zip-tokio](https://crates.io/crates/rc-zip-tokio) for using tokio I/O traits #![warn(missing_docs)] diff --git a/rc-zip-tokio/README.md b/rc-zip-tokio/README.md new file mode 100644 index 0000000..2a9196e --- /dev/null +++ b/rc-zip-tokio/README.md @@ -0,0 +1,7 @@ +# rc-zip-tokio + +This crate implements zip archive reading using tokio (asynchronous) I/O traits, +like `tokio:io::AsyncRead`. + +See also [rc-zip-sync](https://crates.io/crates/rc-zip-sync). + diff --git a/rc-zip-tokio/src/lib.rs b/rc-zip-tokio/src/lib.rs index 7224a8d..e2ae709 100644 --- a/rc-zip-tokio/src/lib.rs +++ b/rc-zip-tokio/src/lib.rs @@ -1,5 +1,9 @@ //! A library for reading zip files asynchronously using tokio I/O traits, //! based on top of [rc-zip](https://crates.io/crates/rc-zip). +//! +//! See also: +//! +//! * [rc-zip-sync](https://crates.io/crates/rc-zip-sync) for using std I/O traits #![warn(missing_docs)] @@ -13,12 +17,12 @@ macro_rules! transition_async { }; } +mod async_read_zip; mod decoder; mod entry_reader; -mod async_read_zip; // re-exports -pub use rc_zip; pub use async_read_zip::{ AsyncArchive, AsyncReadZip, AsyncReadZipWithSize, AsyncStoredEntry, HasAsyncCursor, }; +pub use rc_zip; diff --git a/rc-zip/README.md b/rc-zip/README.md new file mode 100644 index 0000000..996cdde --- /dev/null +++ b/rc-zip/README.md @@ -0,0 +1,7 @@ +# rc-zip + +This is the core rc-zip crate, containing types, parses, and state machines, +and that doesn't do any I/O by itself. + +The full README for this crate is the [top-level README](../README.md) in this +repository. 
diff --git a/rc-zip/src/lib.rs b/rc-zip/src/lib.rs index 7346817..4494fc4 100644 --- a/rc-zip/src/lib.rs +++ b/rc-zip/src/lib.rs @@ -6,10 +6,11 @@ //! file, winnow parsers that can turn byte buffers into those types, and //! state machines that can use those parsers to read zip files from a stream. //! -//! [rc-zip-sync](https://crates.io/crates/rc-zip-sync) and -//! [rc-zip-tokio](https://crates.io/crates/rc-zip-tokio) build on top of this -//! to provide a higher-level API for reading zip files, from sync and async -//! code respectively. +//! This crate is low-level, you may be interested in either of those higher +//! level wrappers: +//! +//! * [rc-zip-sync](https://crates.io/crates/rc-zip-sync) for using std I/O traits +//! * [rc-zip-tokio](https://crates.io/crates/rc-zip-tokio) for using tokio I/O traits pub mod encoding; pub mod error; From 0faf47375bd938f417aac7c411482757b3910672 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 17:01:55 +0100 Subject: [PATCH 25/49] Good WIP EntryFsm --- rc-zip/src/fsm/archive.rs | 52 +++---- rc-zip/src/fsm/entry.rs | 231 ++++++++++++++++++++++++++-- rc-zip/src/fsm/entry/deflate_dec.rs | 0 rc-zip/src/fsm/entry/store_dec.rs | 21 +++ 4 files changed, 265 insertions(+), 39 deletions(-) create mode 100644 rc-zip/src/fsm/entry/deflate_dec.rs create mode 100644 rc-zip/src/fsm/entry/store_dec.rs diff --git a/rc-zip/src/fsm/archive.rs b/rc-zip/src/fsm/archive.rs index 1bb6989..6641d59 100644 --- a/rc-zip/src/fsm/archive.rs +++ b/rc-zip/src/fsm/archive.rs @@ -21,11 +21,11 @@ use winnow::{ /// /// The loop is as follows: /// -/// * Call [ArchiveFsm::wants_read] to check if more data is needed. +/// * Call [Self::wants_read] to check if more data is needed. /// * If it returns `Some(offset)`, read the file at that offset -/// into [ArchiveFsm::space] and then call [ArchiveFsm::fill] with +/// into [Self::space] and then call [Self::fill] with /// the number of bytes read. 
-/// * Call [ArchiveFsm::process] to process the data. +/// * Call [Self::process] to process the data. /// * If it returns [FsmResult::Continue], loop back to the first step. /// /// Look at the integration tests or @@ -95,8 +95,8 @@ impl ArchiveFsm { } /// If this returns `Some(offset)`, the caller should read data from - /// `offset` into [ArchiveFsm::space] — without forgetting to call - /// [ArchiveFsm::fill] with the number of bytes written. + /// `offset` into [Self::space] — without forgetting to call + /// [Self::fill] with the number of bytes written. pub fn wants_read(&self) -> Option { use State as S; match self.state { @@ -115,28 +115,6 @@ impl ArchiveFsm { } } - /// Returns a mutable slice with all the available space to write to - /// - /// After writing to this, call [Self::fill] with the number of bytes written. - #[inline] - pub fn space(&mut self) -> &mut [u8] { - trace!( - available_space = self.buffer.available_space(), - "space() | available_space" - ); - if self.buffer.available_space() == 0 { - self.buffer.shift(); - } - self.buffer.space() - } - - /// After having written data to [Self::space], call this to indicate how - /// many bytes were written. - #[inline] - pub fn fill(&mut self, count: usize) -> usize { - self.buffer.fill(count) - } - /// Process buffered data /// /// Errors returned from this function are caused by invalid zip archives, @@ -144,7 +122,7 @@ impl ArchiveFsm { /// /// A result of [FsmResult::Continue] gives back ownership of the state /// machine and indicates the I/O loop should continue, starting with - /// [ArchiveFsm::wants_read]. + /// [Self::wants_read]. /// /// A result of [FsmResult::Done] consumes the state machine and returns /// a fully-parsed [Archive]. @@ -397,6 +375,24 @@ impl ArchiveFsm { S::Transitioning => unreachable!(), } } + + /// Returns a mutable slice with all the available space to write to. + /// + /// After writing to this, call [Self::fill] with the number of bytes written. 
+ #[inline] + pub fn space(&mut self) -> &mut [u8] { + if self.buffer.available_space() == 0 { + self.buffer.shift(); + } + self.buffer.space() + } + + /// After having written data to [Self::space], call this to indicate how + /// many bytes were written. + #[inline] + pub fn fill(&mut self, count: usize) -> usize { + self.buffer.fill(count) + } } /// A wrapper around [oval::Buffer] that keeps track of how many bytes we've read since diff --git a/rc-zip/src/fsm/entry.rs b/rc-zip/src/fsm/entry.rs index 1d995d7..8b8871a 100644 --- a/rc-zip/src/fsm/entry.rs +++ b/rc-zip/src/fsm/entry.rs @@ -2,9 +2,17 @@ #![allow(unused)] use oval::Buffer; +use tracing::trace; +use winnow::{error::ErrMode, Parser, Partial}; + +mod store_dec; +use store_dec::StoreDec; + +mod deflate_dec; +use deflate_dec::DeflateDec; use crate::{ - error::Error, + error::{Error, FormatError, UnsupportedError}, parse::{DataDescriptorRecord, LocalFileHeaderRecord, Method, StoredEntryInner}, }; @@ -17,22 +25,38 @@ struct EntryReadMetrics { #[derive(Default)] enum State { - ReadLocalHeader { - buffer: Buffer, - }, + ReadLocalHeader, + ReadData { - hasher: crc32fast::Hasher, - uncompressed_size: u64, + /// The local file header for this entry header: LocalFileHeaderRecord, + + /// Amount of data we have decompressed so far + uncompressed_size: u64, + + /// CRC32 hash of the decompressed data + hasher: crc32fast::Hasher, + + /// The decompression method we're using + decompressor: AnyDecompressor, }, + ReadDataDescriptor { - buffer: Buffer, - metrics: EntryReadMetrics, + /// The local file header for this entry header: LocalFileHeaderRecord, + + /// Size we've decompressed + crc32 hash we've computed + metrics: EntryReadMetrics, }, + Validate { - metrics: EntryReadMetrics, + /// The local file header for this entry header: LocalFileHeaderRecord, + + /// Size we've decompressed + crc32 hash we've computed + metrics: EntryReadMetrics, + + /// The data descriptor for this entry, if any descriptor: Option, }, @@ 
-45,10 +69,195 @@ pub struct EntryFsm { state: State, entry: StoredEntryInner, method: Method, + buffer: Buffer, + eof: bool, } impl EntryFsm { - fn process(mut self, outbuf: &mut [u8]) -> Result, Error> { - todo!() + /// Create a new state machine for decompressing a zip entry + pub fn new(method: Method, entry: StoredEntryInner) -> Self { + Self { + state: State::ReadLocalHeader, + entry, + method, + buffer: Buffer::with_capacity(256 * 1024), + eof: false, + } + } + + /// If this returns true, the caller should read data from into + /// [Self::space] — without forgetting to call [Self::fill] with the number + /// of bytes written. + pub fn wants_read(&self) -> bool { + match self.state { + State::ReadLocalHeader => true, + State::ReadData { .. } => { + // we want to read if we have space + self.buffer.available_space() > 0 + } + State::ReadDataDescriptor { .. } => true, + State::Validate { .. } => false, + State::Transition => false, + } + } + + pub fn process( + mut self, + out: &mut [u8], + ) -> Result, Error> { + use State as S; + match self.state { + S::ReadLocalHeader => { + let mut input = Partial::new(self.buffer.data()); + match LocalFileHeaderRecord::parser.parse_next(&mut input) { + Ok(header) => { + self.state = S::ReadData { + header, + uncompressed_size: 0, + hasher: crc32fast::Hasher::new(), + decompressor: AnyDecompressor::new(header.method)?, + }; + self.process(out) + } + Err(ErrMode::Incomplete(_)) => Ok(FsmResult::Continue(self)), + Err(_e) => Err(Error::Format(FormatError::InvalidLocalHeader)), + } + } + S::ReadData { + header, + uncompressed_size, + hasher, + mut decompressor, + } => { + let in_buf = self.buffer.data(); + let is_flushing = in_buf.is_empty(); + let outcome = decompressor.decompress(in_buf, out)?; + self.buffer.consume(outcome.bytes_read); + + if outcome.bytes_written == 0 && self.eof { + // we're done, let's read the data descriptor (if there's one) + transition!(self.state => (S::ReadData { header, uncompressed_size, hasher, 
decompressor }) { + S::ReadDataDescriptor { + header, + metrics: EntryReadMetrics { + uncompressed_size, + crc32: hasher.finalize(), + }, + } + }); + return self.process(out); + } + Ok(FsmResult::Continue((self, outcome))) + } + S::ReadDataDescriptor { header, metrics } => {} + S::Validate { + header, + metrics, + descriptor, + } => { + let expected_crc32 = if self.entry.crc32 != 0 { + self.entry.crc32 + } else if let Some(descriptor) = descriptor.as_ref() { + descriptor.crc32 + } else { + header.crc32 + }; + + let expected_size = if self.entry.uncompressed_size != 0 { + self.entry.uncompressed_size + } else if let Some(descriptor) = descriptor.as_ref() { + descriptor.uncompressed_size + } else { + header.uncompressed_size as u64 + }; + + if expected_size != metrics.uncompressed_size { + return Err(Error::Format(FormatError::WrongSize { + expected: expected_size, + actual: metrics.uncompressed_size, + }) + .into()); + } + + if expected_crc32 != 0 && expected_crc32 != metrics.crc32 { + return Err(Error::Format(FormatError::WrongChecksum { + expected: expected_crc32, + actual: metrics.crc32, + }) + .into()); + } + + Ok(FsmResult::Done(())) + } + S::Transition => { + unreachable!("the state machine should never be in the transition state") + } + } + } + + /// Returns a mutable slice with all the available space to write to. + /// + /// After writing to this, call [Self::fill] with the number of bytes written. + #[inline] + pub fn space(&mut self) -> &mut [u8] { + if self.buffer.available_space() == 0 { + self.buffer.shift(); + } + self.buffer.space() + } + + /// After having written data to [Self::space], call this to indicate how + /// many bytes were written. 
+ /// + /// If this is called with zero, it indicates eof + #[inline] + pub fn fill(&mut self, count: usize) -> usize { + if count == 0 { + self.eof = true; + } + self.buffer.fill(count) + } +} + +enum AnyDecompressor { + Store(StoreDec), + Deflate(DeflateDec), +} + +#[derive(Default, Debug)] +pub struct DecompressOutcome { + /// Number of bytes read from input + pub bytes_read: usize, + + /// Number of bytes written to output + pub bytes_written: usize, +} + +trait Decompressor { + #[inline] + fn decompress(&mut self, in_buf: &[u8], out_buf: &mut [u8]) + -> Result; +} + +impl AnyDecompressor { + fn new(method: Method) -> Result { + let dec = match method { + Method::Store => Self::Store(Default::default()), + Method::Deflate => Self::Deflate(Default::default()), + _ => { + let err = Error::Unsupported(UnsupportedError::MethodNotSupported(method)); + return Err(err); + } + }; + Ok(dec) + } + + #[inline] + fn decompress(&mut self, in_buf: &[u8], out: &mut [u8]) -> Result { + /// forward to the appropriate decompressor + match self { + Self::Store(dec) => dec.decompress(in_buf, out), + Self::Deflate(dec) => dec.decompress(in_buf, out), + } } } diff --git a/rc-zip/src/fsm/entry/deflate_dec.rs b/rc-zip/src/fsm/entry/deflate_dec.rs new file mode 100644 index 0000000..e69de29 diff --git a/rc-zip/src/fsm/entry/store_dec.rs b/rc-zip/src/fsm/entry/store_dec.rs new file mode 100644 index 0000000..eb0e6fa --- /dev/null +++ b/rc-zip/src/fsm/entry/store_dec.rs @@ -0,0 +1,21 @@ +use std::cmp; + +use super::{DecompressOutcome, Decompressor}; + +#[derive(Default)] +pub(crate) struct StoreDec; + +impl Decompressor for StoreDec { + fn decompress( + &mut self, + in_buf: &[u8], + out_buf: &mut [u8], + ) -> Result { + let len = cmp::min(in_buf.len(), out_buf.len()); + out_buf[..len].copy_from_slice(&in_buf[..len]); + Ok(DecompressOutcome { + bytes_read: len, + bytes_written: len, + }) + } +} From 07de9de857bdac4232f25c3cbbf2627e32779f55 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: 
Fri, 2 Feb 2024 17:04:23 +0100 Subject: [PATCH 26/49] Almost there.. --- rc-zip/src/fsm/entry.rs | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/rc-zip/src/fsm/entry.rs b/rc-zip/src/fsm/entry.rs index 8b8871a..d5c9084 100644 --- a/rc-zip/src/fsm/entry.rs +++ b/rc-zip/src/fsm/entry.rs @@ -3,7 +3,11 @@ use oval::Buffer; use tracing::trace; -use winnow::{error::ErrMode, Parser, Partial}; +use winnow::{ + error::ErrMode, + stream::{AsBytes, Offset}, + Parser, Partial, +}; mod store_dec; use store_dec::StoreDec; @@ -119,7 +123,9 @@ impl EntryFsm { }; self.process(out) } - Err(ErrMode::Incomplete(_)) => Ok(FsmResult::Continue(self)), + Err(ErrMode::Incomplete(_)) => { + Ok(FsmResult::Continue((self, Default::default()))) + } Err(_e) => Err(Error::Format(FormatError::InvalidLocalHeader)), } } @@ -149,7 +155,24 @@ impl EntryFsm { } Ok(FsmResult::Continue((self, outcome))) } - S::ReadDataDescriptor { header, metrics } => {} + S::ReadDataDescriptor { header, metrics } => { + let mut input = Partial::new(self.buffer.data()); + match DataDescriptorRecord::mk_parser(self.entry.is_zip64).parse_next(&mut input) { + Ok(descriptor) => { + self.buffer + .consume(input.as_bytes().offset_from(&self.buffer.data())); + trace!("data descriptor = {:#?}", descriptor); + transition!(self.state => (S::ReadDataDescriptor { metrics, header, .. 
}) { + S::Validate { metrics, header, descriptor: Some(descriptor) } + }); + self.process(out) + } + Err(ErrMode::Incomplete(_)) => { + Ok(FsmResult::Continue((self, Default::default()))) + } + Err(_e) => Err(Error::Format(FormatError::InvalidDataDescriptor).into()), + } + } S::Validate { header, metrics, From 31fe4f3b51bfe55c3ab33c8d18007fc971359e71 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 17:29:58 +0100 Subject: [PATCH 27/49] Add Deflate state machine --- .vscode/settings.json | 5 +- Cargo.lock | 1 + rc-zip/Cargo.toml | 5 + rc-zip/src/error.rs | 4 + rc-zip/src/fsm/entry.rs | 23 +++-- rc-zip/src/fsm/entry/deflate_dec.rs | 138 ++++++++++++++++++++++++++++ rc-zip/src/fsm/entry/store_dec.rs | 4 +- 7 files changed, 168 insertions(+), 12 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 8bf6ba4..2e6e863 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,9 +1,6 @@ { "rust-analyzer.cargo.features": [ - "lzma", - "deflate64", - "bzip2", - "zstd" + "rc-zip/deflate" ], "rust-analyzer.linkedProjects": [ "./Cargo.toml" diff --git a/Cargo.lock b/Cargo.lock index e363f87..84a0f6d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -829,6 +829,7 @@ dependencies = [ "chrono", "crc32fast", "encoding_rs", + "miniz_oxide", "num_enum", "oem_cp", "oval", diff --git a/rc-zip/Cargo.toml b/rc-zip/Cargo.toml index d963900..a5f04e7 100644 --- a/rc-zip/Cargo.toml +++ b/rc-zip/Cargo.toml @@ -28,3 +28,8 @@ chardetng = "0.1.17" num_enum = "0.7.2" cfg-if = "1.0.0" crc32fast = "1.3.2" +miniz_oxide = { version = "0.7.1", optional = true } + +[features] +deflate = ["dep:miniz_oxide"] + diff --git a/rc-zip/src/error.rs b/rc-zip/src/error.rs index 699407e..f03256e 100644 --- a/rc-zip/src/error.rs +++ b/rc-zip/src/error.rs @@ -23,6 +23,10 @@ pub enum Error { #[error("io: {0}")] IO(#[from] std::io::Error), + /// Decompression-related error + #[error("{method:?} decompression error: {msg}")] + Decompression { method: Method, msg: String }, + 
/// Could not read as a zip because size could not be determined #[error("size must be known to open zip file")] UnknownSize, diff --git a/rc-zip/src/fsm/entry.rs b/rc-zip/src/fsm/entry.rs index d5c9084..4c00704 100644 --- a/rc-zip/src/fsm/entry.rs +++ b/rc-zip/src/fsm/entry.rs @@ -10,10 +10,9 @@ use winnow::{ }; mod store_dec; -use store_dec::StoreDec; +#[cfg(feature = "deflate")] mod deflate_dec; -use deflate_dec::DeflateDec; use crate::{ error::{Error, FormatError, UnsupportedError}, @@ -110,7 +109,7 @@ impl EntryFsm { out: &mut [u8], ) -> Result, Error> { use State as S; - match self.state { + match &mut self.state { S::ReadLocalHeader => { let mut input = Partial::new(self.buffer.data()); match LocalFileHeaderRecord::parser.parse_next(&mut input) { @@ -119,7 +118,7 @@ impl EntryFsm { header, uncompressed_size: 0, hasher: crc32fast::Hasher::new(), - decompressor: AnyDecompressor::new(header.method)?, + decompressor: AnyDecompressor::new(self.method)?, }; self.process(out) } @@ -133,7 +132,7 @@ impl EntryFsm { header, uncompressed_size, hasher, - mut decompressor, + decompressor, } => { let in_buf = self.buffer.data(); let is_flushing = in_buf.is_empty(); @@ -243,8 +242,9 @@ impl EntryFsm { } enum AnyDecompressor { - Store(StoreDec), - Deflate(DeflateDec), + Store(store_dec::StoreDec), + #[cfg(feature = "deflate")] + Deflate(deflate_dec::DeflateDec), } #[derive(Default, Debug)] @@ -266,7 +266,15 @@ impl AnyDecompressor { fn new(method: Method) -> Result { let dec = match method { Method::Store => Self::Store(Default::default()), + + #[cfg(feature = "deflate")] Method::Deflate => Self::Deflate(Default::default()), + #[cfg(not(feature = "deflate"))] + Method::Deflate => { + let err = Error::Unsupported(UnsupportedError::MethodNotEnabled(method)); + return Err(err); + } + _ => { let err = Error::Unsupported(UnsupportedError::MethodNotSupported(method)); return Err(err); @@ -280,6 +288,7 @@ impl AnyDecompressor { /// forward to the appropriate decompressor match 
self { Self::Store(dec) => dec.decompress(in_buf, out), + #[cfg(feature = "deflate")] Self::Deflate(dec) => dec.decompress(in_buf, out), } } diff --git a/rc-zip/src/fsm/entry/deflate_dec.rs b/rc-zip/src/fsm/entry/deflate_dec.rs index e69de29..3e38f75 100644 --- a/rc-zip/src/fsm/entry/deflate_dec.rs +++ b/rc-zip/src/fsm/entry/deflate_dec.rs @@ -0,0 +1,138 @@ +use std::cmp; + +use miniz_oxide::inflate::{ + core::{ + decompress, + inflate_flags::{TINFL_FLAG_IGNORE_ADLER32, TINFL_FLAG_PARSE_ZLIB_HEADER}, + DecompressorOxide, + }, + TINFLStatus, +}; +use tracing::trace; + +use crate::{error::Error, parse::Method}; + +use super::{DecompressOutcome, Decompressor}; + +pub(crate) struct DeflateDec { + /// 64 KiB circular internal buffer. From miniz_oxide docs: + /// + /// > The decompression function normally needs access to 32KiB of the + /// > previously decompressed data (or to the beginning of the decompressed + /// > data if less than 32KiB has been decompressed.) + internal_buffer: Vec, + + /// The position in the internal buffer where we should start writing the + /// next decompressed data. Note that the buffer is circular, so we need to + /// wrap around when we reach the end. + out_pos: usize, + + /// If this is non-zero, there's data *after* [Self::out_pos] we haven't + /// copied to the caller's output buffer yet. As we copy it, we'll decrease + /// this value and increase [Self::out_pos]. When it reaches zero, we'll + /// need to call miniz_oxide again to get more data. 
+ remain_in_internal_buffer: usize, + + /// The miniz_oxide decompressor state + state: DecompressorOxide, +} + +impl Default for DeflateDec { + fn default() -> Self { + Self { + internal_buffer: vec![0u8; Self::INTERNAL_BUFFER_LENGTH], + out_pos: 0, + state: DecompressorOxide::new(), + remain_in_internal_buffer: 0, + } + } +} + +impl Decompressor for DeflateDec { + fn decompress( + &mut self, + in_buf: &[u8], + out_buf: &mut [u8], + ) -> Result { + let mut outcome: DecompressOutcome = Default::default(); + self.copy_to_outbuf(out_buf, &mut outcome); + if outcome.bytes_written > 0 { + return Ok(outcome); + } + + // no output bytes, let's call miniz_oxide + + let mut flags = TINFL_FLAG_IGNORE_ADLER32; + if in_buf.is_empty() { + // `Decompressor` invariant: if in_buf is empty, we're at EOF + } else { + flags |= TINFL_FLAG_PARSE_ZLIB_HEADER; + } + + let (status, bytes_read, bytes_written) = decompress( + &mut self.state, + in_buf, + &mut self.internal_buffer, + self.out_pos, + flags, + ); + outcome.bytes_read += bytes_read; + self.remain_in_internal_buffer += bytes_written; + + match status { + TINFLStatus::FailedCannotMakeProgress => { + return Err(Error::Decompression { method: Method::Deflate, msg: "Failed to make progress: more input data was expected, but the caller indicated there was no more data, so the input stream is likely truncated".to_string() }) + } + TINFLStatus::BadParam => { + return Err(Error::Decompression { method: Method::Deflate, msg: "The output buffer is an invalid size; consider the flags parameter".to_string() }) + } + TINFLStatus::Adler32Mismatch => { + return Err(Error::Decompression { method: Method::Deflate, msg: "The decompression went fine, but the adler32 checksum did not match the one provided in the header.".to_string() }) + } + TINFLStatus::Failed => { + return Err(Error::Decompression { method: Method::Deflate, msg: "Failed to decompress due to invalid data.".to_string() }) + }, + TINFLStatus::Done => { + // eventually this'll 
return bytes_written == 0 + }, + TINFLStatus::NeedsMoreInput => { + // that's okay, we'll get more input next time + }, + TINFLStatus::HasMoreOutput => { + // that's okay, as long as we return bytes_written > 0 + // the caller will keep calling + }, + } + + self.copy_to_outbuf(out_buf, &mut outcome); + return Ok(outcome); + } +} + +impl DeflateDec { + const INTERNAL_BUFFER_LENGTH: usize = 64 * 1024; + + fn copy_to_outbuf(&mut self, mut out_buf: &mut [u8], outcome: &mut DecompressOutcome) { + // as long as there's room in out_buf and we have remaining data in the + // internal buffer, copy from internal_buffer wrapping as needed, + // decreasing self.remain_in_internal_buffer and increasing self.out_pos + // and outcome.bytes_written + while !out_buf.is_empty() && self.remain_in_internal_buffer > 0 { + let copy_len = cmp::min(self.remain_in_internal_buffer, out_buf.len()); + // take wrapping into account + let copy_len = cmp::min(copy_len, self.internal_buffer.len() - self.out_pos); + trace!("copying {} bytes from internal buffer to out_buf", copy_len); + + out_buf[..copy_len].copy_from_slice(&self.internal_buffer[self.out_pos..][..copy_len]); + self.out_pos += copy_len; + outcome.bytes_written += copy_len; + self.remain_in_internal_buffer -= copy_len; + out_buf = &mut out_buf[copy_len..]; + + // if we've reached the end of the buffer, wrap around + if self.out_pos == self.internal_buffer.len() { + self.out_pos = 0; + } + } + } +} diff --git a/rc-zip/src/fsm/entry/store_dec.rs b/rc-zip/src/fsm/entry/store_dec.rs index eb0e6fa..608343b 100644 --- a/rc-zip/src/fsm/entry/store_dec.rs +++ b/rc-zip/src/fsm/entry/store_dec.rs @@ -1,5 +1,7 @@ use std::cmp; +use crate::error::Error; + use super::{DecompressOutcome, Decompressor}; #[derive(Default)] @@ -10,7 +12,7 @@ impl Decompressor for StoreDec { &mut self, in_buf: &[u8], out_buf: &mut [u8], - ) -> Result { + ) -> Result { let len = cmp::min(in_buf.len(), out_buf.len()); out_buf[..len].copy_from_slice(&in_buf[..len]); 
Ok(DecompressOutcome { From 7146011a84959a24619dcd41e9fca5abea9ab764 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 17:38:01 +0100 Subject: [PATCH 28/49] FsmEntryReader for rc-zip-sync --- rc-zip-sync/src/entry_reader/mod.rs | 52 +++++++++++++++++++++++++++++ rc-zip-sync/src/read_zip.rs | 7 +++- rc-zip/src/error.rs | 7 +++- rc-zip/src/fsm/entry.rs | 19 +++++++---- 4 files changed, 77 insertions(+), 8 deletions(-) diff --git a/rc-zip-sync/src/entry_reader/mod.rs b/rc-zip-sync/src/entry_reader/mod.rs index 56613ef..3e4bdff 100644 --- a/rc-zip-sync/src/entry_reader/mod.rs +++ b/rc-zip-sync/src/entry_reader/mod.rs @@ -17,6 +17,7 @@ use cfg_if::cfg_if; use oval::Buffer; use rc_zip::{ error::{Error, FormatError}, + fsm::{EntryFsm, FsmResult}, parse::{DataDescriptorRecord, LocalFileHeaderRecord, Method, StoredEntry, StoredEntryInner}, }; use std::io; @@ -314,3 +315,54 @@ where Ok(decoder) } } + +pub(crate) struct FsmEntryReader +where + R: io::Read, +{ + rd: R, + fsm: Option, +} + +impl FsmEntryReader +where + R: io::Read, +{ + pub(crate) fn new(entry: &StoredEntry, get_reader: F) -> Self + where + F: Fn(u64) -> R, + { + Self { + rd: get_reader(entry.header_offset), + fsm: Some(EntryFsm::new(entry.method(), entry.inner)), + } + } +} + +impl io::Read for FsmEntryReader +where + R: io::Read, +{ + fn read(&mut self, buf: &mut [u8]) -> io::Result { + let mut fsm = match self.fsm.take() { + Some(fsm) => fsm, + None => return Ok(0), + }; + + if fsm.wants_read() { + let n = self.rd.read(fsm.space())?; + fsm.fill(n); + } + + match fsm.process(buf)? { + FsmResult::Continue((fsm, outcome)) => { + self.fsm = Some(fsm); + Ok(outcome.bytes_written) + } + FsmResult::Done(()) => { + // neat! 
+ Ok(0) + } + } + } +} diff --git a/rc-zip-sync/src/read_zip.rs b/rc-zip-sync/src/read_zip.rs index 6747d06..b43a83f 100644 --- a/rc-zip-sync/src/read_zip.rs +++ b/rc-zip-sync/src/read_zip.rs @@ -4,7 +4,7 @@ use rc_zip::{ parse::{Archive, StoredEntry}, }; -use crate::entry_reader::EntryReader; +use crate::entry_reader::{EntryReader, FsmEntryReader}; use std::{io::Read, ops::Deref}; /// A trait for reading something as a zip archive @@ -156,6 +156,11 @@ where EntryReader::new(self.entry, |offset| self.file.cursor_at(offset)) } + /// Returns an fsm-based reader for the entry + pub fn fsm_reader(&self) -> impl Read + 'a { + FsmEntryReader::new(self.entry, |offset| self.file.cursor_at(offset)) + } + /// Reads the entire entry into a vector. pub fn bytes(&self) -> std::io::Result> { let mut v = Vec::new(); diff --git a/rc-zip/src/error.rs b/rc-zip/src/error.rs index f03256e..0ba94f9 100644 --- a/rc-zip/src/error.rs +++ b/rc-zip/src/error.rs @@ -25,7 +25,12 @@ pub enum Error { /// Decompression-related error #[error("{method:?} decompression error: {msg}")] - Decompression { method: Method, msg: String }, + Decompression { + /// The compression method that failed + method: Method, + /// Additional information + msg: String, + }, /// Could not read as a zip because size could not be determined #[error("size must be known to open zip file")] diff --git a/rc-zip/src/fsm/entry.rs b/rc-zip/src/fsm/entry.rs index 4c00704..7fc59ec 100644 --- a/rc-zip/src/fsm/entry.rs +++ b/rc-zip/src/fsm/entry.rs @@ -104,6 +104,16 @@ impl EntryFsm { } } + /// Process the input and write the output to the given buffer + /// + /// This function will return `FsmResult::Continue` if it needs more input + /// to continue, or if it needs more space to write to. It will return + /// `FsmResult::Done` when all the input has been decompressed and all + /// the output has been written. 
+ /// + /// Also, after writing all the output, process will read the data + /// descriptor (if any), and make sure the CRC32 hash and the uncompressed + /// size match the expected values. pub fn process( mut self, out: &mut [u8], ) -> Result, Error> { @@ -197,16 +207,14 @@ impl EntryFsm { return Err(Error::Format(FormatError::WrongSize { expected: expected_size, actual: metrics.uncompressed_size, - }) - .into()); + })); } if expected_crc32 != 0 && expected_crc32 != metrics.crc32 { return Err(Error::Format(FormatError::WrongChecksum { expected: expected_crc32, actual: metrics.crc32, - }) - .into()); + })); } Ok(FsmResult::Done(())) @@ -244,7 +252,7 @@ impl EntryFsm { enum AnyDecompressor { Store(store_dec::StoreDec), #[cfg(feature = "deflate")] - Deflate(deflate_dec::DeflateDec), + Deflate(Box), } #[derive(Default, Debug)] @@ -257,7 +265,6 @@ pub struct DecompressOutcome { } trait Decompressor { - #[inline] fn decompress(&mut self, in_buf: &[u8], out_buf: &mut [u8]) -> Result; } From b1acd92af42aaca9389e7c889bb17d685c597805 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 17:39:56 +0100 Subject: [PATCH 29/49] Now let's debug deflate --- rc-zip-sync/Cargo.toml | 2 +- rc-zip-sync/src/read_zip.rs | 2 +- rc-zip/src/fsm/entry.rs | 2 +- rc-zip/src/fsm/entry/deflate_dec.rs | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/rc-zip-sync/Cargo.toml b/rc-zip-sync/Cargo.toml index 2ef1302..68d46ec 100644 --- a/rc-zip-sync/Cargo.toml +++ b/rc-zip-sync/Cargo.toml @@ -36,7 +36,7 @@ winnow = "0.5.36" [features] default = ["file", "deflate"] file = ["positioned-io"] -deflate = ["dep:flate2"] +deflate = ["dep:flate2", "rc-zip/deflate"] deflate64 = ["dep:deflate64"] lzma = ["dep:lzma-rs"] bzip2 = ["dep:bzip2"] diff --git a/rc-zip-sync/src/read_zip.rs b/rc-zip-sync/src/read_zip.rs index b43a83f..6537041 100644 --- a/rc-zip-sync/src/read_zip.rs +++ b/rc-zip-sync/src/read_zip.rs @@ -164,7 +164,7 @@ where /// Reads the entire entry into a vector. 
pub fn bytes(&self) -> std::io::Result> { let mut v = Vec::new(); - self.reader().read_to_end(&mut v)?; + self.fsm_reader().read_to_end(&mut v)?; Ok(v) } } diff --git a/rc-zip/src/fsm/entry.rs b/rc-zip/src/fsm/entry.rs index 7fc59ec..3ca8f66 100644 --- a/rc-zip/src/fsm/entry.rs +++ b/rc-zip/src/fsm/entry.rs @@ -179,7 +179,7 @@ impl EntryFsm { Err(ErrMode::Incomplete(_)) => { Ok(FsmResult::Continue((self, Default::default()))) } - Err(_e) => Err(Error::Format(FormatError::InvalidDataDescriptor).into()), + Err(_e) => Err(Error::Format(FormatError::InvalidDataDescriptor)), } } S::Validate { diff --git a/rc-zip/src/fsm/entry/deflate_dec.rs b/rc-zip/src/fsm/entry/deflate_dec.rs index 3e38f75..537cee6 100644 --- a/rc-zip/src/fsm/entry/deflate_dec.rs +++ b/rc-zip/src/fsm/entry/deflate_dec.rs @@ -105,7 +105,7 @@ impl Decompressor for DeflateDec { } self.copy_to_outbuf(out_buf, &mut outcome); - return Ok(outcome); + Ok(outcome) } } From 35ae25e9aa868638695a3a069110eda510b6c5c1 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 17:45:34 +0100 Subject: [PATCH 30/49] We're passing too much data to the decompressor --- rc-zip-sync/src/entry_reader/mod.rs | 4 ++++ rc-zip/src/fsm/entry.rs | 16 +++++++++++++++- rc-zip/src/fsm/entry/deflate_dec.rs | 12 ++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/rc-zip-sync/src/entry_reader/mod.rs b/rc-zip-sync/src/entry_reader/mod.rs index 3e4bdff..975cd96 100644 --- a/rc-zip-sync/src/entry_reader/mod.rs +++ b/rc-zip-sync/src/entry_reader/mod.rs @@ -350,8 +350,12 @@ where }; if fsm.wants_read() { + tracing::trace!("fsm wants read"); let n = self.rd.read(fsm.space())?; + tracing::trace!("read {} bytes", n); fsm.fill(n); + } else { + tracing::trace!("fsm does not want read"); } match fsm.process(buf)? 
{ diff --git a/rc-zip/src/fsm/entry.rs b/rc-zip/src/fsm/entry.rs index 3ca8f66..b2faa92 100644 --- a/rc-zip/src/fsm/entry.rs +++ b/rc-zip/src/fsm/entry.rs @@ -100,7 +100,7 @@ impl EntryFsm { } State::ReadDataDescriptor { .. } => true, State::Validate { .. } => false, - State::Transition => false, + State::Transition => unreachable!(), } } @@ -118,12 +118,26 @@ impl EntryFsm { mut self, out: &mut [u8], ) -> Result, Error> { + tracing::trace!( + state = match &self.state { + State::ReadLocalHeader => "ReadLocalHeader", + State::ReadData { .. } => "ReadData", + State::ReadDataDescriptor { .. } => "ReadDataDescriptor", + State::Validate { .. } => "Validate", + State::Transition => "Transition", + }, + "process" + ); + use State as S; match &mut self.state { S::ReadLocalHeader => { let mut input = Partial::new(self.buffer.data()); match LocalFileHeaderRecord::parser.parse_next(&mut input) { Ok(header) => { + let consumed = input.as_bytes().offset_from(&self.buffer.data()); + tracing::trace!(local_file_header = ?header, consumed, "parsed local file header"); + self.buffer.consume(consumed); self.state = S::ReadData { header, uncompressed_size: 0, diff --git a/rc-zip/src/fsm/entry/deflate_dec.rs b/rc-zip/src/fsm/entry/deflate_dec.rs index 537cee6..c32cd8b 100644 --- a/rc-zip/src/fsm/entry/deflate_dec.rs +++ b/rc-zip/src/fsm/entry/deflate_dec.rs @@ -54,9 +54,21 @@ impl Decompressor for DeflateDec { in_buf: &[u8], out_buf: &mut [u8], ) -> Result { + tracing::trace!( + in_buf_len = in_buf.len(), + out_buf_len = out_buf.len(), + remain_in_internal_buffer = self.remain_in_internal_buffer, + out_pos = self.out_pos, + "DeflateDec::decompress", + ); + let mut outcome: DecompressOutcome = Default::default(); self.copy_to_outbuf(out_buf, &mut outcome); if outcome.bytes_written > 0 { + tracing::trace!( + "returning {} bytes from internal buffer", + outcome.bytes_written + ); return Ok(outcome); } From ecfd942e7da83a9857eabd251eb9223d3f39f1a2 Mon Sep 17 00:00:00 2001 From: Amos 
Wenger Date: Fri, 2 Feb 2024 17:52:08 +0100 Subject: [PATCH 31/49] More sleuthing --- rc-zip/src/fsm/entry.rs | 60 ++++++++++++++++++++++------- rc-zip/src/fsm/entry/deflate_dec.rs | 13 ++++--- rc-zip/src/fsm/entry/store_dec.rs | 3 +- 3 files changed, 56 insertions(+), 20 deletions(-) diff --git a/rc-zip/src/fsm/entry.rs b/rc-zip/src/fsm/entry.rs index b2faa92..dd0e23a 100644 --- a/rc-zip/src/fsm/entry.rs +++ b/rc-zip/src/fsm/entry.rs @@ -1,6 +1,8 @@ // FIXME: remove #![allow(unused)] +use std::cmp; + use oval::Buffer; use tracing::trace; use winnow::{ @@ -34,8 +36,11 @@ enum State { /// The local file header for this entry header: LocalFileHeaderRecord, - /// Amount of data we have decompressed so far - uncompressed_size: u64, + /// Amount of bytes we've fed to the decompressor + compressed_bytes: u64, + + /// Amount of bytes the decompressor has produced + uncompressed_bytes: u64, /// CRC32 hash of the decompressed data hasher: crc32fast::Hasher, @@ -140,7 +145,8 @@ impl EntryFsm { self.buffer.consume(consumed); self.state = S::ReadData { header, - uncompressed_size: 0, + compressed_bytes: 0, + uncompressed_bytes: 0, hasher: crc32fast::Hasher::new(), decompressor: AnyDecompressor::new(self.method)?, }; @@ -154,22 +160,36 @@ impl EntryFsm { } S::ReadData { header, - uncompressed_size, + compressed_bytes, + uncompressed_bytes, hasher, decompressor, } => { let in_buf = self.buffer.data(); - let is_flushing = in_buf.is_empty(); - let outcome = decompressor.decompress(in_buf, out)?; + + // don't feed the decompressor bytes beyond the entry's compressed size + let in_buf_max_len = cmp::min( + in_buf.len(), + self.entry.compressed_size as usize - *compressed_bytes as usize, + ); + let in_buf = &in_buf[..in_buf_max_len]; + + let has_more_input = if *compressed_bytes == self.entry.compressed_size as _ { + HasMoreInput::No + } else { + HasMoreInput::Yes + }; + let outcome = decompressor.decompress(in_buf, out, has_more_input)?; self.buffer.consume(outcome.bytes_read); + 
*compressed_bytes += outcome.bytes_read as u64; if outcome.bytes_written == 0 && self.eof { // we're done, let's read the data descriptor (if there's one) - transition!(self.state => (S::ReadData { header, uncompressed_size, hasher, decompressor }) { + transition!(self.state => (S::ReadData { header, compressed_bytes, uncompressed_bytes, hasher, decompressor }) { S::ReadDataDescriptor { header, metrics: EntryReadMetrics { - uncompressed_size, + uncompressed_size: uncompressed_bytes, crc32: hasher.finalize(), }, } @@ -278,9 +298,18 @@ pub struct DecompressOutcome { pub bytes_written: usize, } +pub enum HasMoreInput { + Yes, + No, +} + trait Decompressor { - fn decompress(&mut self, in_buf: &[u8], out_buf: &mut [u8]) - -> Result; + fn decompress( + &mut self, + in_buf: &[u8], + out_buf: &mut [u8], + has_more_input: HasMoreInput, + ) -> Result; } impl AnyDecompressor { @@ -305,12 +334,17 @@ impl AnyDecompressor { } #[inline] - fn decompress(&mut self, in_buf: &[u8], out: &mut [u8]) -> Result { + fn decompress( + &mut self, + in_buf: &[u8], + out: &mut [u8], + has_more_input: HasMoreInput, + ) -> Result { /// forward to the appropriate decompressor match self { - Self::Store(dec) => dec.decompress(in_buf, out), + Self::Store(dec) => dec.decompress(in_buf, out, has_more_input), #[cfg(feature = "deflate")] - Self::Deflate(dec) => dec.decompress(in_buf, out), + Self::Deflate(dec) => dec.decompress(in_buf, out, has_more_input), } } } diff --git a/rc-zip/src/fsm/entry/deflate_dec.rs b/rc-zip/src/fsm/entry/deflate_dec.rs index c32cd8b..95cd39a 100644 --- a/rc-zip/src/fsm/entry/deflate_dec.rs +++ b/rc-zip/src/fsm/entry/deflate_dec.rs @@ -3,14 +3,16 @@ use std::cmp; use miniz_oxide::inflate::{ core::{ decompress, - inflate_flags::{TINFL_FLAG_IGNORE_ADLER32, TINFL_FLAG_PARSE_ZLIB_HEADER}, + inflate_flags::{ + TINFL_FLAG_HAS_MORE_INPUT, TINFL_FLAG_IGNORE_ADLER32, TINFL_FLAG_PARSE_ZLIB_HEADER, + }, DecompressorOxide, }, TINFLStatus, }; use tracing::trace; -use 
crate::{error::Error, parse::Method}; +use crate::{error::Error, fsm::entry::HasMoreInput, parse::Method}; use super::{DecompressOutcome, Decompressor}; @@ -53,6 +55,7 @@ impl Decompressor for DeflateDec { &mut self, in_buf: &[u8], out_buf: &mut [u8], + has_more_input: HasMoreInput, ) -> Result { tracing::trace!( in_buf_len = in_buf.len(), @@ -75,10 +78,8 @@ impl Decompressor for DeflateDec { // no output bytes, let's call miniz_oxide let mut flags = TINFL_FLAG_IGNORE_ADLER32; - if in_buf.is_empty() { - // `Decompressor` invariant: if in_buf is empty, we're at EOF - } else { - flags |= TINFL_FLAG_PARSE_ZLIB_HEADER; + if matches!(has_more_input, HasMoreInput::Yes) { + flags |= TINFL_FLAG_HAS_MORE_INPUT; } let (status, bytes_read, bytes_written) = decompress( diff --git a/rc-zip/src/fsm/entry/store_dec.rs b/rc-zip/src/fsm/entry/store_dec.rs index 608343b..784f23d 100644 --- a/rc-zip/src/fsm/entry/store_dec.rs +++ b/rc-zip/src/fsm/entry/store_dec.rs @@ -2,7 +2,7 @@ use std::cmp; use crate::error::Error; -use super::{DecompressOutcome, Decompressor}; +use super::{DecompressOutcome, Decompressor, HasMoreInput}; #[derive(Default)] pub(crate) struct StoreDec; @@ -12,6 +12,7 @@ impl Decompressor for StoreDec { &mut self, in_buf: &[u8], out_buf: &mut [u8], + _has_more_input: HasMoreInput, ) -> Result { let len = cmp::min(in_buf.len(), out_buf.len()); out_buf[..len].copy_from_slice(&in_buf[..len]); From a48f70e775dd80c062670f43518d656db534be44 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 17:53:35 +0100 Subject: [PATCH 32/49] rc-zip sync works with deflate --- rc-zip/src/fsm/entry.rs | 17 ++++++++++------- rc-zip/src/fsm/entry/deflate_dec.rs | 4 +--- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/rc-zip/src/fsm/entry.rs b/rc-zip/src/fsm/entry.rs index dd0e23a..0f2cbfc 100644 --- a/rc-zip/src/fsm/entry.rs +++ b/rc-zip/src/fsm/entry.rs @@ -1,6 +1,3 @@ -// FIXME: remove -#![allow(unused)] - use std::cmp; use oval::Buffer; @@ -159,11 +156,11 
@@ impl EntryFsm { } } S::ReadData { - header, compressed_bytes, uncompressed_bytes, hasher, decompressor, + .. } => { let in_buf = self.buffer.data(); @@ -185,7 +182,7 @@ impl EntryFsm { if outcome.bytes_written == 0 && self.eof { // we're done, let's read the data descriptor (if there's one) - transition!(self.state => (S::ReadData { header, compressed_bytes, uncompressed_bytes, hasher, decompressor }) { + transition!(self.state => (S::ReadData { header, uncompressed_bytes, hasher, .. }) { S::ReadDataDescriptor { header, metrics: EntryReadMetrics { @@ -196,9 +193,15 @@ impl EntryFsm { }); return self.process(out); } + + // write the decompressed data to the hasher + hasher.update(&out[..outcome.bytes_written]); + // update the number of bytes we've decompressed + *uncompressed_bytes += outcome.bytes_written as u64; + Ok(FsmResult::Continue((self, outcome))) } - S::ReadDataDescriptor { header, metrics } => { + S::ReadDataDescriptor { .. } => { let mut input = Partial::new(self.buffer.data()); match DataDescriptorRecord::mk_parser(self.entry.is_zip64).parse_next(&mut input) { Ok(descriptor) => { @@ -340,7 +343,7 @@ impl AnyDecompressor { out: &mut [u8], has_more_input: HasMoreInput, ) -> Result { - /// forward to the appropriate decompressor + // forward to the appropriate decompressor match self { Self::Store(dec) => dec.decompress(in_buf, out, has_more_input), #[cfg(feature = "deflate")] diff --git a/rc-zip/src/fsm/entry/deflate_dec.rs b/rc-zip/src/fsm/entry/deflate_dec.rs index 95cd39a..427d592 100644 --- a/rc-zip/src/fsm/entry/deflate_dec.rs +++ b/rc-zip/src/fsm/entry/deflate_dec.rs @@ -3,9 +3,7 @@ use std::cmp; use miniz_oxide::inflate::{ core::{ decompress, - inflate_flags::{ - TINFL_FLAG_HAS_MORE_INPUT, TINFL_FLAG_IGNORE_ADLER32, TINFL_FLAG_PARSE_ZLIB_HEADER, - }, + inflate_flags::{TINFL_FLAG_HAS_MORE_INPUT, TINFL_FLAG_IGNORE_ADLER32}, DecompressorOxide, }, TINFLStatus, From 53cd358099a1c19b08a90aef0f38f59fa4974201 Mon Sep 17 00:00:00 2001 From: Amos Wenger 
Date: Fri, 2 Feb 2024 18:08:56 +0100 Subject: [PATCH 33/49] Make rc-zip-tokio use EntryFsm --- Cargo.lock | 39 --- rc-zip-sync/src/entry_reader/mod.rs | 14 +- rc-zip-sync/src/read_zip.rs | 5 +- rc-zip-tokio/Cargo.toml | 7 +- rc-zip-tokio/src/async_read_zip.rs | 5 +- rc-zip-tokio/src/decoder.rs | 122 -------- rc-zip-tokio/src/entry_reader.rs | 76 +++++ rc-zip-tokio/src/entry_reader/deflate_dec.rs | 21 -- rc-zip-tokio/src/entry_reader/mod.rs | 297 ------------------- rc-zip-tokio/src/lib.rs | 11 - 10 files changed, 86 insertions(+), 511 deletions(-) delete mode 100644 rc-zip-tokio/src/decoder.rs create mode 100644 rc-zip-tokio/src/entry_reader.rs delete mode 100644 rc-zip-tokio/src/entry_reader/deflate_dec.rs delete mode 100644 rc-zip-tokio/src/entry_reader/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 84a0f6d..4f15a87 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -89,24 +89,6 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "async-compression" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a116f46a969224200a0a97f29cfd4c50e7534e4b4826bd23ea2c3c533039c82c" -dependencies = [ - "bzip2", - "deflate64", - "flate2", - "futures-core", - "memchr", - "pin-project-lite", - "tokio", - "xz2", - "zstd", - "zstd-safe", -] - [[package]] name = "autocfg" version = "1.1.0" @@ -569,17 +551,6 @@ dependencies = [ "crc", ] -[[package]] -name = "lzma-sys" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "matchers" version = "0.1.0" @@ -867,7 +838,6 @@ dependencies = [ name = "rc-zip-tokio" version = "2.0.1" dependencies = [ - "async-compression", "cfg-if", "crc32fast", "futures", @@ -1339,15 +1309,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "xz2" -version = "0.1.7" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" -dependencies = [ - "lzma-sys", -] - [[package]] name = "zstd" version = "0.13.0" diff --git a/rc-zip-sync/src/entry_reader/mod.rs b/rc-zip-sync/src/entry_reader/mod.rs index 975cd96..41d11b0 100644 --- a/rc-zip-sync/src/entry_reader/mod.rs +++ b/rc-zip-sync/src/entry_reader/mod.rs @@ -241,12 +241,9 @@ where { const DEFAULT_BUFFER_SIZE: usize = 256 * 1024; - pub(crate) fn new(entry: &StoredEntry, get_reader: F) -> Self - where - F: Fn(u64) -> R, - { + pub(crate) fn new(entry: &StoredEntry, rd: R) -> Self { Self { - rd: get_reader(entry.header_offset), + rd, eof: false, state: State::ReadLocalHeader { buffer: Buffer::with_capacity(Self::DEFAULT_BUFFER_SIZE), @@ -328,12 +325,9 @@ impl FsmEntryReader where R: io::Read, { - pub(crate) fn new(entry: &StoredEntry, get_reader: F) -> Self - where - F: Fn(u64) -> R, - { + pub(crate) fn new(entry: &StoredEntry, rd: R) -> Self { Self { - rd: get_reader(entry.header_offset), + rd, fsm: Some(EntryFsm::new(entry.method(), entry.inner)), } } diff --git a/rc-zip-sync/src/read_zip.rs b/rc-zip-sync/src/read_zip.rs index 6537041..fa234ea 100644 --- a/rc-zip-sync/src/read_zip.rs +++ b/rc-zip-sync/src/read_zip.rs @@ -153,12 +153,13 @@ where { /// Returns a reader for the entry. pub fn reader(&self) -> impl Read + 'a { - EntryReader::new(self.entry, |offset| self.file.cursor_at(offset)) + // FIXME: replace with `fsm_reader`` + EntryReader::new(self.entry, self.file.cursor_at(self.entry.header_offset)) } /// Returns an fsm-based reader for the entry pub fn fsm_reader(&self) -> impl Read + 'a { - FsmEntryReader::new(self.entry, |offset| self.file.cursor_at(offset)) + FsmEntryReader::new(self.entry, self.file.cursor_at(self.entry.header_offset)) } /// Reads the entire entry into a vector. 
diff --git a/rc-zip-tokio/Cargo.toml b/rc-zip-tokio/Cargo.toml index 99e8df9..bea183b 100644 --- a/rc-zip-tokio/Cargo.toml +++ b/rc-zip-tokio/Cargo.toml @@ -21,7 +21,6 @@ positioned-io = { version = "0.3.3" } tokio = { version = "1.35.1", features = ["fs", "io-util", "rt-multi-thread"] } futures = { version = "0.3.30" } pin-project-lite = { version = "0.2.13" } -async-compression = { version = "0.4.6", features = ["tokio"] } oval = "2.0.0" crc32fast = "1.3.2" tracing = "0.1.40" @@ -30,9 +29,5 @@ winnow = "0.5.36" [features] default = ["deflate"] -deflate = ["async-compression/deflate"] -deflate64 = ["async-compression/deflate64"] -lzma = ["async-compression/lzma"] -bzip2 = ["async-compression/bzip2"] -zstd = ["async-compression/zstd"] +deflate = ["rc-zip/deflate"] diff --git a/rc-zip-tokio/src/async_read_zip.rs b/rc-zip-tokio/src/async_read_zip.rs index f096b19..acca9ee 100644 --- a/rc-zip-tokio/src/async_read_zip.rs +++ b/rc-zip-tokio/src/async_read_zip.rs @@ -10,7 +10,7 @@ use rc_zip::{ parse::{Archive, StoredEntry}, }; -use crate::entry_reader::AsyncEntryReader; +use crate::entry_reader::EntryReader; /// A trait for reading something as a zip archive. /// @@ -156,8 +156,7 @@ where { /// Returns a reader for the entry. pub fn reader(&self) -> impl AsyncRead + Unpin + '_ { - tracing::trace!("Creating EntryReader"); - AsyncEntryReader::new(self.entry, |offset| self.file.cursor_at(offset)) + EntryReader::new(self.entry, |offset| self.file.cursor_at(offset)) } /// Reads the entire entry into a vector. diff --git a/rc-zip-tokio/src/decoder.rs b/rc-zip-tokio/src/decoder.rs deleted file mode 100644 index 0c6c71f..0000000 --- a/rc-zip-tokio/src/decoder.rs +++ /dev/null @@ -1,122 +0,0 @@ -use std::{cmp, io, pin::Pin, task}; - -use oval::Buffer; -use tokio::io::{AsyncBufRead, AsyncRead}; - -/// Only allows reading a fixed number of bytes from a [oval::Buffer], -/// used for reading the raw (compressed) data for a single zip file entry. 
-/// It also allows moving out the inner buffer afterwards. -pub(crate) struct RawEntryReader { - remaining: u64, - inner: Buffer, -} - -impl RawEntryReader { - pub(crate) fn new(inner: Buffer, entry_size: u64) -> Self { - Self { - inner, - remaining: entry_size, - } - } - - pub(crate) fn into_inner(self) -> Buffer { - self.inner - } - - pub(crate) fn get_mut(&mut self) -> &mut Buffer { - &mut self.inner - } -} - -pub(crate) trait AsyncDecoder: AsyncRead -where - R: AsyncRead, -{ - /// Moves the inner reader out of this decoder. - /// self is boxed because decoders are typically used as trait objects. - fn into_inner(self: Box) -> R; - - /// Returns a mutable reference to the inner reader. - fn get_mut(&mut self) -> &mut R; -} - -pin_project_lite::pin_project! { - pub(crate) struct StoreAsyncDecoder - where - R: AsyncRead, - { - #[pin] - inner: R, - } -} - -impl StoreAsyncDecoder -where - R: AsyncRead, -{ - pub(crate) fn new(inner: R) -> Self { - Self { inner } - } -} - -impl AsyncRead for StoreAsyncDecoder -where - R: AsyncRead, -{ - fn poll_read( - self: Pin<&mut Self>, - cx: &mut task::Context<'_>, - buf: &mut tokio::io::ReadBuf<'_>, - ) -> task::Poll> { - let this = self.project(); - this.inner.poll_read(cx, buf) - } -} - -impl AsyncDecoder for StoreAsyncDecoder -where - R: AsyncRead, -{ - fn into_inner(self: Box) -> R { - self.inner - } - - fn get_mut(&mut self) -> &mut R { - &mut self.inner - } -} - -impl AsyncBufRead for RawEntryReader { - fn consume(mut self: Pin<&mut Self>, amt: usize) { - self.as_mut().remaining -= amt as u64; - Buffer::consume(&mut self.inner, amt); - } - - fn poll_fill_buf( - self: Pin<&mut Self>, - _cx: &mut task::Context<'_>, - ) -> task::Poll> { - let max_avail = cmp::min(self.remaining, self.inner.available_data() as u64); - Ok(self.get_mut().inner.data()[..max_avail as _].as_ref()).into() - } -} - -impl AsyncRead for RawEntryReader { - fn poll_read( - mut self: Pin<&mut Self>, - _cx: &mut task::Context<'_>, - buf: &mut 
tokio::io::ReadBuf<'_>, - ) -> task::Poll> { - let len = cmp::min( - buf.remaining() as u64, - cmp::min(self.remaining, self.inner.available_data() as _), - ) as usize; - tracing::trace!(%len, buf_remaining = buf.remaining(), remaining = self.remaining, available_data = self.inner.available_data(), available_space = self.inner.available_space(), "computing len"); - - buf.put_slice(&self.inner.data()[..len]); - self.as_mut().inner.consume(len); - self.remaining -= len as u64; - - Ok(()).into() - } -} diff --git a/rc-zip-tokio/src/entry_reader.rs b/rc-zip-tokio/src/entry_reader.rs new file mode 100644 index 0000000..ea370b8 --- /dev/null +++ b/rc-zip-tokio/src/entry_reader.rs @@ -0,0 +1,76 @@ +use std::{pin::Pin, task}; + +use pin_project_lite::pin_project; +use rc_zip::{ + fsm::{EntryFsm, FsmResult}, + parse::StoredEntry, +}; +use tokio::io::{AsyncRead, ReadBuf}; + +pin_project! { + pub(crate) struct EntryReader + where + R: AsyncRead, + { + #[pin] + rd: R, + fsm: Option, + } +} + +impl EntryReader +where + R: AsyncRead, +{ + pub(crate) fn new(entry: &StoredEntry, get_reader: F) -> Self + where + F: Fn(u64) -> R, + { + Self { + rd: get_reader(entry.header_offset), + fsm: Some(EntryFsm::new(entry.method(), entry.inner)), + } + } +} + +impl AsyncRead for EntryReader +where + R: AsyncRead, +{ + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut task::Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> task::Poll> { + let this = self.as_mut().project(); + + let mut fsm = match this.fsm.take() { + Some(fsm) => fsm, + None => return Ok(()).into(), + }; + + if fsm.wants_read() { + tracing::trace!("fsm wants read"); + + let mut buf = ReadBuf::new(fsm.space()); + futures::ready!(this.rd.poll_read(cx, &mut buf))?; + let n = buf.filled().len(); + + tracing::trace!("read {} bytes", n); + fsm.fill(n); + } else { + tracing::trace!("fsm does not want read"); + } + + match fsm.process(buf.initialize_unfilled())? 
{ + FsmResult::Continue((fsm, outcome)) => { + *this.fsm = Some(fsm); + buf.advance(outcome.bytes_written); + } + FsmResult::Done(()) => { + // neat! + } + } + Ok(()).into() + } +} diff --git a/rc-zip-tokio/src/entry_reader/deflate_dec.rs b/rc-zip-tokio/src/entry_reader/deflate_dec.rs deleted file mode 100644 index e3034e4..0000000 --- a/rc-zip-tokio/src/entry_reader/deflate_dec.rs +++ /dev/null @@ -1,21 +0,0 @@ -use async_compression::tokio::bufread::DeflateDecoder; -use tokio::io::AsyncBufRead; - -use crate::decoder::{AsyncDecoder, RawEntryReader}; - -impl AsyncDecoder for DeflateDecoder -where - R: AsyncBufRead, -{ - fn into_inner(self: Box) -> R { - Self::into_inner(*self) - } - - fn get_mut(&mut self) -> &mut R { - Self::get_mut(self) - } -} - -pub(crate) fn mk_decoder(r: RawEntryReader) -> impl AsyncDecoder { - DeflateDecoder::new(r) -} diff --git a/rc-zip-tokio/src/entry_reader/mod.rs b/rc-zip-tokio/src/entry_reader/mod.rs deleted file mode 100644 index 88e6d3f..0000000 --- a/rc-zip-tokio/src/entry_reader/mod.rs +++ /dev/null @@ -1,297 +0,0 @@ -#[cfg(feature = "deflate")] -mod deflate_dec; - -use cfg_if::cfg_if; -use oval::Buffer; -use rc_zip::{ - error::{Error, FormatError}, - parse::{DataDescriptorRecord, LocalFileHeaderRecord, Method, StoredEntry, StoredEntryInner}, -}; -use std::{io, pin::Pin, task}; -use tokio::io::AsyncRead; -use tracing::trace; -use winnow::{ - error::ErrMode, - stream::{AsBytes, Offset}, - Parser, Partial, -}; - -use crate::decoder::{AsyncDecoder, RawEntryReader, StoreAsyncDecoder}; - -struct EntryReadMetrics { - uncompressed_size: u64, - crc32: u32, -} - -pin_project_lite::pin_project! 
{ - #[project = StateProj] - #[derive(Default)] - enum State { - ReadLocalHeader { - buffer: Buffer, - }, - ReadData { - hasher: crc32fast::Hasher, - uncompressed_size: u64, - header: LocalFileHeaderRecord, - #[pin] - decoder: Box + Unpin>, - }, - ReadDataDescriptor { - metrics: EntryReadMetrics, - header: LocalFileHeaderRecord, - buffer: Buffer, - }, - Validate { - metrics: EntryReadMetrics, - header: LocalFileHeaderRecord, - descriptor: Option, - }, - Done, - #[default] - Transitioning, - } -} - -pin_project_lite::pin_project! { - pub(crate) struct AsyncEntryReader - where - R: AsyncRead, - { - #[pin] - rd: R, - eof: bool, - #[pin] - state: State, - inner: StoredEntryInner, - method: Method, - } -} - -impl AsyncRead for AsyncEntryReader -where - R: AsyncRead, -{ - fn poll_read( - mut self: Pin<&mut Self>, - cx: &mut task::Context<'_>, - buf: &mut tokio::io::ReadBuf<'_>, - ) -> task::Poll> { - let mut this = self.as_mut().project(); - - use StateProj as S; - match this.state.as_mut().project() { - S::ReadLocalHeader { ref mut buffer } => { - let mut read_buf = tokio::io::ReadBuf::new(buffer.space()); - futures::ready!(this.rd.poll_read(cx, &mut read_buf))?; - let read_bytes = read_buf.filled().len(); - if read_bytes == 0 { - return Err(io::ErrorKind::UnexpectedEof.into()).into(); - } - buffer.fill(read_bytes); - - let mut input = Partial::new(buffer.data()); - match LocalFileHeaderRecord::parser.parse_next(&mut input) { - Ok(header) => { - buffer.consume(input.as_bytes().offset_from(&buffer.data())); - - trace!("local file header: {:#?}", header); - transition_async!(this.state => (State::ReadLocalHeader { buffer }) { - let decoder = method_to_decoder(*this.method, RawEntryReader::new(buffer, this.inner.compressed_size))?; - - State::ReadData { - hasher: crc32fast::Hasher::new(), - uncompressed_size: 0, - decoder, - header, - } - }); - self.poll_read(cx, buf) - } - Err(ErrMode::Incomplete(_)) => { - // try another read - if it returns pending, it'll be propagated 
- self.poll_read(cx, buf) - } - Err(_e) => Err(Error::Format(FormatError::InvalidLocalHeader).into()).into(), - } - } - S::ReadData { - ref mut uncompressed_size, - ref mut decoder, - ref mut hasher, - .. - } => { - { - let buffer = decoder.as_mut().get_mut().get_mut().get_mut(); - if !*this.eof && buffer.available_data() == 0 { - if buffer.available_space() == 0 { - buffer.shift(); - } - - let mut read_buf = tokio::io::ReadBuf::new(buffer.space()); - futures::ready!(this.rd.poll_read(cx, &mut read_buf))?; - match read_buf.filled().len() { - 0 => { - *this.eof = true; - } - n => { - buffer.fill(n); - } - } - } - } - - let filled_before = buf.filled().len(); - futures::ready!(decoder.as_mut().poll_read(cx, buf))?; - let filled_after = buf.filled().len(); - let read_bytes = filled_after - filled_before; - - match read_bytes { - 0 => { - transition_async!(this.state => (State::ReadData { decoder, header, hasher, uncompressed_size, .. }) { - let limited_reader = decoder.into_inner(); - let buffer = limited_reader.into_inner(); - let metrics = EntryReadMetrics { - crc32: hasher.finalize(), - uncompressed_size, - }; - if header.has_data_descriptor() { - trace!("will read data descriptor (flags = {:x})", header.flags); - State::ReadDataDescriptor { metrics, buffer, header } - } else { - trace!("no data descriptor to read"); - State::Validate { metrics, header, descriptor: None } - } - }); - self.poll_read(cx, buf) - } - n => { - **uncompressed_size += n as u64; - let read_slice = &buf.filled()[filled_before..filled_after]; - hasher.update(read_slice); - Ok(()).into() - } - } - } - S::ReadDataDescriptor { ref mut buffer, .. 
} => { - trace!( - "read data descriptor, avail data = {}, avail space = {}", - buffer.available_data(), - buffer.available_space() - ); - - let mut input = Partial::new(buffer.data()); - match DataDescriptorRecord::mk_parser(this.inner.is_zip64).parse_next(&mut input) { - Ok(descriptor) => { - buffer.consume(input.as_bytes().offset_from(&buffer.data())); - trace!("data descriptor = {:#?}", descriptor); - transition_async!(this.state => (State::ReadDataDescriptor { metrics, header, .. }) { - State::Validate { metrics, header, descriptor: Some(descriptor) } - }); - self.poll_read(cx, buf) - } - Err(ErrMode::Incomplete(_)) => { - let mut read_buf = tokio::io::ReadBuf::new(buffer.space()); - futures::ready!(this.rd.poll_read(cx, &mut read_buf))?; - let read_bytes = read_buf.filled().len(); - if read_bytes == 0 { - return Err(io::ErrorKind::UnexpectedEof.into()).into(); - } - buffer.fill(read_bytes); - self.poll_read(cx, buf) - } - Err(_e) => Err(Error::Format(FormatError::InvalidLocalHeader).into()).into(), - } - } - S::Validate { - ref metrics, - ref header, - ref descriptor, - } => { - let expected_crc32 = if this.inner.crc32 != 0 { - this.inner.crc32 - } else if let Some(descriptor) = descriptor.as_ref() { - descriptor.crc32 - } else { - header.crc32 - }; - - let expected_size = if this.inner.uncompressed_size != 0 { - this.inner.uncompressed_size - } else if let Some(descriptor) = descriptor.as_ref() { - descriptor.uncompressed_size - } else { - header.uncompressed_size as u64 - }; - - if expected_size != metrics.uncompressed_size { - return Err(Error::Format(FormatError::WrongSize { - expected: expected_size, - actual: metrics.uncompressed_size, - }) - .into()) - .into(); - } - - if expected_crc32 != 0 && expected_crc32 != metrics.crc32 { - return Err(Error::Format(FormatError::WrongChecksum { - expected: expected_crc32, - actual: metrics.crc32, - }) - .into()) - .into(); - } - - *this.state.as_mut().get_mut() = State::Done; - self.poll_read(cx, buf) - } - 
S::Done => Ok(()).into(), - S::Transitioning => unreachable!(), - } - } -} - -impl AsyncEntryReader -where - R: AsyncRead, -{ - const DEFAULT_BUFFER_SIZE: usize = 256 * 1024; - - pub fn new(entry: &StoredEntry, get_reader: F) -> Self - where - F: Fn(u64) -> R, - { - Self { - rd: get_reader(entry.header_offset), - eof: false, - state: State::ReadLocalHeader { - buffer: Buffer::with_capacity(Self::DEFAULT_BUFFER_SIZE), - }, - method: entry.method(), - inner: entry.inner, - } - } -} - -fn method_to_decoder( - method: Method, - raw_r: RawEntryReader, -) -> Result + Unpin>, Error> { - let decoder: Box + Unpin> = match method { - Method::Store => Box::new(StoreAsyncDecoder::new(raw_r)), - Method::Deflate => { - cfg_if! { - if #[cfg(feature = "deflate")] { - Box::new(deflate_dec::mk_decoder(raw_r)) - } else { - return Err(Error::method_not_enabled(method)); - } - } - } - method => { - return Err(Error::method_not_supported(method)); - } - }; - - Ok(decoder) -} diff --git a/rc-zip-tokio/src/lib.rs b/rc-zip-tokio/src/lib.rs index e2ae709..071204a 100644 --- a/rc-zip-tokio/src/lib.rs +++ b/rc-zip-tokio/src/lib.rs @@ -7,18 +7,7 @@ #![warn(missing_docs)] -macro_rules! 
transition_async { - ($state: expr => ($pattern: pat) $body: expr) => { - *$state.as_mut() = if let $pattern = std::mem::take($state.as_mut().get_mut()) { - $body - } else { - unreachable!() - }; - }; -} - mod async_read_zip; -mod decoder; mod entry_reader; // re-exports From 2a30e7ff3382dee8897f5231aeb59529e3949a4b Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 18:23:54 +0100 Subject: [PATCH 34/49] Move corpus to rc-zip --- .vscode/settings.json | 3 +- rc-zip-sync/Cargo.toml | 1 + rc-zip-sync/tests/integration_tests.rs | 372 +++++-------------------- rc-zip/Cargo.toml | 1 + rc-zip/src/corpus/mod.rs | 223 +++++++++++++++ rc-zip/src/lib.rs | 3 + 6 files changed, 305 insertions(+), 298 deletions(-) create mode 100644 rc-zip/src/corpus/mod.rs diff --git a/.vscode/settings.json b/.vscode/settings.json index 2e6e863..f843d7b 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,6 +1,7 @@ { "rust-analyzer.cargo.features": [ - "rc-zip/deflate" + "rc-zip/deflate", + "rc-zip/corpus" ], "rust-analyzer.linkedProjects": [ "./Cargo.toml" diff --git a/rc-zip-sync/Cargo.toml b/rc-zip-sync/Cargo.toml index 68d46ec..d6e779d 100644 --- a/rc-zip-sync/Cargo.toml +++ b/rc-zip-sync/Cargo.toml @@ -49,3 +49,4 @@ humansize = "2.1.3" indicatif = "0.17.7" test-log = { version = "0.2.14", default-features = false, features = ["tracing-subscriber", "trace"] } tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } +rc-zip = { version = "2.0.1", path = "../rc-zip", features = ["corpus"] } diff --git a/rc-zip-sync/tests/integration_tests.rs b/rc-zip-sync/tests/integration_tests.rs index 2a30d04..abed241 100644 --- a/rc-zip-sync/tests/integration_tests.rs +++ b/rc-zip-sync/tests/integration_tests.rs @@ -1,327 +1,105 @@ -use chrono::{ - offset::{FixedOffset, Utc}, - DateTime, TimeZone, Timelike, -}; use rc_zip::{ - encoding::Encoding, + corpus::{self, zips_dir, FileContent, ZipTest, ZipTestFile}, error::Error, parse::{Archive, EntryContents}, }; use 
rc_zip_sync::{HasCursor, ReadZip, SyncArchive, SyncStoredEntry}; -use std::{cmp, fs::File, path::PathBuf}; - -enum ZipSource { - File(&'static str), - Func(&'static str, Box Vec>), -} +use std::{cmp, fs::File}; -struct ZipTest { - source: ZipSource, - expected_encoding: Option, - comment: Option<&'static str>, - files: Vec, - error: Option, -} +fn check_case(test: &ZipTest, archive: Result, Error>) { + let case_bytes = test.bytes(); -impl Default for ZipTest { - fn default() -> Self { - Self { - source: ZipSource::Func("default.zip", Box::new(|| unreachable!())), - expected_encoding: None, - comment: None, - files: vec![], - error: None, - } + if let Some(expected) = &test.error { + let actual = match archive { + Err(e) => e, + Ok(_) => panic!("should have failed"), + }; + let expected = format!("{:#?}", expected); + let actual = format!("{:#?}", actual); + assert_eq!(expected, actual); + return; } -} + let archive = archive.unwrap(); -impl ZipTest { - fn check(&self, archive: Result, Error>) { - let case_bytes = self.bytes(); + assert_eq!(case_bytes.len() as u64, archive.size()); - if let Some(expected) = &self.error { - let actual = match archive { - Err(e) => e, - Ok(_) => panic!("should have failed"), - }; - let expected = format!("{:#?}", expected); - let actual = format!("{:#?}", actual); - assert_eq!(expected, actual); - return; - } - let archive = archive.unwrap(); - - assert_eq!(case_bytes.len() as u64, archive.size()); - - if let Some(expected) = self.comment { - assert_eq!(expected, archive.comment().expect("should have comment")) - } + if let Some(expected) = test.comment { + assert_eq!(expected, archive.comment().expect("should have comment")) + } - if let Some(exp_encoding) = self.expected_encoding { - assert_eq!(archive.encoding(), exp_encoding); - } + if let Some(exp_encoding) = test.expected_encoding { + assert_eq!(archive.encoding(), exp_encoding); + } - assert_eq!( - self.files.len(), - archive.entries().count(), - "{} should have {} entries 
files", - self.name(), - self.files.len() - ); + assert_eq!( + test.files.len(), + archive.entries().count(), + "{} should have {} entries files", + test.name(), + test.files.len() + ); - for f in &self.files { - f.check(&archive); - } + for f in &test.files { + check_file(f, &archive); } } -struct ZipTestFile { - name: &'static str, - mode: Option, - modified: Option>, - content: FileContent, -} +fn check_file(file: &ZipTestFile, archive: &SyncArchive<'_, F>) { + let entry = archive + .by_name(file.name) + .unwrap_or_else(|| panic!("entry {} should exist", file.name)); -impl ZipTestFile { - fn check(&self, archive: &SyncArchive<'_, F>) { - let entry = archive - .by_name(self.name) - .unwrap_or_else(|| panic!("entry {} should exist", self.name)); + let archive_inner: &Archive = archive; + let entry_inner = archive_inner.by_name(file.name).unwrap(); + assert_eq!(entry.name(), entry_inner.name()); - let archive_inner: &Archive = archive; - let entry_inner = archive_inner.by_name(self.name).unwrap(); - assert_eq!(entry.name(), entry_inner.name()); + check_file_against(file, entry) +} - self.check_against(entry); +fn check_file_against(file: &ZipTestFile, entry: SyncStoredEntry<'_, F>) { + if let Some(expected) = file.modified { + assert_eq!( + expected, + entry.modified(), + "entry {} should have modified = {:?}", + entry.name(), + expected + ) } - fn check_against(&self, entry: SyncStoredEntry<'_, F>) { - if let Some(expected) = self.modified { - assert_eq!( - expected, - entry.modified(), - "entry {} should have modified = {:?}", - entry.name(), - expected - ) - } - - if let Some(mode) = self.mode { - assert_eq!(entry.mode.0 & 0o777, mode); - } + if let Some(mode) = file.mode { + assert_eq!(entry.mode.0 & 0o777, mode); + } - // I have honestly yet to see a zip file _entry_ with a comment. - assert!(entry.comment().is_none()); + // I have honestly yet to see a zip file _entry_ with a comment. 
+ assert!(entry.comment().is_none()); - match entry.contents() { - EntryContents::File => { - let actual_bytes = entry.bytes().unwrap(); + match entry.contents() { + EntryContents::File => { + let actual_bytes = entry.bytes().unwrap(); - match &self.content { - FileContent::Unchecked => { - // ah well - } - FileContent::Bytes(expected_bytes) => { - // first check length - assert_eq!(actual_bytes.len(), expected_bytes.len()); - assert_eq!(&actual_bytes[..], &expected_bytes[..]) - } - FileContent::File(file_path) => { - let expected_bytes = std::fs::read(zips_dir().join(file_path)).unwrap(); - // first check length - assert_eq!(actual_bytes.len(), expected_bytes.len()); - assert_eq!(&actual_bytes[..], &expected_bytes[..]) - } + match &file.content { + FileContent::Unchecked => { + // ah well + } + FileContent::Bytes(expected_bytes) => { + // first check length + assert_eq!(actual_bytes.len(), expected_bytes.len()); + assert_eq!(&actual_bytes[..], &expected_bytes[..]) + } + FileContent::File(file_path) => { + let expected_bytes = std::fs::read(zips_dir().join(file_path)).unwrap(); + // first check length + assert_eq!(actual_bytes.len(), expected_bytes.len()); + assert_eq!(&actual_bytes[..], &expected_bytes[..]) } } - EntryContents::Symlink | EntryContents::Directory => { - assert!(matches!(self.content, FileContent::Unchecked)); - } - } - } -} - -enum FileContent { - Unchecked, - Bytes(Vec), - File(&'static str), -} - -impl Default for ZipTestFile { - fn default() -> Self { - Self { - name: "default", - mode: None, - modified: None, - content: FileContent::Unchecked, } - } -} - -impl ZipTest { - fn name(&self) -> &'static str { - match &self.source { - ZipSource::File(name) => name, - ZipSource::Func(name, _f) => name, + EntryContents::Symlink | EntryContents::Directory => { + assert!(matches!(file.content, FileContent::Unchecked)); } } - - // Read source archive from disk - fn bytes(&self) -> Vec { - match &self.source { - ZipSource::File(name) => 
std::fs::read(zips_dir().join(name)).unwrap(), - ZipSource::Func(_name, f) => f(), - } - } -} - -fn zips_dir() -> PathBuf { - PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .parent() - .unwrap() - .join("testdata") -} - -fn time_zone(hours: i32) -> FixedOffset { - FixedOffset::east_opt(hours * 3600).unwrap() -} - -fn date( - (year, month, day): (i32, u32, u32), - (hour, min, sec): (u32, u32, u32), - nsec: u32, - offset: FixedOffset, -) -> Option> { - Some( - offset - .with_ymd_and_hms(year, month, day, hour, min, sec) - .single()? - .with_nanosecond(nsec)? - .into(), - ) -} - -fn test_cases() -> Vec { - vec![ - ZipTest { - source: ZipSource::File("zip64.zip"), - files: vec![ZipTestFile { - name: "README", - content: FileContent::Bytes( - "This small file is in ZIP64 format.\n".as_bytes().into(), - ), - modified: Some(date((2012, 8, 10), (14, 33, 32), 0, time_zone(0)).unwrap()), - mode: Some(0o644), - }], - ..Default::default() - }, - ZipTest { - source: ZipSource::File("test.zip"), - comment: Some("This is a zipfile comment."), - expected_encoding: Some(Encoding::Utf8), - files: vec![ - ZipTestFile { - name: "test.txt", - content: FileContent::Bytes("This is a test text file.\n".as_bytes().into()), - modified: Some(date((2010, 9, 5), (12, 12, 1), 0, time_zone(10)).unwrap()), - mode: Some(0o644), - }, - ZipTestFile { - name: "gophercolor16x16.png", - content: FileContent::File("gophercolor16x16.png"), - modified: Some(date((2010, 9, 5), (15, 52, 58), 0, time_zone(10)).unwrap()), - mode: Some(0o644), - }, - ], - ..Default::default() - }, - ZipTest { - source: ZipSource::File("cp-437.zip"), - expected_encoding: Some(Encoding::Cp437), - files: vec![ZipTestFile { - name: "français", - ..Default::default() - }], - ..Default::default() - }, - ZipTest { - source: ZipSource::File("shift-jis.zip"), - expected_encoding: Some(Encoding::ShiftJis), - files: vec![ - ZipTestFile { - name: "should-be-jis/", - ..Default::default() - }, - ZipTestFile { - name: 
"should-be-jis/ot_運命のワルツネぞなぞ小さな楽しみ遊びま.longboi", - ..Default::default() - }, - ], - ..Default::default() - }, - ZipTest { - source: ZipSource::File("utf8-winrar.zip"), - expected_encoding: Some(Encoding::Utf8), - files: vec![ZipTestFile { - name: "世界", - content: FileContent::Bytes(vec![]), - modified: Some(date((2017, 11, 6), (21, 9, 27), 867862500, time_zone(0)).unwrap()), - ..Default::default() - }], - ..Default::default() - }, - #[cfg(feature = "lzma")] - ZipTest { - source: ZipSource::File("found-me-lzma.zip"), - expected_encoding: Some(Encoding::Utf8), - files: vec![ZipTestFile { - name: "found-me.txt", - content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), - modified: Some(date((2024, 1, 26), (16, 14, 35), 46003100, time_zone(0)).unwrap()), - ..Default::default() - }], - ..Default::default() - }, - #[cfg(feature = "deflate64")] - ZipTest { - source: ZipSource::File("found-me-deflate64.zip"), - expected_encoding: Some(Encoding::Utf8), - files: vec![ZipTestFile { - name: "found-me.txt", - content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), - modified: Some(date((2024, 1, 26), (16, 14, 35), 46003100, time_zone(0)).unwrap()), - ..Default::default() - }], - ..Default::default() - }, - // same with bzip2 - #[cfg(feature = "bzip2")] - ZipTest { - source: ZipSource::File("found-me-bzip2.zip"), - expected_encoding: Some(Encoding::Utf8), - files: vec![ZipTestFile { - name: "found-me.txt", - content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), - modified: Some(date((2024, 1, 26), (16, 14, 35), 46003100, time_zone(0)).unwrap()), - ..Default::default() - }], - ..Default::default() - }, - // same with zstd - #[cfg(feature = "zstd")] - ZipTest { - source: ZipSource::File("found-me-zstd.zip"), - expected_encoding: Some(Encoding::Utf8), - files: vec![ZipTestFile { - name: "found-me.txt", - content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), - modified: Some(date((2024, 1, 31), (6, 10, 25), 
800491400, time_zone(0)).unwrap()), - ..Default::default() - }], - ..Default::default() - }, - ] } #[test_log::test] @@ -341,9 +119,9 @@ fn read_from_file() { #[test_log::test] fn real_world_files() { - for case in test_cases() { + for case in corpus::test_cases() { tracing::trace!("============ testing {}", case.name()); - case.check(case.bytes().read_zip()); + check_case(&case, case.bytes().read_zip()) } } @@ -351,7 +129,7 @@ fn real_world_files() { fn state_machine() { use rc_zip::fsm::{ArchiveFsm, FsmResult}; - let cases = test_cases(); + let cases = corpus::test_cases(); let case = cases.iter().find(|x| x.name() == "zip64.zip").unwrap(); let bs = case.bytes(); let mut fsm = ArchiveFsm::new(bs.len() as u64); diff --git a/rc-zip/Cargo.toml b/rc-zip/Cargo.toml index a5f04e7..d3c12ae 100644 --- a/rc-zip/Cargo.toml +++ b/rc-zip/Cargo.toml @@ -32,4 +32,5 @@ miniz_oxide = { version = "0.7.1", optional = true } [features] deflate = ["dep:miniz_oxide"] +corpus = [] diff --git a/rc-zip/src/corpus/mod.rs b/rc-zip/src/corpus/mod.rs new file mode 100644 index 0000000..a20f9e3 --- /dev/null +++ b/rc-zip/src/corpus/mod.rs @@ -0,0 +1,223 @@ +#![allow(missing_docs)] + +//! A corpus of zip files for testing. 
+ +use std::path::PathBuf; + +use chrono::{DateTime, FixedOffset, TimeZone, Timelike, Utc}; + +use crate::{encoding::Encoding, error::Error}; + +pub enum ZipSource { + File(&'static str), + Func(&'static str, Box Vec>), +} + +pub struct ZipTest { + pub source: ZipSource, + pub expected_encoding: Option, + pub comment: Option<&'static str>, + pub files: Vec, + pub error: Option, +} + +impl Default for ZipTest { + fn default() -> Self { + Self { + source: ZipSource::Func("default.zip", Box::new(|| unreachable!())), + expected_encoding: None, + comment: None, + files: vec![], + error: None, + } + } +} + +impl ZipTest { + pub fn name(&self) -> &'static str { + match &self.source { + ZipSource::File(name) => name, + ZipSource::Func(name, _f) => name, + } + } + + // Read source archive from disk + pub fn bytes(&self) -> Vec { + match &self.source { + ZipSource::File(name) => std::fs::read(zips_dir().join(name)).unwrap(), + ZipSource::Func(_name, f) => f(), + } + } +} + +pub struct ZipTestFile { + pub name: &'static str, + pub mode: Option, + pub modified: Option>, + pub content: FileContent, +} + +pub enum FileContent { + Unchecked, + Bytes(Vec), + File(&'static str), +} + +impl Default for ZipTestFile { + fn default() -> Self { + Self { + name: "default", + mode: None, + modified: None, + content: FileContent::Unchecked, + } + } +} + +pub fn zips_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() + .join("testdata") +} + +fn time_zone(hours: i32) -> FixedOffset { + FixedOffset::east_opt(hours * 3600).unwrap() +} + +fn date( + (year, month, day): (i32, u32, u32), + (hour, min, sec): (u32, u32, u32), + nsec: u32, + offset: FixedOffset, +) -> Option> { + Some( + offset + .with_ymd_and_hms(year, month, day, hour, min, sec) + .single()? + .with_nanosecond(nsec)? 
+ .into(), + ) +} + +pub fn test_cases() -> Vec { + vec![ + ZipTest { + source: ZipSource::File("zip64.zip"), + files: vec![ZipTestFile { + name: "README", + content: FileContent::Bytes( + "This small file is in ZIP64 format.\n".as_bytes().into(), + ), + modified: Some(date((2012, 8, 10), (14, 33, 32), 0, time_zone(0)).unwrap()), + mode: Some(0o644), + }], + ..Default::default() + }, + ZipTest { + source: ZipSource::File("test.zip"), + comment: Some("This is a zipfile comment."), + expected_encoding: Some(Encoding::Utf8), + files: vec![ + ZipTestFile { + name: "test.txt", + content: FileContent::Bytes("This is a test text file.\n".as_bytes().into()), + modified: Some(date((2010, 9, 5), (12, 12, 1), 0, time_zone(10)).unwrap()), + mode: Some(0o644), + }, + ZipTestFile { + name: "gophercolor16x16.png", + content: FileContent::File("gophercolor16x16.png"), + modified: Some(date((2010, 9, 5), (15, 52, 58), 0, time_zone(10)).unwrap()), + mode: Some(0o644), + }, + ], + ..Default::default() + }, + ZipTest { + source: ZipSource::File("cp-437.zip"), + expected_encoding: Some(Encoding::Cp437), + files: vec![ZipTestFile { + name: "français", + ..Default::default() + }], + ..Default::default() + }, + ZipTest { + source: ZipSource::File("shift-jis.zip"), + expected_encoding: Some(Encoding::ShiftJis), + files: vec![ + ZipTestFile { + name: "should-be-jis/", + ..Default::default() + }, + ZipTestFile { + name: "should-be-jis/ot_運命のワルツネぞなぞ小さな楽しみ遊びま.longboi", + ..Default::default() + }, + ], + ..Default::default() + }, + ZipTest { + source: ZipSource::File("utf8-winrar.zip"), + expected_encoding: Some(Encoding::Utf8), + files: vec![ZipTestFile { + name: "世界", + content: FileContent::Bytes(vec![]), + modified: Some(date((2017, 11, 6), (21, 9, 27), 867862500, time_zone(0)).unwrap()), + ..Default::default() + }], + ..Default::default() + }, + #[cfg(feature = "lzma")] + ZipTest { + source: ZipSource::File("found-me-lzma.zip"), + expected_encoding: Some(Encoding::Utf8), + files: 
vec![ZipTestFile { + name: "found-me.txt", + content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), + modified: Some(date((2024, 1, 26), (16, 14, 35), 46003100, time_zone(0)).unwrap()), + ..Default::default() + }], + ..Default::default() + }, + #[cfg(feature = "deflate64")] + ZipTest { + source: ZipSource::File("found-me-deflate64.zip"), + expected_encoding: Some(Encoding::Utf8), + files: vec![ZipTestFile { + name: "found-me.txt", + content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), + modified: Some(date((2024, 1, 26), (16, 14, 35), 46003100, time_zone(0)).unwrap()), + ..Default::default() + }], + ..Default::default() + }, + // same with bzip2 + #[cfg(feature = "bzip2")] + ZipTest { + source: ZipSource::File("found-me-bzip2.zip"), + expected_encoding: Some(Encoding::Utf8), + files: vec![ZipTestFile { + name: "found-me.txt", + content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), + modified: Some(date((2024, 1, 26), (16, 14, 35), 46003100, time_zone(0)).unwrap()), + ..Default::default() + }], + ..Default::default() + }, + // same with zstd + #[cfg(feature = "zstd")] + ZipTest { + source: ZipSource::File("found-me-zstd.zip"), + expected_encoding: Some(Encoding::Utf8), + files: vec![ZipTestFile { + name: "found-me.txt", + content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), + modified: Some(date((2024, 1, 31), (6, 10, 25), 800491400, time_zone(0)).unwrap()), + ..Default::default() + }], + ..Default::default() + }, + ] +} diff --git a/rc-zip/src/lib.rs b/rc-zip/src/lib.rs index 4494fc4..a3fb43e 100644 --- a/rc-zip/src/lib.rs +++ b/rc-zip/src/lib.rs @@ -16,3 +16,6 @@ pub mod encoding; pub mod error; pub mod fsm; pub mod parse; + +#[cfg(feature = "corpus")] +pub mod corpus; From a5b9cf8f0b13b642d7485ee4cd8a2f405bd81e99 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 18:29:32 +0100 Subject: [PATCH 35/49] Move more tests to the core crate --- Cargo.lock | 1 + 
rc-zip-sync/tests/integration_tests.rs | 106 +------------------------ rc-zip/Cargo.toml | 2 + rc-zip/src/corpus/mod.rs | 49 +++++++++++- rc-zip/src/lib.rs | 2 +- rc-zip/tests/integration_tests.rs | 48 +++++++++++ 6 files changed, 104 insertions(+), 104 deletions(-) create mode 100644 rc-zip/tests/integration_tests.rs diff --git a/Cargo.lock b/Cargo.lock index 4f15a87..aa87289 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -805,6 +805,7 @@ dependencies = [ "oem_cp", "oval", "pretty-hex", + "test-log", "thiserror", "tracing", "winnow", diff --git a/rc-zip-sync/tests/integration_tests.rs b/rc-zip-sync/tests/integration_tests.rs index abed241..32c724c 100644 --- a/rc-zip-sync/tests/integration_tests.rs +++ b/rc-zip-sync/tests/integration_tests.rs @@ -1,11 +1,10 @@ use rc_zip::{ - corpus::{self, zips_dir, FileContent, ZipTest, ZipTestFile}, + corpus::{self, zips_dir, ZipTest, ZipTestFile}, error::Error, - parse::{Archive, EntryContents}, }; -use rc_zip_sync::{HasCursor, ReadZip, SyncArchive, SyncStoredEntry}; +use rc_zip_sync::{HasCursor, ReadZip, SyncArchive}; -use std::{cmp, fs::File}; +use std::fs::File; fn check_case(test: &ZipTest, archive: Result, Error>) { let case_bytes = test.bytes(); @@ -50,56 +49,7 @@ fn check_file(file: &ZipTestFile, archive: &SyncArchive<'_, F>) { .by_name(file.name) .unwrap_or_else(|| panic!("entry {} should exist", file.name)); - let archive_inner: &Archive = archive; - let entry_inner = archive_inner.by_name(file.name).unwrap(); - assert_eq!(entry.name(), entry_inner.name()); - - check_file_against(file, entry) -} - -fn check_file_against(file: &ZipTestFile, entry: SyncStoredEntry<'_, F>) { - if let Some(expected) = file.modified { - assert_eq!( - expected, - entry.modified(), - "entry {} should have modified = {:?}", - entry.name(), - expected - ) - } - - if let Some(mode) = file.mode { - assert_eq!(entry.mode.0 & 0o777, mode); - } - - // I have honestly yet to see a zip file _entry_ with a comment. 
- assert!(entry.comment().is_none()); - - match entry.contents() { - EntryContents::File => { - let actual_bytes = entry.bytes().unwrap(); - - match &file.content { - FileContent::Unchecked => { - // ah well - } - FileContent::Bytes(expected_bytes) => { - // first check length - assert_eq!(actual_bytes.len(), expected_bytes.len()); - assert_eq!(&actual_bytes[..], &expected_bytes[..]) - } - FileContent::File(file_path) => { - let expected_bytes = std::fs::read(zips_dir().join(file_path)).unwrap(); - // first check length - assert_eq!(actual_bytes.len(), expected_bytes.len()); - assert_eq!(&actual_bytes[..], &expected_bytes[..]) - } - } - } - EntryContents::Symlink | EntryContents::Directory => { - assert!(matches!(file.content, FileContent::Unchecked)); - } - } + corpus::check_file_against(file, &entry, &entry.bytes().unwrap()[..]) } #[test_log::test] @@ -124,51 +74,3 @@ fn real_world_files() { check_case(&case, case.bytes().read_zip()) } } - -#[test_log::test] -fn state_machine() { - use rc_zip::fsm::{ArchiveFsm, FsmResult}; - - let cases = corpus::test_cases(); - let case = cases.iter().find(|x| x.name() == "zip64.zip").unwrap(); - let bs = case.bytes(); - let mut fsm = ArchiveFsm::new(bs.len() as u64); - - let archive = 'read_zip: loop { - if let Some(offset) = fsm.wants_read() { - let increment = 128usize; - let offset = offset as usize; - let slice = if offset + increment > bs.len() { - &bs[offset..] 
- } else { - &bs[offset..offset + increment] - }; - - let len = cmp::min(slice.len(), fsm.space().len()); - fsm.space()[..len].copy_from_slice(&slice[..len]); - match len { - 0 => panic!("EOF!"), - read_bytes => { - fsm.fill(read_bytes); - } - } - } - - fsm = match fsm.process() { - Ok(res) => match res { - FsmResult::Continue(fsm) => fsm, - FsmResult::Done(archive) => break 'read_zip archive, - }, - Err(err) => { - panic!("{}", err) - } - } - }; - - let sync_archive = bs.read_zip().unwrap(); - for (se, e) in sync_archive.entries().zip(archive.entries()) { - assert_eq!(se.name(), e.name()); - assert_eq!(se.inner.compressed_size, e.inner.compressed_size); - assert_eq!(se.inner.uncompressed_size, e.inner.uncompressed_size); - } -} diff --git a/rc-zip/Cargo.toml b/rc-zip/Cargo.toml index d3c12ae..cea4b04 100644 --- a/rc-zip/Cargo.toml +++ b/rc-zip/Cargo.toml @@ -34,3 +34,5 @@ miniz_oxide = { version = "0.7.1", optional = true } deflate = ["dep:miniz_oxide"] corpus = [] +[dev-dependencies] +test-log = { version = "0.2.14", default-features = false, features = ["tracing-subscriber", "trace"] } diff --git a/rc-zip/src/corpus/mod.rs b/rc-zip/src/corpus/mod.rs index a20f9e3..7494d09 100644 --- a/rc-zip/src/corpus/mod.rs +++ b/rc-zip/src/corpus/mod.rs @@ -6,7 +6,11 @@ use std::path::PathBuf; use chrono::{DateTime, FixedOffset, TimeZone, Timelike, Utc}; -use crate::{encoding::Encoding, error::Error}; +use crate::{ + encoding::Encoding, + error::Error, + parse::{EntryContents, StoredEntry}, +}; pub enum ZipSource { File(&'static str), @@ -221,3 +225,46 @@ pub fn test_cases() -> Vec { }, ] } + +pub fn check_file_against(file: &ZipTestFile, entry: &StoredEntry, actual_bytes: &[u8]) { + if let Some(expected) = file.modified { + assert_eq!( + expected, + entry.modified(), + "entry {} should have modified = {:?}", + entry.name(), + expected + ) + } + + if let Some(mode) = file.mode { + assert_eq!(entry.mode.0 & 0o777, mode); + } + + // I have honestly yet to see a zip file _entry_ 
with a comment. + assert!(entry.comment().is_none()); + + match entry.contents() { + EntryContents::File => { + match &file.content { + FileContent::Unchecked => { + // ah well + } + FileContent::Bytes(expected_bytes) => { + // first check length + assert_eq!(actual_bytes.len(), expected_bytes.len()); + assert_eq!(actual_bytes, &expected_bytes[..]) + } + FileContent::File(file_path) => { + let expected_bytes = std::fs::read(zips_dir().join(file_path)).unwrap(); + // first check length + assert_eq!(actual_bytes.len(), expected_bytes.len()); + assert_eq!(actual_bytes, &expected_bytes[..]) + } + } + } + EntryContents::Symlink | EntryContents::Directory => { + assert!(matches!(file.content, FileContent::Unchecked)); + } + } +} diff --git a/rc-zip/src/lib.rs b/rc-zip/src/lib.rs index a3fb43e..50fda8c 100644 --- a/rc-zip/src/lib.rs +++ b/rc-zip/src/lib.rs @@ -17,5 +17,5 @@ pub mod error; pub mod fsm; pub mod parse; -#[cfg(feature = "corpus")] +#[cfg(any(test, feature = "corpus"))] pub mod corpus; diff --git a/rc-zip/tests/integration_tests.rs b/rc-zip/tests/integration_tests.rs new file mode 100644 index 0000000..5df72d9 --- /dev/null +++ b/rc-zip/tests/integration_tests.rs @@ -0,0 +1,48 @@ +use std::cmp; + +use rc_zip::{ + corpus, + fsm::{ArchiveFsm, FsmResult}, +}; + +#[test_log::test] +fn state_machine() { + let cases = corpus::test_cases(); + let case = cases.iter().find(|x| x.name() == "zip64.zip").unwrap(); + let bs = case.bytes(); + let mut fsm = ArchiveFsm::new(bs.len() as u64); + + let archive = 'read_zip: loop { + if let Some(offset) = fsm.wants_read() { + let increment = 128usize; + let offset = offset as usize; + let slice = if offset + increment > bs.len() { + &bs[offset..] 
+ } else { + &bs[offset..offset + increment] + }; + + let len = cmp::min(slice.len(), fsm.space().len()); + fsm.space()[..len].copy_from_slice(&slice[..len]); + match len { + 0 => panic!("EOF!"), + read_bytes => { + fsm.fill(read_bytes); + } + } + } + + fsm = match fsm.process() { + Ok(res) => match res { + FsmResult::Continue(fsm) => fsm, + FsmResult::Done(archive) => break 'read_zip archive, + }, + Err(err) => { + panic!("{}", err) + } + } + }; + + // cool, we have the archive + let _ = archive; +} From ba0d6c2871f23523394b5d7963296cf596ca7ac0 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 18:35:36 +0100 Subject: [PATCH 36/49] Move more code into rc-zip --- rc-zip-sync/tests/integration_tests.rs | 57 +++++++------------------- rc-zip/src/corpus/mod.rs | 38 ++++++++++++++++- 2 files changed, 51 insertions(+), 44 deletions(-) diff --git a/rc-zip-sync/tests/integration_tests.rs b/rc-zip-sync/tests/integration_tests.rs index 32c724c..a893189 100644 --- a/rc-zip-sync/tests/integration_tests.rs +++ b/rc-zip-sync/tests/integration_tests.rs @@ -1,55 +1,26 @@ use rc_zip::{ - corpus::{self, zips_dir, ZipTest, ZipTestFile}, + corpus::{self, zips_dir, ZipTest}, error::Error, + parse::Archive, }; use rc_zip_sync::{HasCursor, ReadZip, SyncArchive}; use std::fs::File; fn check_case(test: &ZipTest, archive: Result, Error>) { - let case_bytes = test.bytes(); - - if let Some(expected) = &test.error { - let actual = match archive { - Err(e) => e, - Ok(_) => panic!("should have failed"), - }; - let expected = format!("{:#?}", expected); - let actual = format!("{:#?}", actual); - assert_eq!(expected, actual); - return; - } - let archive = archive.unwrap(); - - assert_eq!(case_bytes.len() as u64, archive.size()); - - if let Some(expected) = test.comment { - assert_eq!(expected, archive.comment().expect("should have comment")) - } - - if let Some(exp_encoding) = test.expected_encoding { - assert_eq!(archive.encoding(), exp_encoding); + corpus::check_case(test, 
archive.as_ref().map(|ar| -> &Archive { ar })); + let archive = match archive { + Ok(archive) => archive, + Err(_) => return, + }; + + for file in &test.files { + let entry = archive + .by_name(file.name) + .unwrap_or_else(|| panic!("entry {} should exist", file.name)); + + corpus::check_file_against(file, &entry, &entry.bytes().unwrap()[..]) } - - assert_eq!( - test.files.len(), - archive.entries().count(), - "{} should have {} entries files", - test.name(), - test.files.len() - ); - - for f in &test.files { - check_file(f, &archive); - } -} - -fn check_file(file: &ZipTestFile, archive: &SyncArchive<'_, F>) { - let entry = archive - .by_name(file.name) - .unwrap_or_else(|| panic!("entry {} should exist", file.name)); - - corpus::check_file_against(file, &entry, &entry.bytes().unwrap()[..]) } #[test_log::test] diff --git a/rc-zip/src/corpus/mod.rs b/rc-zip/src/corpus/mod.rs index 7494d09..f9b79ad 100644 --- a/rc-zip/src/corpus/mod.rs +++ b/rc-zip/src/corpus/mod.rs @@ -9,7 +9,7 @@ use chrono::{DateTime, FixedOffset, TimeZone, Timelike, Utc}; use crate::{ encoding::Encoding, error::Error, - parse::{EntryContents, StoredEntry}, + parse::{Archive, EntryContents, StoredEntry}, }; pub enum ZipSource { @@ -226,6 +226,42 @@ pub fn test_cases() -> Vec { ] } +pub fn check_case(test: &ZipTest, archive: Result<&Archive, &Error>) { + let case_bytes = test.bytes(); + + if let Some(expected) = &test.error { + let actual = match archive { + Err(e) => e, + Ok(_) => panic!("should have failed"), + }; + let expected = format!("{:#?}", expected); + let actual = format!("{:#?}", actual); + assert_eq!(expected, actual); + return; + } + let archive = archive.unwrap(); + + assert_eq!(case_bytes.len() as u64, archive.size()); + + if let Some(expected) = test.comment { + assert_eq!(expected, archive.comment().expect("should have comment")) + } + + if let Some(exp_encoding) = test.expected_encoding { + assert_eq!(archive.encoding(), exp_encoding); + } + + assert_eq!( + test.files.len(), + 
archive.entries().count(), + "{} should have {} entries files", + test.name(), + test.files.len() + ); + + // then each implementation should check individual files +} + pub fn check_file_against(file: &ZipTestFile, entry: &StoredEntry, actual_bytes: &[u8]) { if let Some(expected) = file.modified { assert_eq!( From 8be9ac3907284db1deba1247408fd44bd227e1c6 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 18:49:19 +0100 Subject: [PATCH 37/49] failing deflate tokio test! --- Cargo.lock | 14 ++++ rc-zip-sync/tests/integration_tests.rs | 12 ++- rc-zip-tokio/Cargo.toml | 5 ++ rc-zip-tokio/src/async_read_zip.rs | 21 +++-- rc-zip-tokio/src/lib.rs | 2 +- rc-zip-tokio/tests/integration_tests.rs | 52 ++++++++++++ rc-zip/src/corpus/mod.rs | 104 ++++++++++-------------- rc-zip/tests/integration_tests.rs | 4 +- 8 files changed, 141 insertions(+), 73 deletions(-) create mode 100644 rc-zip-tokio/tests/integration_tests.rs diff --git a/Cargo.lock b/Cargo.lock index aa87289..e714484 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -846,8 +846,10 @@ dependencies = [ "pin-project-lite", "positioned-io", "rc-zip", + "test-log", "tokio", "tracing", + "tracing-subscriber", "winnow", ] @@ -1046,6 +1048,18 @@ dependencies = [ "bytes", "num_cpus", "pin-project-lite", + "tokio-macros", +] + +[[package]] +name = "tokio-macros" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] [[package]] diff --git a/rc-zip-sync/tests/integration_tests.rs b/rc-zip-sync/tests/integration_tests.rs index a893189..3da3967 100644 --- a/rc-zip-sync/tests/integration_tests.rs +++ b/rc-zip-sync/tests/integration_tests.rs @@ -1,5 +1,5 @@ use rc_zip::{ - corpus::{self, zips_dir, ZipTest}, + corpus::{self, zips_dir, Case}, error::Error, parse::Archive, }; @@ -7,7 +7,7 @@ use rc_zip_sync::{HasCursor, ReadZip, SyncArchive}; use std::fs::File; 
-fn check_case(test: &ZipTest, archive: Result, Error>) { +fn check_case(test: &Case, archive: Result, Error>) { corpus::check_case(test, archive.as_ref().map(|ar| -> &Archive { ar })); let archive = match archive { Ok(archive) => archive, @@ -41,7 +41,11 @@ fn read_from_file() { #[test_log::test] fn real_world_files() { for case in corpus::test_cases() { - tracing::trace!("============ testing {}", case.name()); - check_case(&case, case.bytes().read_zip()) + tracing::trace!("============ testing {}", case.name); + + let file = File::open(case.absolute_path()).unwrap(); + let archive = file.read_zip().map_err(Error::from); + + check_case(&case, archive) } } diff --git a/rc-zip-tokio/Cargo.toml b/rc-zip-tokio/Cargo.toml index bea183b..d1db4eb 100644 --- a/rc-zip-tokio/Cargo.toml +++ b/rc-zip-tokio/Cargo.toml @@ -31,3 +31,8 @@ winnow = "0.5.36" default = ["deflate"] deflate = ["rc-zip/deflate"] +[dev-dependencies] +test-log = { version = "0.2.14", default-features = false, features = ["tracing-subscriber", "trace"] } +tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } +rc-zip = { version = "2.0.1", path = "../rc-zip", features = ["corpus"] } +tokio = { version = "1.35.1", features = ["rt", "macros"] } \ No newline at end of file diff --git a/rc-zip-tokio/src/async_read_zip.rs b/rc-zip-tokio/src/async_read_zip.rs index acca9ee..336715a 100644 --- a/rc-zip-tokio/src/async_read_zip.rs +++ b/rc-zip-tokio/src/async_read_zip.rs @@ -1,7 +1,7 @@ use std::{io, ops::Deref, pin::Pin, sync::Arc, task}; use futures::future::BoxFuture; -use positioned_io::{RandomAccessFile, ReadAt}; +use positioned_io::{RandomAccessFile, ReadAt, Size}; use tokio::io::{AsyncRead, AsyncReadExt, ReadBuf}; use rc_zip::{ @@ -15,7 +15,7 @@ use crate::entry_reader::EntryReader; /// A trait for reading something as a zip archive. /// /// See also [AsyncReadZip]. -pub trait AsyncReadZipWithSize { +pub trait ReadZipWithSizeAsync { /// The type of the file to read from. 
type File: HasAsyncCursor; @@ -32,7 +32,7 @@ pub trait AsyncReadZipWithSize { /// This only contains metadata for the archive and its entries. Separate /// readers can be created for arbitraries entries on-demand using /// [AsyncStoredEntry::reader]. -pub trait AsyncReadZip { +pub trait ReadZipAsync { /// The type of the file to read from. type File: HasAsyncCursor; @@ -41,7 +41,7 @@ pub trait AsyncReadZip { async fn read_zip_async(&self) -> Result, Error>; } -impl AsyncReadZipWithSize for F +impl ReadZipWithSizeAsync for F where F: HasAsyncCursor, { @@ -75,7 +75,7 @@ where } } -impl AsyncReadZip for &[u8] { +impl ReadZipAsync for &[u8] { type File = Self; async fn read_zip_async(&self) -> Result, Error> { @@ -83,7 +83,7 @@ impl AsyncReadZip for &[u8] { } } -impl AsyncReadZip for Vec { +impl ReadZipAsync for Vec { type File = Self; async fn read_zip_async(&self) -> Result, Error> { @@ -91,6 +91,15 @@ impl AsyncReadZip for Vec { } } +impl ReadZipAsync for Arc { + type File = Self; + + async fn read_zip_async(&self) -> Result, Error> { + let size = self.size()?.unwrap_or_default(); + self.read_zip_with_size_async(size).await + } +} + /// A zip archive, read asynchronously from a file or other I/O resource. 
pub struct AsyncArchive<'a, F> where diff --git a/rc-zip-tokio/src/lib.rs b/rc-zip-tokio/src/lib.rs index 071204a..8666c73 100644 --- a/rc-zip-tokio/src/lib.rs +++ b/rc-zip-tokio/src/lib.rs @@ -12,6 +12,6 @@ mod entry_reader; // re-exports pub use async_read_zip::{ - AsyncArchive, AsyncReadZip, AsyncReadZipWithSize, AsyncStoredEntry, HasAsyncCursor, + AsyncArchive, AsyncStoredEntry, HasAsyncCursor, ReadZipAsync, ReadZipWithSizeAsync, }; pub use rc_zip; diff --git a/rc-zip-tokio/tests/integration_tests.rs b/rc-zip-tokio/tests/integration_tests.rs new file mode 100644 index 0000000..b4e55e6 --- /dev/null +++ b/rc-zip-tokio/tests/integration_tests.rs @@ -0,0 +1,52 @@ +use positioned_io::RandomAccessFile; +use rc_zip::{ + corpus::{self, zips_dir, Case}, + error::Error, + parse::Archive, +}; +use rc_zip_tokio::{AsyncArchive, HasAsyncCursor, ReadZipAsync}; + +use std::sync::Arc; + +async fn check_case(test: &Case, archive: Result, Error>) { + corpus::check_case(test, archive.as_ref().map(|ar| -> &Archive { ar })); + let archive = match archive { + Ok(archive) => archive, + Err(_) => return, + }; + + for file in &test.files { + let entry = archive + .by_name(file.name) + .unwrap_or_else(|| panic!("entry {} should exist", file.name)); + + corpus::check_file_against(file, &entry, &entry.bytes().await.unwrap()[..]) + } +} + +#[test_log::test(tokio::test)] +async fn read_from_slice() { + let bytes = std::fs::read(zips_dir().join("test.zip")).unwrap(); + let slice = &bytes[..]; + let archive = slice.read_zip_async().await.unwrap(); + assert_eq!(archive.entries().count(), 2); +} + +#[test_log::test(tokio::test)] +async fn read_from_file() { + let f = Arc::new(RandomAccessFile::open(zips_dir().join("test.zip")).unwrap()); + let archive = f.read_zip_async().await.unwrap(); + assert_eq!(archive.entries().count(), 2); +} + +#[test_log::test(tokio::test)] +async fn real_world_files() { + for case in corpus::test_cases() { + tracing::trace!("============ testing {}", case.name); + + 
let file = Arc::new(RandomAccessFile::open(case.absolute_path()).unwrap()); + let archive = file.read_zip_async().await; + + check_case(&case, archive).await + } +} diff --git a/rc-zip/src/corpus/mod.rs b/rc-zip/src/corpus/mod.rs index f9b79ad..10f409c 100644 --- a/rc-zip/src/corpus/mod.rs +++ b/rc-zip/src/corpus/mod.rs @@ -12,23 +12,18 @@ use crate::{ parse::{Archive, EntryContents, StoredEntry}, }; -pub enum ZipSource { - File(&'static str), - Func(&'static str, Box Vec>), -} - -pub struct ZipTest { - pub source: ZipSource, +pub struct Case { + pub name: &'static str, pub expected_encoding: Option, pub comment: Option<&'static str>, - pub files: Vec, + pub files: Vec, pub error: Option, } -impl Default for ZipTest { +impl Default for Case { fn default() -> Self { Self { - source: ZipSource::Func("default.zip", Box::new(|| unreachable!())), + name: "test.zip", expected_encoding: None, comment: None, files: vec![], @@ -37,24 +32,13 @@ impl Default for ZipTest { } } -impl ZipTest { - pub fn name(&self) -> &'static str { - match &self.source { - ZipSource::File(name) => name, - ZipSource::Func(name, _f) => name, - } - } - - // Read source archive from disk - pub fn bytes(&self) -> Vec { - match &self.source { - ZipSource::File(name) => std::fs::read(zips_dir().join(name)).unwrap(), - ZipSource::Func(_name, f) => f(), - } +impl Case { + pub fn absolute_path(&self) -> PathBuf { + zips_dir().join(self.name) } } -pub struct ZipTestFile { +pub struct CaseFile { pub name: &'static str, pub mode: Option, pub modified: Option>, @@ -67,7 +51,7 @@ pub enum FileContent { File(&'static str), } -impl Default for ZipTestFile { +impl Default for CaseFile { fn default() -> Self { Self { name: "default", @@ -104,11 +88,11 @@ fn date( ) } -pub fn test_cases() -> Vec { +pub fn test_cases() -> Vec { vec![ - ZipTest { - source: ZipSource::File("zip64.zip"), - files: vec![ZipTestFile { + Case { + name: "zip64.zip", + files: vec![CaseFile { name: "README", content: FileContent::Bytes( 
"This small file is in ZIP64 format.\n".as_bytes().into(), @@ -118,18 +102,18 @@ pub fn test_cases() -> Vec { }], ..Default::default() }, - ZipTest { - source: ZipSource::File("test.zip"), + Case { + name: "test.zip", comment: Some("This is a zipfile comment."), expected_encoding: Some(Encoding::Utf8), files: vec![ - ZipTestFile { + CaseFile { name: "test.txt", content: FileContent::Bytes("This is a test text file.\n".as_bytes().into()), modified: Some(date((2010, 9, 5), (12, 12, 1), 0, time_zone(10)).unwrap()), mode: Some(0o644), }, - ZipTestFile { + CaseFile { name: "gophercolor16x16.png", content: FileContent::File("gophercolor16x16.png"), modified: Some(date((2010, 9, 5), (15, 52, 58), 0, time_zone(10)).unwrap()), @@ -138,34 +122,34 @@ pub fn test_cases() -> Vec { ], ..Default::default() }, - ZipTest { - source: ZipSource::File("cp-437.zip"), + Case { + name: "cp-437.zip", expected_encoding: Some(Encoding::Cp437), - files: vec![ZipTestFile { + files: vec![CaseFile { name: "français", ..Default::default() }], ..Default::default() }, - ZipTest { - source: ZipSource::File("shift-jis.zip"), + Case { + name: "shift-jis.zip", expected_encoding: Some(Encoding::ShiftJis), files: vec![ - ZipTestFile { + CaseFile { name: "should-be-jis/", ..Default::default() }, - ZipTestFile { + CaseFile { name: "should-be-jis/ot_運命のワルツネぞなぞ小さな楽しみ遊びま.longboi", ..Default::default() }, ], ..Default::default() }, - ZipTest { - source: ZipSource::File("utf8-winrar.zip"), + Case { + name: "utf8-winrar.zip", expected_encoding: Some(Encoding::Utf8), - files: vec![ZipTestFile { + files: vec![CaseFile { name: "世界", content: FileContent::Bytes(vec![]), modified: Some(date((2017, 11, 6), (21, 9, 27), 867862500, time_zone(0)).unwrap()), @@ -174,10 +158,10 @@ pub fn test_cases() -> Vec { ..Default::default() }, #[cfg(feature = "lzma")] - ZipTest { - source: ZipSource::File("found-me-lzma.zip"), + Case { + source: "found-me-lzma.zip", expected_encoding: Some(Encoding::Utf8), - files: vec![ZipTestFile 
{ + files: vec![CaseFile { name: "found-me.txt", content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), modified: Some(date((2024, 1, 26), (16, 14, 35), 46003100, time_zone(0)).unwrap()), @@ -186,10 +170,10 @@ pub fn test_cases() -> Vec { ..Default::default() }, #[cfg(feature = "deflate64")] - ZipTest { - source: ZipSource::File("found-me-deflate64.zip"), + Case { + source: "found-me-deflate64.zip", expected_encoding: Some(Encoding::Utf8), - files: vec![ZipTestFile { + files: vec![CaseFile { name: "found-me.txt", content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), modified: Some(date((2024, 1, 26), (16, 14, 35), 46003100, time_zone(0)).unwrap()), @@ -199,10 +183,10 @@ pub fn test_cases() -> Vec { }, // same with bzip2 #[cfg(feature = "bzip2")] - ZipTest { - source: ZipSource::File("found-me-bzip2.zip"), + Case { + source: "found-me-bzip2.zip", expected_encoding: Some(Encoding::Utf8), - files: vec![ZipTestFile { + files: vec![CaseFile { name: "found-me.txt", content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), modified: Some(date((2024, 1, 26), (16, 14, 35), 46003100, time_zone(0)).unwrap()), @@ -212,10 +196,10 @@ pub fn test_cases() -> Vec { }, // same with zstd #[cfg(feature = "zstd")] - ZipTest { - source: ZipSource::File("found-me-zstd.zip"), + Case { + source: "found-me-zstd.zip", expected_encoding: Some(Encoding::Utf8), - files: vec![ZipTestFile { + files: vec![CaseFile { name: "found-me.txt", content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()), modified: Some(date((2024, 1, 31), (6, 10, 25), 800491400, time_zone(0)).unwrap()), @@ -226,8 +210,8 @@ pub fn test_cases() -> Vec { ] } -pub fn check_case(test: &ZipTest, archive: Result<&Archive, &Error>) { - let case_bytes = test.bytes(); +pub fn check_case(test: &Case, archive: Result<&Archive, &Error>) { + let case_bytes = std::fs::read(test.absolute_path()).unwrap(); if let Some(expected) = &test.error { let actual = match archive 
{ @@ -255,14 +239,14 @@ pub fn check_case(test: &ZipTest, archive: Result<&Archive, &Error>) { test.files.len(), archive.entries().count(), "{} should have {} entries files", - test.name(), + test.name, test.files.len() ); // then each implementation should check individual files } -pub fn check_file_against(file: &ZipTestFile, entry: &StoredEntry, actual_bytes: &[u8]) { +pub fn check_file_against(file: &CaseFile, entry: &StoredEntry, actual_bytes: &[u8]) { if let Some(expected) = file.modified { assert_eq!( expected, diff --git a/rc-zip/tests/integration_tests.rs b/rc-zip/tests/integration_tests.rs index 5df72d9..8078b1c 100644 --- a/rc-zip/tests/integration_tests.rs +++ b/rc-zip/tests/integration_tests.rs @@ -8,8 +8,8 @@ use rc_zip::{ #[test_log::test] fn state_machine() { let cases = corpus::test_cases(); - let case = cases.iter().find(|x| x.name() == "zip64.zip").unwrap(); - let bs = case.bytes(); + let case = cases.iter().find(|x| x.name == "zip64.zip").unwrap(); + let bs = std::fs::read(case.absolute_path()).unwrap(); let mut fsm = ArchiveFsm::new(bs.len() as u64); let archive = 'read_zip: loop { From 4a29af82e3731d4f39d876012752c6f3ab4422b0 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 18:52:44 +0100 Subject: [PATCH 38/49] Async tests pass --- rc-zip-tokio/src/entry_reader.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/rc-zip-tokio/src/entry_reader.rs b/rc-zip-tokio/src/entry_reader.rs index ea370b8..a77e0bf 100644 --- a/rc-zip-tokio/src/entry_reader.rs +++ b/rc-zip-tokio/src/entry_reader.rs @@ -50,10 +50,16 @@ where }; if fsm.wants_read() { - tracing::trace!("fsm wants read"); + tracing::trace!(space_avail = fsm.space().len(), "fsm wants read"); let mut buf = ReadBuf::new(fsm.space()); - futures::ready!(this.rd.poll_read(cx, &mut buf))?; + match this.rd.poll_read(cx, &mut buf) { + task::Poll::Ready(res) => res?, + task::Poll::Pending => { + *this.fsm = Some(fsm); + return task::Poll::Pending; + } + } let n 
= buf.filled().len(); tracing::trace!("read {} bytes", n); From 1dc06521256f594c947871eb1c9590776c00e65b Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 18:59:44 +0100 Subject: [PATCH 39/49] Remove old methods from rc-zip-sync --- .vscode/settings.json | 8 +- Cargo.lock | 116 ------ rc-zip-sync/Cargo.toml | 19 +- rc-zip-sync/src/decoder.rs | 104 ----- rc-zip-sync/src/entry_reader.rs | 57 +++ rc-zip-sync/src/entry_reader/bzip2_dec.rs | 22 -- rc-zip-sync/src/entry_reader/deflate64_dec.rs | 22 -- rc-zip-sync/src/entry_reader/deflate_dec.rs | 22 -- rc-zip-sync/src/entry_reader/lzma_dec.rs | 119 ------ rc-zip-sync/src/entry_reader/mod.rs | 366 ------------------ rc-zip-sync/src/entry_reader/zstd_dec.rs | 22 -- rc-zip-sync/src/lib.rs | 11 - rc-zip-sync/src/read_zip.rs | 10 +- rc-zip-tokio/Cargo.toml | 7 +- rc-zip/Cargo.toml | 6 +- rc-zip/src/corpus/mod.rs | 8 +- 16 files changed, 84 insertions(+), 835 deletions(-) delete mode 100644 rc-zip-sync/src/decoder.rs create mode 100644 rc-zip-sync/src/entry_reader.rs delete mode 100644 rc-zip-sync/src/entry_reader/bzip2_dec.rs delete mode 100644 rc-zip-sync/src/entry_reader/deflate64_dec.rs delete mode 100644 rc-zip-sync/src/entry_reader/deflate_dec.rs delete mode 100644 rc-zip-sync/src/entry_reader/lzma_dec.rs delete mode 100644 rc-zip-sync/src/entry_reader/mod.rs delete mode 100644 rc-zip-sync/src/entry_reader/zstd_dec.rs diff --git a/.vscode/settings.json b/.vscode/settings.json index f843d7b..a36c645 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,7 +1,11 @@ { "rust-analyzer.cargo.features": [ - "rc-zip/deflate", - "rc-zip/corpus" + "rc-zip/corpus", + "deflate", + "deflate64", + "bzip2", + "lzma", + "zstd", ], "rust-analyzer.linkedProjects": [ "./Cargo.toml" diff --git a/Cargo.lock b/Cargo.lock index e714484..29a45d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -128,34 +128,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" -[[package]] -name = "bzip2" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" -dependencies = [ - "bzip2-sys", - "libc", -] - -[[package]] -name = "bzip2-sys" -version = "0.1.11+1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "cc" version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" dependencies = [ - "jobserver", "libc", ] @@ -255,21 +233,6 @@ version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" -[[package]] -name = "crc" -version = "3.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86ec7a15cbe22e59248fc7eadb1907dab5ba09372595da4d73dd805ed4417dfe" -dependencies = [ - "crc-catalog", -] - -[[package]] -name = "crc-catalog" -version = "2.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" - [[package]] name = "crc32fast" version = "1.3.2" @@ -279,12 +242,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "deflate64" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9576c1de19747eb6f5efb6a806c3e836512bbdb17bfedc984ccb0bcc953c8390" - [[package]] name = "encode_unicode" version = "0.3.6" @@ -306,16 +263,6 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" -[[package]] -name = "flate2" -version = "1.0.28" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" -dependencies = [ - "crc32fast", - "miniz_oxide", -] - [[package]] name = "futures" version = "0.3.30" @@ -499,15 +446,6 @@ version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" -[[package]] -name = "jobserver" -version = "0.1.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d" -dependencies = [ - "libc", -] - [[package]] name = "js-sys" version = "0.3.67" @@ -541,16 +479,6 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" -[[package]] -name = "lzma-rs" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e" -dependencies = [ - "byteorder", - "crc", -] - [[package]] name = "matchers" version = "0.1.0" @@ -720,12 +648,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" -[[package]] -name = "pkg-config" -version = "0.3.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2900ede94e305130c13ddd391e0ab7cbaeb783945ae07a279c268cb05109c6cb" - [[package]] name = "portable-atomic" version = "1.6.0" @@ -815,32 +737,23 @@ dependencies = [ name = "rc-zip-sync" version = "2.0.1" dependencies = [ - "bzip2", "cfg-if", "chrono", "clap", - "crc32fast", - "deflate64", - "flate2", "humansize", "indicatif", - "lzma-rs", "oval", "positioned-io", "rc-zip", "test-log", "tracing", "tracing-subscriber", - "winnow", - "zstd", ] [[package]] name = "rc-zip-tokio" version = "2.0.1" 
dependencies = [ - "cfg-if", - "crc32fast", "futures", "oval", "pin-project-lite", @@ -850,7 +763,6 @@ dependencies = [ "tokio", "tracing", "tracing-subscriber", - "winnow", ] [[package]] @@ -1323,31 +1235,3 @@ checksum = "818ce546a11a9986bc24f93d0cdf38a8a1a400f1473ea8c82e59f6e0ffab9249" dependencies = [ "memchr", ] - -[[package]] -name = "zstd" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110" -dependencies = [ - "zstd-safe", -] - -[[package]] -name = "zstd-safe" -version = "7.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43747c7422e2924c11144d5229878b98180ef8b06cca4ab5af37afc8a8d8ea3e" -dependencies = [ - "zstd-sys", -] - -[[package]] -name = "zstd-sys" -version = "2.0.9+zstd.1.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656" -dependencies = [ - "cc", - "pkg-config", -] diff --git a/rc-zip-sync/Cargo.toml b/rc-zip-sync/Cargo.toml index d6e779d..8974690 100644 --- a/rc-zip-sync/Cargo.toml +++ b/rc-zip-sync/Cargo.toml @@ -21,26 +21,18 @@ path = "examples/jean.rs" [dependencies] positioned-io = { version = "0.3.3", optional = true } -flate2 = { version = "1.0.28", optional = true } rc-zip = { version = "2.0.1", path = "../rc-zip" } -lzma-rs = { version = "0.3.0", features = ["stream"], optional = true } -deflate64 = { version = "0.1.7", optional = true } -bzip2 = { version = "0.4.4", optional = true } -zstd = { version = "0.13.0", optional = true } oval = "2.0.0" -crc32fast = "1.3.2" tracing = "0.1.40" -cfg-if = "1.0.0" -winnow = "0.5.36" [features] default = ["file", "deflate"] file = ["positioned-io"] -deflate = ["dep:flate2", "rc-zip/deflate"] -deflate64 = ["dep:deflate64"] -lzma = ["dep:lzma-rs"] -bzip2 = ["dep:bzip2"] -zstd = ["dep:zstd"] +deflate = ["rc-zip/deflate"] +deflate64 = ["rc-zip/deflate64"] +lzma = 
["rc-zip/lzma"] +bzip2 = ["rc-zip/bzip2"] +zstd = ["rc-zip/zstd"] [dev-dependencies] chrono = "0.4.33" @@ -50,3 +42,4 @@ indicatif = "0.17.7" test-log = { version = "0.2.14", default-features = false, features = ["tracing-subscriber", "trace"] } tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } rc-zip = { version = "2.0.1", path = "../rc-zip", features = ["corpus"] } +cfg-if = "1.0.0" diff --git a/rc-zip-sync/src/decoder.rs b/rc-zip-sync/src/decoder.rs deleted file mode 100644 index b826d8d..0000000 --- a/rc-zip-sync/src/decoder.rs +++ /dev/null @@ -1,104 +0,0 @@ -use std::{cmp, io}; - -use oval::Buffer; - -/// Only allows reading a fixed number of bytes from a [oval::Buffer], -/// used for reading the raw (compressed) data for a single zip file entry. -/// It also allows moving out the inner buffer afterwards. -pub(crate) struct RawEntryReader { - remaining: u64, - inner: Buffer, -} - -impl RawEntryReader { - pub(crate) fn new(inner: Buffer, entry_size: u64) -> Self { - Self { - inner, - remaining: entry_size, - } - } - - pub(crate) fn into_inner(self) -> Buffer { - self.inner - } - - pub(crate) fn get_mut(&mut self) -> &mut Buffer { - &mut self.inner - } -} - -pub(crate) trait Decoder: io::Read -where - R: io::Read, -{ - /// Moves the inner reader out of this decoder. - /// self is boxed because decoders are typically used as trait objects. - fn into_inner(self: Box) -> R; - - /// Returns a mutable reference to the inner reader. 
- fn get_mut(&mut self) -> &mut R; -} - -pub(crate) struct StoreDecoder -where - R: io::Read, -{ - inner: R, -} - -impl StoreDecoder -where - R: io::Read, -{ - pub(crate) fn new(inner: R) -> Self { - Self { inner } - } -} - -impl io::Read for StoreDecoder -where - R: io::Read, -{ - fn read(&mut self, buf: &mut [u8]) -> io::Result { - self.inner.read(buf) - } -} - -impl Decoder for StoreDecoder -where - R: io::Read, -{ - fn into_inner(self: Box) -> R { - self.inner - } - - fn get_mut(&mut self) -> &mut R { - &mut self.inner - } -} - -impl io::BufRead for RawEntryReader { - fn fill_buf(&mut self) -> io::Result<&[u8]> { - let max_avail = cmp::min(self.remaining, self.inner.available_data() as u64); - Ok(&self.inner.data()[..max_avail as usize]) - } - - fn consume(&mut self, amt: usize) { - self.remaining -= amt as u64; - Buffer::consume(&mut self.inner, amt); - } -} - -impl io::Read for RawEntryReader { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - let len = cmp::min(buf.len() as u64, self.remaining) as usize; - tracing::trace!(%len, buf_len = buf.len(), remaining = self.remaining, available_data = self.inner.available_data(), available_space = self.inner.available_space(), "computing len"); - - let res = self.inner.read(&mut buf[..len]); - if let Ok(n) = res { - tracing::trace!(%n, "read ok"); - self.remaining -= n as u64; - } - res - } -} diff --git a/rc-zip-sync/src/entry_reader.rs b/rc-zip-sync/src/entry_reader.rs new file mode 100644 index 0000000..8b465fb --- /dev/null +++ b/rc-zip-sync/src/entry_reader.rs @@ -0,0 +1,57 @@ +use rc_zip::{ + fsm::{EntryFsm, FsmResult}, + parse::StoredEntry, +}; +use std::io; + +pub(crate) struct EntryReader +where + R: io::Read, +{ + rd: R, + fsm: Option, +} + +impl EntryReader +where + R: io::Read, +{ + pub(crate) fn new(entry: &StoredEntry, rd: R) -> Self { + Self { + rd, + fsm: Some(EntryFsm::new(entry.method(), entry.inner)), + } + } +} + +impl io::Read for EntryReader +where + R: io::Read, +{ + fn read(&mut self, 
buf: &mut [u8]) -> io::Result { + let mut fsm = match self.fsm.take() { + Some(fsm) => fsm, + None => return Ok(0), + }; + + if fsm.wants_read() { + tracing::trace!("fsm wants read"); + let n = self.rd.read(fsm.space())?; + tracing::trace!("read {} bytes", n); + fsm.fill(n); + } else { + tracing::trace!("fsm does not want read"); + } + + match fsm.process(buf)? { + FsmResult::Continue((fsm, outcome)) => { + self.fsm = Some(fsm); + Ok(outcome.bytes_written) + } + FsmResult::Done(()) => { + // neat! + Ok(0) + } + } + } +} diff --git a/rc-zip-sync/src/entry_reader/bzip2_dec.rs b/rc-zip-sync/src/entry_reader/bzip2_dec.rs deleted file mode 100644 index e808831..0000000 --- a/rc-zip-sync/src/entry_reader/bzip2_dec.rs +++ /dev/null @@ -1,22 +0,0 @@ -use std::io::Read; - -use bzip2::read::BzDecoder; - -use crate::decoder::{Decoder, RawEntryReader}; - -impl Decoder for BzDecoder -where - R: Read, -{ - fn into_inner(self: Box) -> R { - Self::into_inner(*self) - } - - fn get_mut(&mut self) -> &mut R { - Self::get_mut(self) - } -} - -pub(crate) fn mk_decoder(r: RawEntryReader) -> impl Decoder { - BzDecoder::new(r) -} diff --git a/rc-zip-sync/src/entry_reader/deflate64_dec.rs b/rc-zip-sync/src/entry_reader/deflate64_dec.rs deleted file mode 100644 index f9e6d22..0000000 --- a/rc-zip-sync/src/entry_reader/deflate64_dec.rs +++ /dev/null @@ -1,22 +0,0 @@ -use std::io::{BufReader, Read}; - -use deflate64::Deflate64Decoder; - -use crate::decoder::{Decoder, RawEntryReader}; - -impl Decoder for Deflate64Decoder> -where - R: Read, -{ - fn into_inner(self: Box) -> R { - Self::into_inner(*self).into_inner() - } - - fn get_mut(&mut self) -> &mut R { - Self::get_mut(self).get_mut() - } -} - -pub(crate) fn mk_decoder(r: RawEntryReader) -> impl Decoder { - Deflate64Decoder::new(r) -} diff --git a/rc-zip-sync/src/entry_reader/deflate_dec.rs b/rc-zip-sync/src/entry_reader/deflate_dec.rs deleted file mode 100644 index db4e1e9..0000000 --- a/rc-zip-sync/src/entry_reader/deflate_dec.rs +++ 
/dev/null @@ -1,22 +0,0 @@ -use std::io::Read; - -use flate2::read::DeflateDecoder; - -use crate::decoder::{Decoder, RawEntryReader}; - -impl Decoder for DeflateDecoder -where - R: Read, -{ - fn into_inner(self: Box) -> R { - Self::into_inner(*self) - } - - fn get_mut(&mut self) -> &mut R { - Self::get_mut(self) - } -} - -pub(crate) fn mk_decoder(r: RawEntryReader) -> impl Decoder { - DeflateDecoder::new(r) -} diff --git a/rc-zip-sync/src/entry_reader/lzma_dec.rs b/rc-zip-sync/src/entry_reader/lzma_dec.rs deleted file mode 100644 index d42f4aa..0000000 --- a/rc-zip-sync/src/entry_reader/lzma_dec.rs +++ /dev/null @@ -1,119 +0,0 @@ -use lzma_rs::decompress::Stream; -use std::io::{Read, Write}; - -use crate::decoder::{Decoder, RawEntryReader}; - -enum LzmaDecoderState { - Writing(Box>>), - Draining(Vec), - Transition, -} -struct LzmaDecoderAdapter { - input: R, - total_write_count: u64, - state: LzmaDecoderState, - read_buf: Vec, -} - -impl Read for LzmaDecoderAdapter -where - R: Read, -{ - fn read(&mut self, buf: &mut [u8]) -> std::io::Result { - let mut state = LzmaDecoderState::Transition; - std::mem::swap(&mut state, &mut self.state); - - match state { - LzmaDecoderState::Writing(mut stream) => { - let bytes_read = self.input.read(&mut self.read_buf)?; - if bytes_read == 0 { - // we're EOF: finish and move on to draining - self.state = LzmaDecoderState::Draining(stream.finish()?); - // and recurse - return self.read(buf); - } - - if let Err(e) = stream.write_all(&self.read_buf[..bytes_read]) { - if e.kind() == std::io::ErrorKind::WriteZero { - // that's expected actually! from the lzma-rs tests: - // - // A WriteZero error may occur if decompression is finished but there - // are remaining `compressed` bytes to write. - // This is the case when the unpacked size is encoded as unknown but - // provided when decoding. I.e. the 5 or 6 byte end-of-stream marker - // is not read. 
- - // finish and move on to draining - self.state = LzmaDecoderState::Draining(stream.finish()?); - // and recurse - return self.read(buf); - } else { - return Err(e); - } - } - - self.state = LzmaDecoderState::Writing(stream); - } - LzmaDecoderState::Draining(vec) => { - // nothing more to decode, we just need to empty our - // internal buffer - self.state = LzmaDecoderState::Draining(vec); - } - LzmaDecoderState::Transition => { - unreachable!() - } - }; - - let write_buf = match &mut self.state { - LzmaDecoderState::Writing(stream) => stream.get_output_mut().unwrap(), - LzmaDecoderState::Draining(vec) => vec, - LzmaDecoderState::Transition => unreachable!(), - }; - let write_count = std::cmp::min(buf.len(), write_buf.len()); - { - let src_slice = &write_buf[..write_count]; - let dst_slice = &mut buf[..write_count]; - dst_slice.copy_from_slice(src_slice); - } - - // copy the remaining bytes to the front of the buffer - write_buf.rotate_left(write_count); - write_buf.truncate(write_buf.len() - write_count); - - self.total_write_count += write_count as u64; - Ok(write_count) - } -} - -impl Decoder for LzmaDecoderAdapter -where - R: Read, -{ - fn into_inner(self: Box) -> R { - self.input - } - - fn get_mut(&mut self) -> &mut R { - &mut self.input - } -} - -pub(crate) fn mk_decoder( - r: RawEntryReader, - uncompressed_size: u64, -) -> std::io::Result> { - let memlimit = 128 * 1024 * 1024; - let opts = lzma_rs::decompress::Options { - unpacked_size: lzma_rs::decompress::UnpackedSize::UseProvided(Some(uncompressed_size)), - allow_incomplete: false, - memlimit: Some(memlimit), - }; - - let stream = Stream::new_with_options(&opts, vec![]); - Ok(LzmaDecoderAdapter { - input: r, - total_write_count: 0, - state: LzmaDecoderState::Writing(Box::new(stream)), - read_buf: vec![0u8; 8192], - }) -} diff --git a/rc-zip-sync/src/entry_reader/mod.rs b/rc-zip-sync/src/entry_reader/mod.rs deleted file mode 100644 index 41d11b0..0000000 --- a/rc-zip-sync/src/entry_reader/mod.rs +++ 
/dev/null @@ -1,366 +0,0 @@ -#[cfg(feature = "deflate")] -mod deflate_dec; - -#[cfg(feature = "deflate64")] -mod deflate64_dec; - -#[cfg(feature = "bzip2")] -mod bzip2_dec; - -#[cfg(feature = "lzma")] -mod lzma_dec; - -#[cfg(feature = "zstd")] -mod zstd_dec; - -use cfg_if::cfg_if; -use oval::Buffer; -use rc_zip::{ - error::{Error, FormatError}, - fsm::{EntryFsm, FsmResult}, - parse::{DataDescriptorRecord, LocalFileHeaderRecord, Method, StoredEntry, StoredEntryInner}, -}; -use std::io; -use tracing::trace; -use winnow::{ - error::ErrMode, - stream::{AsBytes, Offset}, - Parser, Partial, -}; - -use crate::decoder::{Decoder, RawEntryReader, StoreDecoder}; - -struct EntryReadMetrics { - uncompressed_size: u64, - crc32: u32, -} - -// FIXME: move this state machine to rc-zip -#[derive(Default)] -enum State { - ReadLocalHeader { - buffer: Buffer, - }, - ReadData { - hasher: crc32fast::Hasher, - uncompressed_size: u64, - header: LocalFileHeaderRecord, - decoder: Box>, - }, - ReadDataDescriptor { - metrics: EntryReadMetrics, - header: LocalFileHeaderRecord, - buffer: Buffer, - }, - Validate { - metrics: EntryReadMetrics, - header: LocalFileHeaderRecord, - descriptor: Option, - }, - Done, - #[default] - Transitioning, -} - -pub(crate) struct EntryReader -where - R: io::Read, -{ - rd: R, - eof: bool, - state: State, - inner: StoredEntryInner, - method: Method, -} - -impl io::Read for EntryReader -where - R: io::Read, -{ - fn read(&mut self, buf: &mut [u8]) -> io::Result { - use State as S; - match self.state { - S::ReadLocalHeader { ref mut buffer } => { - let read_bytes = self.rd.read(buffer.space())?; - if read_bytes == 0 { - // we should have read the local header by now - return Err(io::ErrorKind::UnexpectedEof.into()); - } - buffer.fill(read_bytes); - - let mut input = Partial::new(buffer.data()); - match LocalFileHeaderRecord::parser.parse_next(&mut input) { - Ok(header) => { - buffer.consume(input.as_bytes().offset_from(&buffer.data())); - - trace!("local file header: 
{:#?}", header); - transition!(self.state => (S::ReadLocalHeader { buffer }) { - let decoder = self.get_decoder(RawEntryReader::new(buffer, self.inner.compressed_size))?; - - S::ReadData { - hasher: crc32fast::Hasher::new(), - uncompressed_size: 0, - decoder, - header, - } - }); - self.read(buf) - } - Err(ErrMode::Incomplete(_)) => self.read(buf), - Err(_e) => Err(Error::Format(FormatError::InvalidLocalHeader).into()), - } - } - S::ReadData { - ref mut uncompressed_size, - ref mut decoder, - ref mut hasher, - .. - } => { - { - let buffer = decoder.get_mut().get_mut(); - if !self.eof && buffer.available_data() == 0 { - if buffer.available_space() == 0 { - buffer.shift(); - } - - match self.rd.read(buffer.space())? { - 0 => { - self.eof = true; - } - n => { - buffer.fill(n); - } - } - } - } - match decoder.read(buf)? { - 0 => { - transition!(self.state => (S::ReadData { decoder, header, hasher, uncompressed_size, .. }) { - let limited_reader = decoder.into_inner(); - let buffer = limited_reader.into_inner(); - let metrics = EntryReadMetrics { - crc32: hasher.finalize(), - uncompressed_size, - }; - if header.has_data_descriptor() { - trace!("will read data descriptor (flags = {:x})", header.flags); - S::ReadDataDescriptor { metrics, buffer, header } - } else { - trace!("no data descriptor to read"); - S::Validate { metrics, header, descriptor: None } - } - }); - self.read(buf) - } - n => { - *uncompressed_size += n as u64; - hasher.update(&buf[..n]); - Ok(n) - } - } - } - S::ReadDataDescriptor { ref mut buffer, .. 
} => { - trace!( - "read data descriptor, avail data = {}, avail space = {}", - buffer.available_data(), - buffer.available_space() - ); - - let mut input = Partial::new(buffer.data()); - match DataDescriptorRecord::mk_parser(self.inner.is_zip64).parse_next(&mut input) { - Ok(descriptor) => { - buffer.consume(input.as_bytes().offset_from(&buffer.data())); - trace!("data descriptor = {:#?}", descriptor); - transition!(self.state => (S::ReadDataDescriptor { metrics, header, .. }) { - S::Validate { metrics, header, descriptor: Some(descriptor) } - }); - self.read(buf) - } - Err(ErrMode::Incomplete(_)) => { - let n = self.rd.read(buffer.space())?; - if n == 0 { - return Err(io::ErrorKind::UnexpectedEof.into()); - } - buffer.fill(n); - trace!("filled {}", n); - - self.read(buf) - } - Err(_e) => Err(Error::Format(FormatError::InvalidLocalHeader).into()), - } - } - S::Validate { - ref metrics, - ref header, - ref descriptor, - } => { - let expected_crc32 = if self.inner.crc32 != 0 { - self.inner.crc32 - } else if let Some(descriptor) = descriptor.as_ref() { - descriptor.crc32 - } else { - header.crc32 - }; - - let expected_size = if self.inner.uncompressed_size != 0 { - self.inner.uncompressed_size - } else if let Some(descriptor) = descriptor.as_ref() { - descriptor.uncompressed_size - } else { - header.uncompressed_size as u64 - }; - - if expected_size != metrics.uncompressed_size { - return Err(Error::Format(FormatError::WrongSize { - expected: expected_size, - actual: metrics.uncompressed_size, - }) - .into()); - } - - if expected_crc32 != 0 && expected_crc32 != metrics.crc32 { - return Err(Error::Format(FormatError::WrongChecksum { - expected: expected_crc32, - actual: metrics.crc32, - }) - .into()); - } - - self.state = S::Done; - self.read(buf) - } - S::Done => Ok(0), - S::Transitioning => unreachable!(), - } - } -} - -impl EntryReader -where - R: io::Read, -{ - const DEFAULT_BUFFER_SIZE: usize = 256 * 1024; - - pub(crate) fn new(entry: &StoredEntry, rd: R) -> Self 
{ - Self { - rd, - eof: false, - state: State::ReadLocalHeader { - buffer: Buffer::with_capacity(Self::DEFAULT_BUFFER_SIZE), - }, - method: entry.method(), - inner: entry.inner, - } - } - - fn get_decoder( - &self, - raw_r: RawEntryReader, - ) -> Result>, Error> { - let decoder: Box> = match self.method { - Method::Store => Box::new(StoreDecoder::new(raw_r)), - Method::Deflate => { - cfg_if! { - if #[cfg(feature = "deflate")] { - Box::new(deflate_dec::mk_decoder(raw_r)) - } else { - return Err(Error::method_not_enabled(self.method)); - } - } - } - Method::Deflate64 => { - cfg_if! { - if #[cfg(feature = "deflate64")] { - Box::new(deflate64_dec::mk_decoder(raw_r)) - } else { - return Err(Error::method_not_enabled(self.method)); - } - } - } - Method::Lzma => { - cfg_if! { - if #[cfg(feature = "lzma")] { - Box::new(lzma_dec::mk_decoder(raw_r, self.inner.uncompressed_size)?) - } else { - return Err(Error::method_not_enabled(self.method)); - } - } - } - Method::Bzip2 => { - cfg_if! { - if #[cfg(feature = "bzip2")] { - Box::new(bzip2_dec::mk_decoder(raw_r)) - } else { - return Err(Error::method_not_enabled(self.method)); - } - } - } - Method::Zstd => { - cfg_if! { - if #[cfg(feature = "zstd")] { - Box::new(zstd_dec::mk_decoder(raw_r)?) 
- } else { - return Err(Error::method_not_enabled(self.method)); - } - } - } - method => { - return Err(Error::method_not_supported(method)); - } - }; - - Ok(decoder) - } -} - -pub(crate) struct FsmEntryReader -where - R: io::Read, -{ - rd: R, - fsm: Option, -} - -impl FsmEntryReader -where - R: io::Read, -{ - pub(crate) fn new(entry: &StoredEntry, rd: R) -> Self { - Self { - rd, - fsm: Some(EntryFsm::new(entry.method(), entry.inner)), - } - } -} - -impl io::Read for FsmEntryReader -where - R: io::Read, -{ - fn read(&mut self, buf: &mut [u8]) -> io::Result { - let mut fsm = match self.fsm.take() { - Some(fsm) => fsm, - None => return Ok(0), - }; - - if fsm.wants_read() { - tracing::trace!("fsm wants read"); - let n = self.rd.read(fsm.space())?; - tracing::trace!("read {} bytes", n); - fsm.fill(n); - } else { - tracing::trace!("fsm does not want read"); - } - - match fsm.process(buf)? { - FsmResult::Continue((fsm, outcome)) => { - self.fsm = Some(fsm); - Ok(outcome.bytes_written) - } - FsmResult::Done(()) => { - // neat! - Ok(0) - } - } - } -} diff --git a/rc-zip-sync/src/entry_reader/zstd_dec.rs b/rc-zip-sync/src/entry_reader/zstd_dec.rs deleted file mode 100644 index bc0df63..0000000 --- a/rc-zip-sync/src/entry_reader/zstd_dec.rs +++ /dev/null @@ -1,22 +0,0 @@ -use std::io::{BufRead, Read}; - -use zstd::stream::Decoder as ZstdDecoder; - -use crate::decoder::{Decoder, RawEntryReader}; - -impl Decoder for ZstdDecoder<'static, R> -where - R: Read + BufRead, -{ - fn into_inner(self: Box) -> R { - Self::finish(*self) - } - - fn get_mut(&mut self) -> &mut R { - Self::get_mut(self) - } -} - -pub(crate) fn mk_decoder(r: RawEntryReader) -> std::io::Result> { - ZstdDecoder::with_buffer(r) -} diff --git a/rc-zip-sync/src/lib.rs b/rc-zip-sync/src/lib.rs index 4d963c5..304a1dd 100644 --- a/rc-zip-sync/src/lib.rs +++ b/rc-zip-sync/src/lib.rs @@ -7,17 +7,6 @@ #![warn(missing_docs)] -macro_rules! 
transition { - ($state: expr => ($pattern: pat) $body: expr) => { - $state = if let $pattern = std::mem::take(&mut $state) { - $body - } else { - unreachable!() - }; - }; -} - -mod decoder; mod entry_reader; mod read_zip; diff --git a/rc-zip-sync/src/read_zip.rs b/rc-zip-sync/src/read_zip.rs index fa234ea..3089090 100644 --- a/rc-zip-sync/src/read_zip.rs +++ b/rc-zip-sync/src/read_zip.rs @@ -4,7 +4,7 @@ use rc_zip::{ parse::{Archive, StoredEntry}, }; -use crate::entry_reader::{EntryReader, FsmEntryReader}; +use crate::entry_reader::EntryReader; use std::{io::Read, ops::Deref}; /// A trait for reading something as a zip archive @@ -153,19 +153,13 @@ where { /// Returns a reader for the entry. pub fn reader(&self) -> impl Read + 'a { - // FIXME: replace with `fsm_reader`` EntryReader::new(self.entry, self.file.cursor_at(self.entry.header_offset)) } - /// Returns an fsm-based reader for the entry - pub fn fsm_reader(&self) -> impl Read + 'a { - FsmEntryReader::new(self.entry, self.file.cursor_at(self.entry.header_offset)) - } - /// Reads the entire entry into a vector. 
pub fn bytes(&self) -> std::io::Result> { let mut v = Vec::new(); - self.fsm_reader().read_to_end(&mut v)?; + self.reader().read_to_end(&mut v)?; Ok(v) } } diff --git a/rc-zip-tokio/Cargo.toml b/rc-zip-tokio/Cargo.toml index d1db4eb..ee65588 100644 --- a/rc-zip-tokio/Cargo.toml +++ b/rc-zip-tokio/Cargo.toml @@ -22,14 +22,15 @@ tokio = { version = "1.35.1", features = ["fs", "io-util", "rt-multi-thread"] } futures = { version = "0.3.30" } pin-project-lite = { version = "0.2.13" } oval = "2.0.0" -crc32fast = "1.3.2" tracing = "0.1.40" -cfg-if = "1.0.0" -winnow = "0.5.36" [features] default = ["deflate"] deflate = ["rc-zip/deflate"] +deflate64 = ["rc-zip/deflate64"] +lzma = ["rc-zip/lzma"] +bzip2 = ["rc-zip/bzip2"] +zstd = ["rc-zip/zstd"] [dev-dependencies] test-log = { version = "0.2.14", default-features = false, features = ["tracing-subscriber", "trace"] } diff --git a/rc-zip/Cargo.toml b/rc-zip/Cargo.toml index cea4b04..284a65f 100644 --- a/rc-zip/Cargo.toml +++ b/rc-zip/Cargo.toml @@ -31,8 +31,12 @@ crc32fast = "1.3.2" miniz_oxide = { version = "0.7.1", optional = true } [features] -deflate = ["dep:miniz_oxide"] corpus = [] +deflate = ["dep:miniz_oxide"] +deflate64 = [] +bzip2 = [] +lzma = [] +zstd = [] [dev-dependencies] test-log = { version = "0.2.14", default-features = false, features = ["tracing-subscriber", "trace"] } diff --git a/rc-zip/src/corpus/mod.rs b/rc-zip/src/corpus/mod.rs index 10f409c..2cc8cc9 100644 --- a/rc-zip/src/corpus/mod.rs +++ b/rc-zip/src/corpus/mod.rs @@ -159,7 +159,7 @@ pub fn test_cases() -> Vec { }, #[cfg(feature = "lzma")] Case { - source: "found-me-lzma.zip", + name: "found-me-lzma.zip", expected_encoding: Some(Encoding::Utf8), files: vec![CaseFile { name: "found-me.txt", @@ -171,7 +171,7 @@ pub fn test_cases() -> Vec { }, #[cfg(feature = "deflate64")] Case { - source: "found-me-deflate64.zip", + name: "found-me-deflate64.zip", expected_encoding: Some(Encoding::Utf8), files: vec![CaseFile { name: "found-me.txt", @@ -184,7 +184,7 @@ 
pub fn test_cases() -> Vec { // same with bzip2 #[cfg(feature = "bzip2")] Case { - source: "found-me-bzip2.zip", + name: "found-me-bzip2.zip", expected_encoding: Some(Encoding::Utf8), files: vec![CaseFile { name: "found-me.txt", @@ -197,7 +197,7 @@ pub fn test_cases() -> Vec { // same with zstd #[cfg(feature = "zstd")] Case { - source: "found-me-zstd.zip", + name: "found-me-zstd.zip", expected_encoding: Some(Encoding::Utf8), files: vec![CaseFile { name: "found-me.txt", From 12de2eae00b55b10de565fec8a193df309d40346 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 19:10:57 +0100 Subject: [PATCH 40/49] Re-add bzip2 support --- Cargo.lock | 28 ++++++++++ rc-zip/Cargo.toml | 3 +- rc-zip/src/fsm/entry/bzip2_dec.rs | 67 +++++++++++++++++++++++ rc-zip/src/fsm/entry/deflate_dec.rs | 8 +-- rc-zip/src/fsm/{entry.rs => entry/mod.rs} | 19 ++++++- 5 files changed, 119 insertions(+), 6 deletions(-) create mode 100644 rc-zip/src/fsm/entry/bzip2_dec.rs rename rc-zip/src/fsm/{entry.rs => entry/mod.rs} (95%) diff --git a/Cargo.lock b/Cargo.lock index 29a45d4..b2cbdbf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -128,6 +128,27 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" +[[package]] +name = "bzip2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "cc" version = "1.0.83" @@ -648,6 +669,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkg-config" +version = "0.3.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2900ede94e305130c13ddd391e0ab7cbaeb783945ae07a279c268cb05109c6cb" + [[package]] name = "portable-atomic" version = "1.6.0" @@ -717,6 +744,7 @@ checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" name = "rc-zip" version = "2.0.1" dependencies = [ + "bzip2", "cfg-if", "chardetng", "chrono", diff --git a/rc-zip/Cargo.toml b/rc-zip/Cargo.toml index 284a65f..df72cb6 100644 --- a/rc-zip/Cargo.toml +++ b/rc-zip/Cargo.toml @@ -29,12 +29,13 @@ num_enum = "0.7.2" cfg-if = "1.0.0" crc32fast = "1.3.2" miniz_oxide = { version = "0.7.1", optional = true } +bzip2 = { version = "0.4.4", optional = true } [features] corpus = [] deflate = ["dep:miniz_oxide"] deflate64 = [] -bzip2 = [] +bzip2 = ["dep:bzip2"] lzma = [] zstd = [] diff --git a/rc-zip/src/fsm/entry/bzip2_dec.rs b/rc-zip/src/fsm/entry/bzip2_dec.rs new file mode 100644 index 0000000..6abad7c --- /dev/null +++ b/rc-zip/src/fsm/entry/bzip2_dec.rs @@ -0,0 +1,67 @@ +use crate::{error::Error, parse::Method}; + +use super::{DecompressOutcome, Decompressor, HasMoreInput}; + +pub(crate) struct Bzip2Dec { + inner: bzip2::Decompress, + eof: bool, +} + +impl Default for Bzip2Dec { + fn default() -> Self { + // don't use the 'small' alternative decompression algorithm + let small = false; + Self { + inner: bzip2::Decompress::new(small), + eof: false, + } + } +} + +impl Decompressor for Bzip2Dec { + fn decompress( + &mut self, + in_buf: &[u8], + out: &mut [u8], + _has_more_input: HasMoreInput, + ) -> Result { + tracing::trace!( + in_buf_len = in_buf.len(), + out_len = out.len(), + total_in = self.inner.total_in(), + total_out = self.inner.total_out(), + "Bzip2Dec::decompress", + ); + + if self.eof { + return Ok(DecompressOutcome { + bytes_written: 0, + bytes_read: 0, + }); + } + + let before_in = 
self.inner.total_in(); + let before_out = self.inner.total_out(); + + match self.inner.decompress(in_buf, out) { + Ok(status) => { + tracing::trace!("status: {:?}", status); + if status == bzip2::Status::StreamEnd { + self.eof = true; + } + } + Err(e) => { + return Err(Error::Decompression { + method: Method::Bzip2, + msg: e.to_string(), + }) + } + }; + + let outcome = DecompressOutcome { + bytes_written: (self.inner.total_out() - before_out) as usize, + bytes_read: (self.inner.total_in() - before_in) as usize, + }; + Ok(outcome) + } +} diff --git a/rc-zip/src/fsm/entry/deflate_dec.rs b/rc-zip/src/fsm/entry/deflate_dec.rs index 427d592..a87f9d8 100644 --- a/rc-zip/src/fsm/entry/deflate_dec.rs +++ b/rc-zip/src/fsm/entry/deflate_dec.rs @@ -52,19 +52,19 @@ impl Decompressor for DeflateDec { fn decompress( &mut self, in_buf: &[u8], - out_buf: &mut [u8], + out: &mut [u8], has_more_input: HasMoreInput, ) -> Result { tracing::trace!( in_buf_len = in_buf.len(), - out_buf_len = out_buf.len(), + out_len = out.len(), remain_in_internal_buffer = self.remain_in_internal_buffer, out_pos = self.out_pos, "DeflateDec::decompress", ); let mut outcome: DecompressOutcome = Default::default(); - self.copy_to_outbuf(out_buf, &mut outcome); + self.copy_to_outbuf(out, &mut outcome); if outcome.bytes_written > 0 { tracing::trace!( "returning {} bytes from internal buffer", @@ -115,7 +115,7 @@ impl Decompressor for DeflateDec { }, } - self.copy_to_outbuf(out_buf, &mut outcome); + self.copy_to_outbuf(out, &mut outcome); Ok(outcome) } } diff --git a/rc-zip/src/fsm/entry.rs b/rc-zip/src/fsm/entry/mod.rs similarity index 95% rename from rc-zip/src/fsm/entry.rs rename to rc-zip/src/fsm/entry/mod.rs index 0f2cbfc..187c4f4 100644 --- a/rc-zip/src/fsm/entry.rs +++ b/rc-zip/src/fsm/entry/mod.rs @@ -13,6 +13,9 @@ mod store_dec; #[cfg(feature = "deflate")] mod deflate_dec; +#[cfg(feature = "bzip2")] +mod bzip2_dec; + use crate::{ error::{Error, FormatError, UnsupportedError}, 
parse::{DataDescriptorRecord, LocalFileHeaderRecord, Method, StoredEntryInner}, @@ -290,6 +293,8 @@ enum AnyDecompressor { Store(store_dec::StoreDec), #[cfg(feature = "deflate")] Deflate(Box), + #[cfg(feature = "bzip2")] + Bzip2(bzip2_dec::Bzip2Dec), } #[derive(Default, Debug)] @@ -310,7 +315,7 @@ trait Decompressor { fn decompress( &mut self, in_buf: &[u8], - out_buf: &mut [u8], + out: &mut [u8], has_more_input: HasMoreInput, ) -> Result; } @@ -328,6 +333,14 @@ impl AnyDecompressor { return Err(err); } + #[cfg(feature = "bzip2")] + Method::Bzip2 => Self::Bzip2(Default::default()), + #[cfg(not(feature = "bzip2"))] + Method::Bzip2 => { + let err = Error::Unsupported(UnsupportedError::MethodNotEnabled(method)); + return Err(err); + } + _ => { let err = Error::Unsupported(UnsupportedError::MethodNotSupported(method)); return Err(err); @@ -335,7 +348,9 @@ impl AnyDecompressor { }; Ok(dec) } +} +impl Decompressor for AnyDecompressor { #[inline] fn decompress( &mut self, @@ -348,6 +363,8 @@ impl AnyDecompressor { Self::Store(dec) => dec.decompress(in_buf, out, has_more_input), #[cfg(feature = "deflate")] Self::Deflate(dec) => dec.decompress(in_buf, out, has_more_input), + #[cfg(feature = "bzip2")] + Self::Bzip2(dec) => dec.decompress(in_buf, out, has_more_input), } } } From c9525cb9602111fe555b8e305e3d661e85c26e5d Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 19:12:55 +0100 Subject: [PATCH 41/49] Stub LZMA --- Cargo.lock | 26 ++++++++++++++++++++++++++ rc-zip/Cargo.toml | 3 ++- rc-zip/src/fsm/entry/lzma_dec.rs | 22 ++++++++++++++++++++++ rc-zip/src/fsm/entry/mod.rs | 15 +++++++++++++++ 4 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 rc-zip/src/fsm/entry/lzma_dec.rs diff --git a/Cargo.lock b/Cargo.lock index b2cbdbf..64dfed0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -254,6 +254,21 @@ version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +[[package]] +name = "crc" +version = "3.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86ec7a15cbe22e59248fc7eadb1907dab5ba09372595da4d73dd805ed4417dfe" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + [[package]] name = "crc32fast" version = "1.3.2" @@ -500,6 +515,16 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +[[package]] +name = "lzma-rs" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e" +dependencies = [ + "byteorder", + "crc", +] + [[package]] name = "matchers" version = "0.1.0" @@ -750,6 +775,7 @@ dependencies = [ "chrono", "crc32fast", "encoding_rs", + "lzma-rs", "miniz_oxide", "num_enum", "oem_cp", diff --git a/rc-zip/Cargo.toml b/rc-zip/Cargo.toml index df72cb6..9f482f0 100644 --- a/rc-zip/Cargo.toml +++ b/rc-zip/Cargo.toml @@ -30,13 +30,14 @@ cfg-if = "1.0.0" crc32fast = "1.3.2" miniz_oxide = { version = "0.7.1", optional = true } bzip2 = { version = "0.4.4", optional = true } +lzma-rs = { version = "0.3.0", optional = true } [features] corpus = [] deflate = ["dep:miniz_oxide"] deflate64 = [] bzip2 = ["dep:bzip2"] -lzma = [] +lzma = ["dep:lzma-rs"] zstd = [] [dev-dependencies] diff --git a/rc-zip/src/fsm/entry/lzma_dec.rs b/rc-zip/src/fsm/entry/lzma_dec.rs new file mode 100644 index 0000000..8c80e65 --- /dev/null +++ b/rc-zip/src/fsm/entry/lzma_dec.rs @@ -0,0 +1,22 @@ +use crate::{error::Error, parse::Method}; + +use super::{DecompressOutcome, Decompressor, HasMoreInput}; + +pub(crate) struct LzmaDec {} + +impl Default for LzmaDec { 
+ fn default() -> Self { + Self {} + } +} + +impl Decompressor for LzmaDec { + fn decompress( + &mut self, + in_buf: &[u8], + out: &mut [u8], + _has_more_input: HasMoreInput, + ) -> Result { + todo!() + } +} diff --git a/rc-zip/src/fsm/entry/mod.rs b/rc-zip/src/fsm/entry/mod.rs index 187c4f4..bc9681a 100644 --- a/rc-zip/src/fsm/entry/mod.rs +++ b/rc-zip/src/fsm/entry/mod.rs @@ -16,6 +16,9 @@ mod deflate_dec; #[cfg(feature = "bzip2")] mod bzip2_dec; +#[cfg(feature = "lzma")] +mod lzma_dec; + use crate::{ error::{Error, FormatError, UnsupportedError}, parse::{DataDescriptorRecord, LocalFileHeaderRecord, Method, StoredEntryInner}, @@ -295,6 +298,8 @@ enum AnyDecompressor { Deflate(Box), #[cfg(feature = "bzip2")] Bzip2(bzip2_dec::Bzip2Dec), + #[cfg(feature = "lzma")] + Lzma(lzma_dec::LzmaDec), } #[derive(Default, Debug)] @@ -341,6 +346,14 @@ impl AnyDecompressor { return Err(err); } + #[cfg(feature = "lzma")] + Method::Lzma => Self::Lzma(Default::default()), + #[cfg(not(feature = "lzma"))] + Method::Lzma => { + let err = Error::Unsupported(UnsupportedError::MethodNotEnabled(method)); + return Err(err); + } + _ => { let err = Error::Unsupported(UnsupportedError::MethodNotSupported(method)); return Err(err); @@ -365,6 +378,8 @@ impl Decompressor for AnyDecompressor { Self::Deflate(dec) => dec.decompress(in_buf, out, has_more_input), #[cfg(feature = "bzip2")] Self::Bzip2(dec) => dec.decompress(in_buf, out, has_more_input), + #[cfg(feature = "lzma")] + Self::Lzma(dec) => dec.decompress(in_buf, out, has_more_input), } } } From a4758152fdc30c5de966baea080ced39bfaf7774 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 19:33:49 +0100 Subject: [PATCH 42/49] Confused about lzma --- rc-zip-sync/src/entry_reader.rs | 9 +++- rc-zip-tokio/src/entry_reader.rs | 8 +++- rc-zip/Cargo.toml | 2 +- rc-zip/src/fsm/entry/deflate_dec.rs | 14 +++--- rc-zip/src/fsm/entry/lzma_dec.rs | 73 +++++++++++++++++++++++++++-- rc-zip/src/fsm/entry/mod.rs | 22 +++++++-- 6 files changed, 108 
insertions(+), 20 deletions(-) diff --git a/rc-zip-sync/src/entry_reader.rs b/rc-zip-sync/src/entry_reader.rs index 8b465fb..a48a6df 100644 --- a/rc-zip-sync/src/entry_reader.rs +++ b/rc-zip-sync/src/entry_reader.rs @@ -37,7 +37,7 @@ where if fsm.wants_read() { tracing::trace!("fsm wants read"); let n = self.rd.read(fsm.space())?; - tracing::trace!("read {} bytes", n); + tracing::trace!("giving fsm {} bytes", n); fsm.fill(n); } else { tracing::trace!("fsm does not want read"); @@ -46,7 +46,12 @@ where match fsm.process(buf)? { FsmResult::Continue((fsm, outcome)) => { self.fsm = Some(fsm); - Ok(outcome.bytes_written) + if outcome.bytes_written > 0 { + Ok(outcome.bytes_written) + } else { + // loop, it happens + self.read(buf) + } } FsmResult::Done(()) => { // neat! diff --git a/rc-zip-tokio/src/entry_reader.rs b/rc-zip-tokio/src/entry_reader.rs index a77e0bf..aea0405 100644 --- a/rc-zip-tokio/src/entry_reader.rs +++ b/rc-zip-tokio/src/entry_reader.rs @@ -71,7 +71,13 @@ where match fsm.process(buf.initialize_unfilled())? { FsmResult::Continue((fsm, outcome)) => { *this.fsm = Some(fsm); - buf.advance(outcome.bytes_written); + if outcome.bytes_written > 0 { + tracing::trace!("wrote {} bytes", outcome.bytes_written); + buf.advance(outcome.bytes_written); + } else { + // loop, it happens + return self.poll_read(cx, buf); + } } FsmResult::Done(()) => { // neat! 
diff --git a/rc-zip/Cargo.toml b/rc-zip/Cargo.toml index 9f482f0..bfde897 100644 --- a/rc-zip/Cargo.toml +++ b/rc-zip/Cargo.toml @@ -30,7 +30,7 @@ cfg-if = "1.0.0" crc32fast = "1.3.2" miniz_oxide = { version = "0.7.1", optional = true } bzip2 = { version = "0.4.4", optional = true } -lzma-rs = { version = "0.3.0", optional = true } +lzma-rs = { version = "0.3.0", optional = true, features = ["stream"] } [features] corpus = [] diff --git a/rc-zip/src/fsm/entry/deflate_dec.rs b/rc-zip/src/fsm/entry/deflate_dec.rs index a87f9d8..48b2b22 100644 --- a/rc-zip/src/fsm/entry/deflate_dec.rs +++ b/rc-zip/src/fsm/entry/deflate_dec.rs @@ -64,7 +64,7 @@ impl Decompressor for DeflateDec { ); let mut outcome: DecompressOutcome = Default::default(); - self.copy_to_outbuf(out, &mut outcome); + self.copy_to_out(out, &mut outcome); if outcome.bytes_written > 0 { tracing::trace!( "returning {} bytes from internal buffer", @@ -115,7 +115,7 @@ impl Decompressor for DeflateDec { }, } - self.copy_to_outbuf(out, &mut outcome); + self.copy_to_out(out, &mut outcome); Ok(outcome) } } @@ -123,22 +123,22 @@ impl Decompressor for DeflateDec { impl DeflateDec { const INTERNAL_BUFFER_LENGTH: usize = 64 * 1024; - fn copy_to_outbuf(&mut self, mut out_buf: &mut [u8], outcome: &mut DecompressOutcome) { + fn copy_to_out(&mut self, mut out: &mut [u8], outcome: &mut DecompressOutcome) { // as long as there's room in out_buf and we have remaining data in the // internal buffer, copy from internal_buffer wrapping as needed, // decreasing self.remain_in_internal_buffer and increasing self.out_pos // and outcome.bytes_written - while !out_buf.is_empty() && self.remain_in_internal_buffer > 0 { - let copy_len = cmp::min(self.remain_in_internal_buffer, out_buf.len()); + while !out.is_empty() && self.remain_in_internal_buffer > 0 { + let copy_len = cmp::min(self.remain_in_internal_buffer, out.len()); // take wrapping into account let copy_len = cmp::min(copy_len, self.internal_buffer.len() - self.out_pos); 
trace!("copying {} bytes from internal buffer to out_buf", copy_len); - out_buf[..copy_len].copy_from_slice(&self.internal_buffer[self.out_pos..][..copy_len]); + out[..copy_len].copy_from_slice(&self.internal_buffer[self.out_pos..][..copy_len]); self.out_pos += copy_len; outcome.bytes_written += copy_len; self.remain_in_internal_buffer -= copy_len; - out_buf = &mut out_buf[copy_len..]; + out = &mut out[copy_len..]; // if we've reached the end of the buffer, wrap around if self.out_pos == self.internal_buffer.len() { diff --git a/rc-zip/src/fsm/entry/lzma_dec.rs b/rc-zip/src/fsm/entry/lzma_dec.rs index 8c80e65..e9564eb 100644 --- a/rc-zip/src/fsm/entry/lzma_dec.rs +++ b/rc-zip/src/fsm/entry/lzma_dec.rs @@ -1,12 +1,28 @@ +use std::{cmp, io::Write}; + use crate::{error::Error, parse::Method}; use super::{DecompressOutcome, Decompressor, HasMoreInput}; -pub(crate) struct LzmaDec {} +use lzma_rs::decompress::Stream; +use tracing::trace; + +pub(crate) struct LzmaDec { + stream: Stream>, +} + +impl LzmaDec { + pub fn new(uncompressed_size: u64) -> Self { + let memlimit = 128 * 1024 * 1024; + let opts = lzma_rs::decompress::Options { + unpacked_size: lzma_rs::decompress::UnpackedSize::UseProvided(Some(uncompressed_size)), + allow_incomplete: false, + memlimit: Some(memlimit), + }; -impl Default for LzmaDec { - fn default() -> Self { - Self {} + Self { + stream: Stream::new_with_options(&opts, vec![]), + } } } @@ -17,6 +33,53 @@ impl Decompressor for LzmaDec { out: &mut [u8], _has_more_input: HasMoreInput, ) -> Result { - todo!() + tracing::trace!( + in_buf_len = in_buf.len(), + out_len = out.len(), + remain_in_internal_buffer = self.stream.get_output_mut().unwrap().len(), + "DeflateDec::decompress", + ); + + let mut outcome: DecompressOutcome = Default::default(); + + self.copy_to_out(out, &mut outcome); + if outcome.bytes_written > 0 { + trace!("LzmaDec: bytes_written > 0"); + return Ok(outcome); + } + + let n = self + .stream + .write(in_buf) + .map_err(|e| 
Error::Decompression { + method: Method::Lzma, + msg: e.to_string(), + })?; + trace!("LzmaDec: wrote n = {}", n); + outcome.bytes_read = n; + + self.copy_to_out(out, &mut outcome); + trace!("LzmaDec: bytes_written = {}", outcome.bytes_written); + Ok(outcome) + } +} + +impl LzmaDec { + fn copy_to_out(&mut self, mut out: &mut [u8], outcome: &mut DecompressOutcome) { + let internal_buf = self.stream.get_output_mut().unwrap(); + + while !out.is_empty() && !internal_buf.is_empty() { + let to_copy = cmp::min(out.len(), internal_buf.len()); + trace!("LzmaDec: to_copy = {}", to_copy); + out[..to_copy].copy_from_slice(&internal_buf[..to_copy]); + out = &mut out[to_copy..]; + + // rotate the internal buffer + internal_buf.rotate_left(to_copy); + // and shrink it + internal_buf.resize(internal_buf.len() - to_copy, 0); + + outcome.bytes_written += to_copy; + } } } diff --git a/rc-zip/src/fsm/entry/mod.rs b/rc-zip/src/fsm/entry/mod.rs index bc9681a..54b77d3 100644 --- a/rc-zip/src/fsm/entry/mod.rs +++ b/rc-zip/src/fsm/entry/mod.rs @@ -151,7 +151,7 @@ impl EntryFsm { compressed_bytes: 0, uncompressed_bytes: 0, hasher: crc32fast::Hasher::new(), - decompressor: AnyDecompressor::new(self.method)?, + decompressor: AnyDecompressor::new(self.method, &self.entry)?, }; self.process(out) } @@ -183,6 +183,14 @@ impl EntryFsm { HasMoreInput::Yes }; let outcome = decompressor.decompress(in_buf, out, has_more_input)?; + trace!( + compressed_bytes = *compressed_bytes, + uncompressed_bytes = *uncompressed_bytes, + bytes_read = outcome.bytes_read, + bytes_written = outcome.bytes_written, + eof = self.eof, + "decompressed" + ); self.buffer.consume(outcome.bytes_read); *compressed_bytes += outcome.bytes_read as u64; @@ -205,6 +213,12 @@ impl EntryFsm { // update the number of bytes we've decompressed *uncompressed_bytes += outcome.bytes_written as u64; + trace!( + compressed_bytes = *compressed_bytes, + uncompressed_bytes = *uncompressed_bytes, + "updated hasher" + ); + 
Ok(FsmResult::Continue((self, outcome))) } S::ReadDataDescriptor { .. } => { @@ -299,7 +313,7 @@ enum AnyDecompressor { #[cfg(feature = "bzip2")] Bzip2(bzip2_dec::Bzip2Dec), #[cfg(feature = "lzma")] - Lzma(lzma_dec::LzmaDec), + Lzma(Box), } #[derive(Default, Debug)] @@ -326,7 +340,7 @@ trait Decompressor { } impl AnyDecompressor { - fn new(method: Method) -> Result { + fn new(method: Method, entry: &StoredEntryInner) -> Result { let dec = match method { Method::Store => Self::Store(Default::default()), @@ -347,7 +361,7 @@ impl AnyDecompressor { } #[cfg(feature = "lzma")] - Method::Lzma => Self::Lzma(Default::default()), + Method::Lzma => Self::Lzma(Box::new(lzma_dec::LzmaDec::new(entry.uncompressed_size))), #[cfg(not(feature = "lzma"))] Method::Lzma => { let err = Error::Unsupported(UnsupportedError::MethodNotEnabled(method)); From b433f81e72bee64033f6c3a08f47c9a1c0473018 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 19:41:19 +0100 Subject: [PATCH 43/49] Fix conditional data descriptor thingy --- rc-zip-tokio/src/entry_reader.rs | 1 - rc-zip/src/fsm/entry/lzma_dec.rs | 1 + rc-zip/src/fsm/entry/mod.rs | 18 ++++++++++-------- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/rc-zip-tokio/src/entry_reader.rs b/rc-zip-tokio/src/entry_reader.rs index aea0405..c4af59b 100644 --- a/rc-zip-tokio/src/entry_reader.rs +++ b/rc-zip-tokio/src/entry_reader.rs @@ -51,7 +51,6 @@ where if fsm.wants_read() { tracing::trace!(space_avail = fsm.space().len(), "fsm wants read"); - let mut buf = ReadBuf::new(fsm.space()); match this.rd.poll_read(cx, &mut buf) { task::Poll::Ready(res) => res?, diff --git a/rc-zip/src/fsm/entry/lzma_dec.rs b/rc-zip/src/fsm/entry/lzma_dec.rs index e9564eb..8bd8dc6 100644 --- a/rc-zip/src/fsm/entry/lzma_dec.rs +++ b/rc-zip/src/fsm/entry/lzma_dec.rs @@ -13,6 +13,7 @@ pub(crate) struct LzmaDec { impl LzmaDec { pub fn new(uncompressed_size: u64) -> Self { + trace!(%uncompressed_size, "LzmaDec::new"); let memlimit = 128 * 1024 * 
1024; let opts = lzma_rs::decompress::Options { unpacked_size: lzma_rs::decompress::UnpackedSize::UseProvided(Some(uncompressed_size)), diff --git a/rc-zip/src/fsm/entry/mod.rs b/rc-zip/src/fsm/entry/mod.rs index 54b77d3..d518e68 100644 --- a/rc-zip/src/fsm/entry/mod.rs +++ b/rc-zip/src/fsm/entry/mod.rs @@ -184,10 +184,9 @@ impl EntryFsm { }; let outcome = decompressor.decompress(in_buf, out, has_more_input)?; trace!( + ?outcome, compressed_bytes = *compressed_bytes, uncompressed_bytes = *uncompressed_bytes, - bytes_read = outcome.bytes_read, - bytes_written = outcome.bytes_written, eof = self.eof, "decompressed" ); @@ -197,12 +196,15 @@ impl EntryFsm { if outcome.bytes_written == 0 && self.eof { // we're done, let's read the data descriptor (if there's one) transition!(self.state => (S::ReadData { header, uncompressed_bytes, hasher, .. }) { - S::ReadDataDescriptor { - header, - metrics: EntryReadMetrics { - uncompressed_size: uncompressed_bytes, - crc32: hasher.finalize(), - }, + let metrics = EntryReadMetrics { + uncompressed_size: uncompressed_bytes, + crc32: hasher.finalize(), + }; + + if header.has_data_descriptor() { + S::ReadDataDescriptor { header, metrics } + } else { + S::Validate { header, metrics, descriptor: None } } }); return self.process(out); From 35126fe39c4c8280e6b9d8d9fee631768e5e65a3 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 19:55:36 +0100 Subject: [PATCH 44/49] LZMA works --- rc-zip/src/fsm/entry/lzma_dec.rs | 112 ++++++++++++++++++++++++------- rc-zip/src/fsm/entry/mod.rs | 4 +- 2 files changed, 90 insertions(+), 26 deletions(-) diff --git a/rc-zip/src/fsm/entry/lzma_dec.rs b/rc-zip/src/fsm/entry/lzma_dec.rs index 8bd8dc6..e3ffc73 100644 --- a/rc-zip/src/fsm/entry/lzma_dec.rs +++ b/rc-zip/src/fsm/entry/lzma_dec.rs @@ -4,25 +4,35 @@ use crate::{error::Error, parse::Method}; use super::{DecompressOutcome, Decompressor, HasMoreInput}; -use lzma_rs::decompress::Stream; +use lzma_rs::decompress::{Options, Stream, 
UnpackedSize}; use tracing::trace; +#[derive(Default)] +enum State { + Writing(Box>>), + Draining(Vec), + + #[default] + Transition, +} + pub(crate) struct LzmaDec { - stream: Stream>, + state: State, } impl LzmaDec { pub fn new(uncompressed_size: u64) -> Self { - trace!(%uncompressed_size, "LzmaDec::new"); - let memlimit = 128 * 1024 * 1024; - let opts = lzma_rs::decompress::Options { - unpacked_size: lzma_rs::decompress::UnpackedSize::UseProvided(Some(uncompressed_size)), - allow_incomplete: false, - memlimit: Some(memlimit), - }; + let stream = Stream::new_with_options( + &(Options { + unpacked_size: UnpackedSize::UseProvided(Some(uncompressed_size)), + allow_incomplete: false, + memlimit: Some(128 * 1024 * 1024), + }), + vec![], + ); Self { - stream: Stream::new_with_options(&opts, vec![]), + state: State::Writing(Box::new(stream)), } } } @@ -32,12 +42,12 @@ impl Decompressor for LzmaDec { &mut self, in_buf: &[u8], out: &mut [u8], - _has_more_input: HasMoreInput, + has_more_input: HasMoreInput, ) -> Result { tracing::trace!( in_buf_len = in_buf.len(), out_len = out.len(), - remain_in_internal_buffer = self.stream.get_output_mut().unwrap().len(), + remain_in_internal_buffer = self.internal_buf_mut().len(), "DeflateDec::decompress", ); @@ -45,33 +55,85 @@ impl Decompressor for LzmaDec { self.copy_to_out(out, &mut outcome); if outcome.bytes_written > 0 { - trace!("LzmaDec: bytes_written > 0"); + trace!( + "LzmaDec: still draining internal buffer, just copied {} bytes", + outcome.bytes_written + ); return Ok(outcome); } - let n = self - .stream - .write(in_buf) - .map_err(|e| Error::Decompression { - method: Method::Lzma, - msg: e.to_string(), - })?; - trace!("LzmaDec: wrote n = {}", n); - outcome.bytes_read = n; + match &mut self.state { + State::Writing(stream) => { + let n = stream.write(in_buf).map_err(dec_err)?; + trace!( + "LzmaDec: wrote {} bytes to decompressor (of {} available)", + n, + in_buf.len() + ); + outcome.bytes_read = n; + + // if we haven't 
written all the input, and we haven't gotten + // any output, then we need to keep going + if n != 0 && n < in_buf.len() && self.internal_buf_mut().is_empty() { + trace!("LzmaDec: didn't write all output AND no output yet, so keep going"); + return self.decompress(&in_buf[n..], out, has_more_input); + } + + match has_more_input { + HasMoreInput::Yes => { + // keep going + trace!("LzmaDec: more input to come"); + } + HasMoreInput::No => { + trace!("LzmaDec: no more input to come"); + match std::mem::take(&mut self.state) { + State::Writing(stream) => { + trace!("LzmaDec: finishing..."); + self.state = State::Draining(stream.finish().map_err(dec_err)?); + } + _ => unreachable!(), + } + } + } + } + State::Draining(_) => { + // keep going + } + State::Transition => unreachable!(), + } self.copy_to_out(out, &mut outcome); - trace!("LzmaDec: bytes_written = {}", outcome.bytes_written); + trace!( + "LzmaDec: decompressor gave us {} bytes", + outcome.bytes_written + ); Ok(outcome) } } +fn dec_err(e: impl std::fmt::Display) -> Error { + Error::Decompression { + method: Method::Lzma, + msg: e.to_string(), + } +} + impl LzmaDec { + #[inline(always)] + fn internal_buf_mut(&mut self) -> &mut Vec { + match &mut self.state { + State::Writing(stream) => stream.get_output_mut().unwrap(), + State::Draining(buf) => buf, + State::Transition => unreachable!(), + } + } + fn copy_to_out(&mut self, mut out: &mut [u8], outcome: &mut DecompressOutcome) { - let internal_buf = self.stream.get_output_mut().unwrap(); + let internal_buf = self.internal_buf_mut(); while !out.is_empty() && !internal_buf.is_empty() { let to_copy = cmp::min(out.len(), internal_buf.len()); - trace!("LzmaDec: to_copy = {}", to_copy); + trace!("LzmaDec: copying {} bytes from internal buffer", to_copy); out[..to_copy].copy_from_slice(&internal_buf[..to_copy]); out = &mut out[to_copy..]; diff --git a/rc-zip/src/fsm/entry/mod.rs b/rc-zip/src/fsm/entry/mod.rs index d518e68..d576fde 100644 --- a/rc-zip/src/fsm/entry/mod.rs 
+++ b/rc-zip/src/fsm/entry/mod.rs @@ -177,7 +177,9 @@ impl EntryFsm { ); let in_buf = &in_buf[..in_buf_max_len]; - let has_more_input = if *compressed_bytes == self.entry.compressed_size as _ { + let fed_bytes_after_this = *compressed_bytes + in_buf.len() as u64; + + let has_more_input = if fed_bytes_after_this == self.entry.compressed_size as _ { HasMoreInput::No } else { HasMoreInput::Yes From 6ac4b5494e41e4824ed2e65647292f6a9ec6a7c3 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 19:56:39 +0100 Subject: [PATCH 45/49] Explain what on cthulu's blue sea is going on with that n != 0 condition --- rc-zip/src/fsm/entry/lzma_dec.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/rc-zip/src/fsm/entry/lzma_dec.rs b/rc-zip/src/fsm/entry/lzma_dec.rs index e3ffc73..29a5b6e 100644 --- a/rc-zip/src/fsm/entry/lzma_dec.rs +++ b/rc-zip/src/fsm/entry/lzma_dec.rs @@ -75,6 +75,10 @@ impl Decompressor for LzmaDec { // if we haven't written all the input, and we haven't gotten // any output, then we need to keep going if n != 0 && n < in_buf.len() && self.internal_buf_mut().is_empty() { + // note: the n != 0 here is because apparently there can be a 10-byte + // trailer after LZMA compressed data? and the decoder will _refuse_ + // to let us write them, so when we have just these 10 bytes left, + // it's good to just let the decoder finish up. 
trace!("LzmaDec: didn't write all output AND no output yet, so keep going"); return self.decompress(&in_buf[n..], out, has_more_input); } From e43f575c0b9f34979bcc7c7d52e472a791bb54e2 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 20:09:18 +0100 Subject: [PATCH 46/49] Re-add zstd support --- Cargo.lock | 39 +++++++ rc-zip-sync/tests/integration_tests.rs | 2 +- rc-zip-tokio/tests/integration_tests.rs | 2 +- rc-zip/Cargo.toml | 3 +- rc-zip/src/fsm/entry/mod.rs | 15 +++ rc-zip/src/fsm/entry/zstd_dec.rs | 144 ++++++++++++++++++++++++ 6 files changed, 202 insertions(+), 3 deletions(-) create mode 100644 rc-zip/src/fsm/entry/zstd_dec.rs diff --git a/Cargo.lock b/Cargo.lock index 64dfed0..1066f14 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -155,6 +155,7 @@ version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" dependencies = [ + "jobserver", "libc", ] @@ -482,6 +483,15 @@ version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" +[[package]] +name = "jobserver" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d" +dependencies = [ + "libc", +] + [[package]] name = "js-sys" version = "0.3.67" @@ -785,6 +795,7 @@ dependencies = [ "thiserror", "tracing", "winnow", + "zstd", ] [[package]] @@ -1289,3 +1300,31 @@ checksum = "818ce546a11a9986bc24f93d0cdf38a8a1a400f1473ea8c82e59f6e0ffab9249" dependencies = [ "memchr", ] + +[[package]] +name = "zstd" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "43747c7422e2924c11144d5229878b98180ef8b06cca4ab5af37afc8a8d8ea3e" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.9+zstd.1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/rc-zip-sync/tests/integration_tests.rs b/rc-zip-sync/tests/integration_tests.rs index 3da3967..a459318 100644 --- a/rc-zip-sync/tests/integration_tests.rs +++ b/rc-zip-sync/tests/integration_tests.rs @@ -41,7 +41,7 @@ fn read_from_file() { #[test_log::test] fn real_world_files() { for case in corpus::test_cases() { - tracing::trace!("============ testing {}", case.name); + tracing::info!("============ testing {}", case.name); let file = File::open(case.absolute_path()).unwrap(); let archive = file.read_zip().map_err(Error::from); diff --git a/rc-zip-tokio/tests/integration_tests.rs b/rc-zip-tokio/tests/integration_tests.rs index b4e55e6..7cf49c2 100644 --- a/rc-zip-tokio/tests/integration_tests.rs +++ b/rc-zip-tokio/tests/integration_tests.rs @@ -42,7 +42,7 @@ async fn read_from_file() { #[test_log::test(tokio::test)] async fn real_world_files() { for case in corpus::test_cases() { - tracing::trace!("============ testing {}", case.name); + tracing::info!("============ testing {}", case.name); let file = Arc::new(RandomAccessFile::open(case.absolute_path()).unwrap()); let archive = file.read_zip_async().await; diff --git a/rc-zip/Cargo.toml b/rc-zip/Cargo.toml index bfde897..9fe5309 100644 --- a/rc-zip/Cargo.toml +++ b/rc-zip/Cargo.toml @@ -31,6 +31,7 @@ crc32fast = "1.3.2" miniz_oxide = { version = "0.7.1", optional = true } bzip2 = { version = "0.4.4", optional = true } lzma-rs = { version = "0.3.0", optional = true, features = ["stream"] } +zstd = { version = "0.13.0", optional = true } [features] corpus = [] @@ -38,7 +39,7 @@ deflate 
= ["dep:miniz_oxide"] deflate64 = [] bzip2 = ["dep:bzip2"] lzma = ["dep:lzma-rs"] -zstd = [] +zstd = ["dep:zstd"] [dev-dependencies] test-log = { version = "0.2.14", default-features = false, features = ["tracing-subscriber", "trace"] } diff --git a/rc-zip/src/fsm/entry/mod.rs b/rc-zip/src/fsm/entry/mod.rs index d576fde..1f42959 100644 --- a/rc-zip/src/fsm/entry/mod.rs +++ b/rc-zip/src/fsm/entry/mod.rs @@ -19,6 +19,9 @@ mod bzip2_dec; #[cfg(feature = "lzma")] mod lzma_dec; +#[cfg(feature = "zstd")] +mod zstd_dec; + use crate::{ error::{Error, FormatError, UnsupportedError}, parse::{DataDescriptorRecord, LocalFileHeaderRecord, Method, StoredEntryInner}, @@ -318,6 +321,8 @@ enum AnyDecompressor { Bzip2(bzip2_dec::Bzip2Dec), #[cfg(feature = "lzma")] Lzma(Box), + #[cfg(feature = "zstd")] + Zstd(zstd_dec::ZstdDec), } #[derive(Default, Debug)] @@ -372,6 +377,14 @@ impl AnyDecompressor { return Err(err); } + #[cfg(feature = "zstd")] + Method::Zstd => Self::Zstd(zstd_dec::ZstdDec::new()?), + #[cfg(not(feature = "zstd"))] + Method::Zstd => { + let err = Error::Unsupported(UnsupportedError::MethodNotEnabled(method)); + return Err(err); + } + _ => { let err = Error::Unsupported(UnsupportedError::MethodNotSupported(method)); return Err(err); @@ -398,6 +411,8 @@ impl Decompressor for AnyDecompressor { Self::Bzip2(dec) => dec.decompress(in_buf, out, has_more_input), #[cfg(feature = "lzma")] Self::Lzma(dec) => dec.decompress(in_buf, out, has_more_input), + #[cfg(feature = "zstd")] + Self::Zstd(dec) => dec.decompress(in_buf, out, has_more_input), } } } diff --git a/rc-zip/src/fsm/entry/zstd_dec.rs b/rc-zip/src/fsm/entry/zstd_dec.rs new file mode 100644 index 0000000..e687693 --- /dev/null +++ b/rc-zip/src/fsm/entry/zstd_dec.rs @@ -0,0 +1,144 @@ +use std::{cmp, io::Write}; + +use crate::{error::Error, parse::Method}; + +use super::{DecompressOutcome, Decompressor, HasMoreInput}; + +use tracing::trace; +use zstd::stream::write::Decoder; + +#[derive(Default)] +enum State { + 
Writing(Box>>), + Draining(Vec), + + #[default] + Transition, +} + +pub(crate) struct ZstdDec { + state: State, +} + +impl ZstdDec { + pub fn new() -> Result { + Ok(Self { + state: State::Writing(Box::new(Decoder::new(vec![])?)), + }) + } +} + +impl Decompressor for ZstdDec { + fn decompress( + &mut self, + in_buf: &[u8], + out: &mut [u8], + has_more_input: HasMoreInput, + ) -> Result { + tracing::trace!( + in_buf_len = in_buf.len(), + out_len = out.len(), + remain_in_internal_buffer = self.internal_buf_mut().len(), + "DeflateDec::decompress", + ); + + let mut outcome: DecompressOutcome = Default::default(); + + self.copy_to_out(out, &mut outcome); + if outcome.bytes_written > 0 { + trace!( + "ZstdDec: still draining internal buffer, just copied {} bytes", + outcome.bytes_written + ); + return Ok(outcome); + } + + match &mut self.state { + State::Writing(stream) => { + let n = stream.write(in_buf).map_err(dec_err)?; + trace!( + "ZstdDec: wrote {} bytes to decompressor (of {} available)", + n, + in_buf.len() + ); + outcome.bytes_read = n; + + // if we haven't written all the input, and we haven't gotten + // any output, then we need to keep going + if n != 0 && n < in_buf.len() && self.internal_buf_mut().is_empty() { + // note: the n != 0 here is because apparently there can be a 10-byte + // trailer after LZMA compressed data? and the decoder will _refuse_ + // to let us write them, so when we have just these 10 bytes left, + // it's good to just let the decoder finish up. 
+ trace!("ZstdDec: didn't write all output AND no output yet, so keep going"); + return self.decompress(&in_buf[n..], out, has_more_input); + } + + match has_more_input { + HasMoreInput::Yes => { + // keep going + trace!("ZstdDec: more input to come"); + } + HasMoreInput::No => { + trace!("ZstdDec: no more input to come"); + match std::mem::take(&mut self.state) { + State::Writing(mut stream) => { + trace!("ZstdDec: finishing..."); + stream.flush().map_err(dec_err)?; + self.state = State::Draining(stream.into_inner()); + } + _ => unreachable!(), + } + } + } + } + State::Draining(_) => { + // keep going + } + State::Transition => unreachable!(), + } + + self.copy_to_out(out, &mut outcome); + trace!( + "ZstdDec: decompressor gave us {} bytes", + outcome.bytes_written + ); + Ok(outcome) + } +} + +fn dec_err(e: impl std::fmt::Display) -> Error { + Error::Decompression { + method: Method::Zstd, + msg: e.to_string(), + } +} + +impl ZstdDec { + #[inline(always)] + fn internal_buf_mut(&mut self) -> &mut Vec { + match &mut self.state { + State::Writing(stream) => stream.get_mut(), + State::Draining(buf) => buf, + State::Transition => unreachable!(), + } + } + + fn copy_to_out(&mut self, mut out: &mut [u8], outcome: &mut DecompressOutcome) { + let internal_buf = self.internal_buf_mut(); + + while !out.is_empty() && !internal_buf.is_empty() { + let to_copy = cmp::min(out.len(), internal_buf.len()); + trace!("ZstdDec: copying {} bytes from internal buffer", to_copy); + out[..to_copy].copy_from_slice(&internal_buf[..to_copy]); + out = &mut out[to_copy..]; + + // rotate the internal buffer + internal_buf.rotate_left(to_copy); + // and shrink it + internal_buf.resize(internal_buf.len() - to_copy, 0); + + outcome.bytes_written += to_copy; + } + } +} From a8b0c4f02c3e21fcf63143a0d2cf11e93e8a51e8 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 20:21:26 +0100 Subject: [PATCH 47/49] Re-add deflate64 support --- Cargo.lock | 7 ++++ rc-zip/Cargo.toml | 3 +- 
rc-zip/src/fsm/entry/deflate64_dec.rs | 47 +++++++++++++++++++++++++++ rc-zip/src/fsm/entry/deflate_dec.rs | 2 +- rc-zip/src/fsm/entry/lzma_dec.rs | 21 +++++------- rc-zip/src/fsm/entry/mod.rs | 15 +++++++++ rc-zip/src/fsm/entry/zstd_dec.rs | 21 +++++------- 7 files changed, 90 insertions(+), 26 deletions(-) create mode 100644 rc-zip/src/fsm/entry/deflate64_dec.rs diff --git a/Cargo.lock b/Cargo.lock index 1066f14..eaa3efb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -279,6 +279,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "deflate64" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9576c1de19747eb6f5efb6a806c3e836512bbdb17bfedc984ccb0bcc953c8390" + [[package]] name = "encode_unicode" version = "0.3.6" @@ -784,6 +790,7 @@ dependencies = [ "chardetng", "chrono", "crc32fast", + "deflate64", "encoding_rs", "lzma-rs", "miniz_oxide", diff --git a/rc-zip/Cargo.toml b/rc-zip/Cargo.toml index 9fe5309..9177949 100644 --- a/rc-zip/Cargo.toml +++ b/rc-zip/Cargo.toml @@ -29,6 +29,7 @@ num_enum = "0.7.2" cfg-if = "1.0.0" crc32fast = "1.3.2" miniz_oxide = { version = "0.7.1", optional = true } +deflate64 = { version = "0.1.7", optional = true } bzip2 = { version = "0.4.4", optional = true } lzma-rs = { version = "0.3.0", optional = true, features = ["stream"] } zstd = { version = "0.13.0", optional = true } @@ -36,7 +37,7 @@ zstd = { version = "0.13.0", optional = true } [features] corpus = [] deflate = ["dep:miniz_oxide"] -deflate64 = [] +deflate64 = ["dep:deflate64"] bzip2 = ["dep:bzip2"] lzma = ["dep:lzma-rs"] zstd = ["dep:zstd"] diff --git a/rc-zip/src/fsm/entry/deflate64_dec.rs b/rc-zip/src/fsm/entry/deflate64_dec.rs new file mode 100644 index 0000000..0a2cfec --- /dev/null +++ b/rc-zip/src/fsm/entry/deflate64_dec.rs @@ -0,0 +1,47 @@ +use deflate64::InflaterManaged; + +use crate::{error::Error, parse::Method}; + +use super::{DecompressOutcome, Decompressor, HasMoreInput}; + +pub(crate) struct Deflate64Dec { + 
inflater: InflaterManaged, +} + +impl Default for Deflate64Dec { + fn default() -> Self { + Self { + inflater: InflaterManaged::new(), + } + } +} + +impl Decompressor for Deflate64Dec { + fn decompress( + &mut self, + in_buf: &[u8], + out: &mut [u8], + _has_more_input: HasMoreInput, + ) -> Result { + tracing::trace!( + in_buf_len = in_buf.len(), + out_len = out.len(), + remain_in_internal_buffer = self.inflater.available_output(), + "decompress", + ); + + let res = self.inflater.inflate(in_buf, out); + if res.data_error { + return Err(Error::Decompression { + method: Method::Deflate64, + msg: "data error".into(), + }); + } + + let outcome = DecompressOutcome { + bytes_read: res.bytes_consumed, + bytes_written: res.bytes_written, + }; + Ok(outcome) + } +} diff --git a/rc-zip/src/fsm/entry/deflate_dec.rs b/rc-zip/src/fsm/entry/deflate_dec.rs index 48b2b22..db405b6 100644 --- a/rc-zip/src/fsm/entry/deflate_dec.rs +++ b/rc-zip/src/fsm/entry/deflate_dec.rs @@ -60,7 +60,7 @@ impl Decompressor for DeflateDec { out_len = out.len(), remain_in_internal_buffer = self.remain_in_internal_buffer, out_pos = self.out_pos, - "DeflateDec::decompress", + "decompress", ); let mut outcome: DecompressOutcome = Default::default(); diff --git a/rc-zip/src/fsm/entry/lzma_dec.rs b/rc-zip/src/fsm/entry/lzma_dec.rs index 29a5b6e..0b98890 100644 --- a/rc-zip/src/fsm/entry/lzma_dec.rs +++ b/rc-zip/src/fsm/entry/lzma_dec.rs @@ -48,7 +48,7 @@ impl Decompressor for LzmaDec { in_buf_len = in_buf.len(), out_len = out.len(), remain_in_internal_buffer = self.internal_buf_mut().len(), - "DeflateDec::decompress", + "decompress", ); let mut outcome: DecompressOutcome = Default::default(); @@ -56,7 +56,7 @@ impl Decompressor for LzmaDec { self.copy_to_out(out, &mut outcome); if outcome.bytes_written > 0 { trace!( - "LzmaDec: still draining internal buffer, just copied {} bytes", + "still draining internal buffer, just copied {} bytes", outcome.bytes_written ); return Ok(outcome); @@ -66,7 +66,7 @@ impl 
Decompressor for LzmaDec { State::Writing(stream) => { let n = stream.write(in_buf).map_err(dec_err)?; trace!( - "LzmaDec: wrote {} bytes to decompressor (of {} available)", + "wrote {} bytes to decompressor (of {} available)", n, in_buf.len() ); @@ -79,20 +79,20 @@ impl Decompressor for LzmaDec { // trailer after LZMA compressed data? and the decoder will _refuse_ // to let us write them, so when we have just these 10 bytes left, // it's good to just let the decoder finish up. - trace!("LzmaDec: didn't write all output AND no output yet, so keep going"); + trace!("didn't write all output AND no output yet, so keep going"); return self.decompress(&in_buf[n..], out, has_more_input); } match has_more_input { HasMoreInput::Yes => { // keep going - trace!("LzmaDec: more input to come"); + trace!("more input to come"); } HasMoreInput::No => { - trace!("LzmaDec: no more input to come"); + trace!("no more input to come"); match std::mem::take(&mut self.state) { State::Writing(stream) => { - trace!("LzmaDec: finishing..."); + trace!("finishing..."); self.state = State::Draining(stream.finish().map_err(dec_err)?); } _ => unreachable!(), @@ -107,10 +107,7 @@ impl Decompressor for LzmaDec { } self.copy_to_out(out, &mut outcome); - trace!( - "LzmaDec: decompressor gave us {} bytes", - outcome.bytes_written - ); + trace!("decompressor gave us {} bytes", outcome.bytes_written); Ok(outcome) } } @@ -137,7 +134,7 @@ impl LzmaDec { while !out.is_empty() && !internal_buf.is_empty() { let to_copy = cmp::min(out.len(), internal_buf.len()); - trace!("LzmaDec: copying {} bytes from internal buffer", to_copy); + trace!("copying {} bytes from internal buffer", to_copy); out[..to_copy].copy_from_slice(&internal_buf[..to_copy]); out = &mut out[to_copy..]; diff --git a/rc-zip/src/fsm/entry/mod.rs b/rc-zip/src/fsm/entry/mod.rs index 1f42959..8b451c0 100644 --- a/rc-zip/src/fsm/entry/mod.rs +++ b/rc-zip/src/fsm/entry/mod.rs @@ -13,6 +13,9 @@ mod store_dec; #[cfg(feature = "deflate")] mod 
deflate_dec; +#[cfg(feature = "deflate64")] +mod deflate64_dec; + #[cfg(feature = "bzip2")] mod bzip2_dec; @@ -317,6 +320,8 @@ enum AnyDecompressor { Store(store_dec::StoreDec), #[cfg(feature = "deflate")] Deflate(Box), + #[cfg(feature = "deflate64")] + Deflate64(Box), #[cfg(feature = "bzip2")] Bzip2(bzip2_dec::Bzip2Dec), #[cfg(feature = "lzma")] @@ -361,6 +366,14 @@ impl AnyDecompressor { return Err(err); } + #[cfg(feature = "deflate64")] + Method::Deflate64 => Self::Deflate64(Default::default()), + #[cfg(not(feature = "deflate64"))] + Method::Deflate64 => { + let err = Error::Unsupported(UnsupportedError::MethodNotEnabled(method)); + return Err(err); + } + #[cfg(feature = "bzip2")] Method::Bzip2 => Self::Bzip2(Default::default()), #[cfg(not(feature = "bzip2"))] @@ -407,6 +420,8 @@ impl Decompressor for AnyDecompressor { Self::Store(dec) => dec.decompress(in_buf, out, has_more_input), #[cfg(feature = "deflate")] Self::Deflate(dec) => dec.decompress(in_buf, out, has_more_input), + #[cfg(feature = "deflate64")] + Self::Deflate64(dec) => dec.decompress(in_buf, out, has_more_input), #[cfg(feature = "bzip2")] Self::Bzip2(dec) => dec.decompress(in_buf, out, has_more_input), #[cfg(feature = "lzma")] diff --git a/rc-zip/src/fsm/entry/zstd_dec.rs b/rc-zip/src/fsm/entry/zstd_dec.rs index e687693..276fefc 100644 --- a/rc-zip/src/fsm/entry/zstd_dec.rs +++ b/rc-zip/src/fsm/entry/zstd_dec.rs @@ -39,7 +39,7 @@ impl Decompressor for ZstdDec { in_buf_len = in_buf.len(), out_len = out.len(), remain_in_internal_buffer = self.internal_buf_mut().len(), - "DeflateDec::decompress", + "decompress", ); let mut outcome: DecompressOutcome = Default::default(); @@ -47,7 +47,7 @@ impl Decompressor for ZstdDec { self.copy_to_out(out, &mut outcome); if outcome.bytes_written > 0 { trace!( - "ZstdDec: still draining internal buffer, just copied {} bytes", + "still draining internal buffer, just copied {} bytes", outcome.bytes_written ); return Ok(outcome); @@ -57,7 +57,7 @@ impl Decompressor for 
ZstdDec { State::Writing(stream) => { let n = stream.write(in_buf).map_err(dec_err)?; trace!( - "ZstdDec: wrote {} bytes to decompressor (of {} available)", + "wrote {} bytes to decompressor (of {} available)", n, in_buf.len() ); @@ -70,20 +70,20 @@ impl Decompressor for ZstdDec { // trailer after LZMA compressed data? and the decoder will _refuse_ // to let us write them, so when we have just these 10 bytes left, // it's good to just let the decoder finish up. - trace!("ZstdDec: didn't write all output AND no output yet, so keep going"); + trace!("didn't write all output AND no output yet, so keep going"); return self.decompress(&in_buf[n..], out, has_more_input); } match has_more_input { HasMoreInput::Yes => { // keep going - trace!("ZstdDec: more input to come"); + trace!("more input to come"); } HasMoreInput::No => { - trace!("ZstdDec: no more input to come"); + trace!("no more input to come"); match std::mem::take(&mut self.state) { State::Writing(mut stream) => { - trace!("ZstdDec: finishing..."); + trace!("finishing..."); stream.flush().map_err(dec_err)?; self.state = State::Draining(stream.into_inner()); } @@ -99,10 +99,7 @@ impl Decompressor for ZstdDec { } self.copy_to_out(out, &mut outcome); - trace!( - "ZstdDec: decompressor gave us {} bytes", - outcome.bytes_written - ); + trace!("decompressor gave us {} bytes", outcome.bytes_written); Ok(outcome) } } @@ -129,7 +126,7 @@ impl ZstdDec { while !out.is_empty() && !internal_buf.is_empty() { let to_copy = cmp::min(out.len(), internal_buf.len()); - trace!("ZstdDec: copying {} bytes from internal buffer", to_copy); + trace!("copying {} bytes from internal buffer", to_copy); out[..to_copy].copy_from_slice(&internal_buf[..to_copy]); out = &mut out[to_copy..]; From c87711d69eff445244ec1bce6af19055904ae0a7 Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 20:22:24 +0100 Subject: [PATCH 48/49] Fix docs --- rc-zip-tokio/src/async_read_zip.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/rc-zip-tokio/src/async_read_zip.rs b/rc-zip-tokio/src/async_read_zip.rs index 336715a..bc68be0 100644 --- a/rc-zip-tokio/src/async_read_zip.rs +++ b/rc-zip-tokio/src/async_read_zip.rs @@ -14,7 +14,7 @@ use crate::entry_reader::EntryReader; /// A trait for reading something as a zip archive. /// -/// See also [AsyncReadZip]. +/// See also [ReadZipAsync]. pub trait ReadZipWithSizeAsync { /// The type of the file to read from. type File: HasAsyncCursor; From a532d35b45917d73eb812c4d831e076d9cd9959f Mon Sep 17 00:00:00 2001 From: Amos Wenger Date: Fri, 2 Feb 2024 20:23:10 +0100 Subject: [PATCH 49/49] Fix docs --- rc-zip/src/fsm/entry/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rc-zip/src/fsm/entry/mod.rs b/rc-zip/src/fsm/entry/mod.rs index 8b451c0..db72965 100644 --- a/rc-zip/src/fsm/entry/mod.rs +++ b/rc-zip/src/fsm/entry/mod.rs @@ -354,7 +354,7 @@ trait Decompressor { } impl AnyDecompressor { - fn new(method: Method, entry: &StoredEntryInner) -> Result { + fn new(method: Method, #[allow(unused)] entry: &StoredEntryInner) -> Result { let dec = match method { Method::Store => Self::Store(Default::default()),