Skip to content

Commit

Permalink
Make built-in adapters' identifiers configurable
Browse files Browse the repository at this point in the history
This will allow end users to provide their own lists of extensions and/or
mimetypes for each of the built-in adapters.
  • Loading branch information
lafrenierejm committed Sep 15, 2024
1 parent 6b79490 commit 80cbb9d
Show file tree
Hide file tree
Showing 14 changed files with 689 additions and 318 deletions.
209 changes: 182 additions & 27 deletions src/adapters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ pub mod zip;
use crate::{adapted_iter::AdaptedFilesIterBox, config::RgaConfig, matching::*};
use anyhow::{format_err, Context, Result};
use async_trait::async_trait;
use custom::strs;
use custom::CustomAdapterConfig;
use custom::CustomIdentifiers;
use custom::BUILTIN_SPAWNING_ADAPTERS;
use log::*;
use tokio::io::AsyncRead;
Expand All @@ -38,7 +40,7 @@ pub struct AdapterMeta {
pub fast_matchers: Vec<FastFileMatcher>,
/// list of matchers when we have mime type detection active (interpreted as ORed)
/// warning: this *overrides* the fast matchers
pub slow_matchers: Option<Vec<FileMatcher>>,
pub slow_matchers: Vec<FileMatcher>,
/// if true, slow_matchers is merged with fast matchers if accurate is enabled
/// for example, in sqlite you want this disabled since the db extension can mean other things and the mime type matching is very accurate for sqlite.
/// but for tar you want it enabled, since the tar extension is very accurate but the tar mime matcher can have false negatives
Expand All @@ -48,39 +50,63 @@ pub struct AdapterMeta {
}
impl AdapterMeta {
// todo: this is pretty ugly
pub fn get_matchers<'a>(
&'a self,
slow: bool,
) -> Box<dyn Iterator<Item = Cow<FileMatcher>> + 'a> {
pub fn get_matchers(&self, slow: bool) -> Box<dyn Iterator<Item = Cow<FileMatcher>> + '_> {
match (
slow,
self.keep_fast_matchers_if_accurate,
&self.slow_matchers,
&self.fast_matchers,
) {
(true, false, Some(ref sm)) => Box::new(sm.iter().map(Cow::Borrowed)),
(true, true, Some(ref sm)) => Box::new(
(true, false, sm, _) => Box::new(sm.iter().map(Cow::Borrowed)),
(true, true, sm, fm) => Box::new(
sm.iter().map(Cow::Borrowed).chain(
self.fast_matchers
.iter()
.map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))),
fm.iter()
.map(|e| Cow::Owned(FileMatcher::Fast(e.clone())))
.collect::<Vec<_>>(),
),
),
// don't have slow matchers or slow matching disabled
(true, _, None) | (false, _, _) => Box::new(
self.fast_matchers
.iter()
.map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))),
),
// slow matching disabled
(false, _, _, fm) => {
Box::new(fm.iter().map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))))
}
}
}
}

pub trait GetMetadata {
fn metadata(&self) -> &AdapterMeta;
/// Static description of an adapter, from which its [`AdapterMeta`] is built.
///
/// Implementors supply the individual properties; the provided
/// [`Adapter::metadata`] method assembles them into an `AdapterMeta`.
pub trait Adapter {
    /// Adapter name, stored as `AdapterMeta::name`.
    fn name(&self) -> String;
    /// Adapter version, stored as `AdapterMeta::version`.
    fn version(&self) -> i32;
    /// Adapter description, stored as `AdapterMeta::description`.
    fn description(&self) -> String;
    /// Value reported as `AdapterMeta::recurses`.
    fn recurses(&self) -> bool;
    /// Value reported as `AdapterMeta::disabled_by_default`.
    fn disabled_by_default(&self) -> bool;
    /// Value reported as `AdapterMeta::keep_fast_matchers_if_accurate`.
    fn keep_fast_matchers_if_accurate(&self) -> bool;
    /// File extensions; each becomes a `FastFileMatcher::FileExtension`
    /// entry in `AdapterMeta::fast_matchers`.
    fn extensions(&self) -> Vec<String>;
    /// Mime types; each becomes a `FileMatcher::MimeType` entry in
    /// `AdapterMeta::slow_matchers`.
    fn mimetypes(&self) -> Vec<String>;

    /// Builds the [`AdapterMeta`] for this adapter from the accessor methods
    /// above.
    fn metadata(&self) -> AdapterMeta {
        AdapterMeta {
            name: self.name(),
            version: self.version(),
            description: self.description(),
            // Honor the implementor's answer rather than assuming every
            // adapter recurses.
            recurses: self.recurses(),
            // The accessors return owned vectors, so move the strings into
            // the matchers instead of cloning each one.
            fast_matchers: self
                .extensions()
                .into_iter()
                .map(FastFileMatcher::FileExtension)
                .collect(),
            slow_matchers: self
                .mimetypes()
                .into_iter()
                .map(FileMatcher::MimeType)
                .collect(),
            disabled_by_default: self.disabled_by_default(),
            keep_fast_matchers_if_accurate: self.keep_fast_matchers_if_accurate(),
        }
    }
}

#[async_trait]
pub trait FileAdapter: GetMetadata + Send + Sync {
pub trait FileAdapter: Adapter + Send + Sync {
/// adapt a file.
///
/// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher
Expand Down Expand Up @@ -109,7 +135,110 @@ pub struct AdaptInfo {
/// Pair of adapter lists in the order `(enabled adapters, disabled adapters)`.
type AdaptersTuple = (Vec<Arc<dyn FileAdapter>>, Vec<Arc<dyn FileAdapter>>);

pub fn get_all_adapters(custom_adapters: Option<Vec<CustomAdapterConfig>>) -> AdaptersTuple {
pub fn get_all_adapters(
custom_identifiers: Option<CustomIdentifiers>,
custom_adapters: Option<Vec<CustomAdapterConfig>>,
) -> AdaptersTuple {
// decompress
let mut bz2_extensions = strs(decompress::EXTENSIONS_BZ2);
let mut bz2_mimetypes = strs(decompress::MIMETYPES_BZ2);
let mut gz_extensions = strs(decompress::EXTENSIONS_GZ);
let mut gz_mimetypes = strs(decompress::MIMETYPES_GZ);
let mut xz_extensions = strs(decompress::EXTENSIONS_XZ);
let mut xz_mimetypes = strs(decompress::MIMETYPES_XZ);
let mut zst_extensions = strs(decompress::EXTENSIONS_ZST);
let mut zst_mimetypes = strs(decompress::MIMETYPES_ZST);

let mut ffmpeg_extensions = strs(ffmpeg::EXTENSIONS);
let mut ffmpeg_mimetypes = strs(ffmpeg::MIMETYPES);

let mut mbox_extensions = strs(mbox::EXTENSIONS);
let mut mbox_mimetypes = strs(mbox::MIMETYPES);

let mut sqlite_extensions = strs(sqlite::EXTENSIONS);
let mut sqlite_mimetypes = strs(sqlite::MIMETYPES);

let mut tar_extensions = strs(tar::EXTENSIONS);
let mut tar_mimetypes = strs(tar::MIMETYPES);

let mut zip_extensions = strs(zip::EXTENSIONS);
let mut zip_mimetypes = strs(zip::MIMETYPES);

if let Some(identifiers) = custom_identifiers {
if let Some(identifier) = identifiers.bz2 {
if let Some(extensions) = identifier.extensions {
bz2_extensions = extensions;
}
if let Some(mimetypes) = identifier.mimetypes {
bz2_mimetypes = mimetypes;
}
}
if let Some(identifier) = identifiers.gz {
if let Some(extensions) = identifier.extensions {
gz_extensions = extensions;
}
if let Some(mimetypes) = identifier.mimetypes {
gz_mimetypes = mimetypes;
}
}
if let Some(identifier) = identifiers.xz {
if let Some(extensions) = identifier.extensions {
xz_extensions = extensions;
}
if let Some(mimetypes) = identifier.mimetypes {
xz_mimetypes = mimetypes;
}
}
if let Some(identifier) = identifiers.zst {
if let Some(extensions) = identifier.extensions {
zst_extensions = extensions;
}
if let Some(mimetypes) = identifier.mimetypes {
zst_mimetypes = mimetypes;
}
}
if let Some(identifier) = identifiers.ffmpeg {
if let Some(extensions) = identifier.extensions {
ffmpeg_extensions = extensions;
}
if let Some(mimetypes) = identifier.mimetypes {
ffmpeg_mimetypes = mimetypes;
}
}
if let Some(identifier) = identifiers.mbox {
if let Some(extensions) = identifier.extensions {
mbox_extensions = extensions;
}
if let Some(mimetypes) = identifier.mimetypes {
mbox_mimetypes = mimetypes;
}
}
if let Some(identifier) = identifiers.sqlite {
if let Some(extensions) = identifier.extensions {
sqlite_extensions = extensions;
}
if let Some(mimetypes) = identifier.mimetypes {
sqlite_mimetypes = mimetypes;
}
}
if let Some(identifier) = identifiers.tar {
if let Some(extensions) = identifier.extensions {
tar_extensions = extensions;
}
if let Some(mimetypes) = identifier.mimetypes {
tar_mimetypes = mimetypes;
}
}
if let Some(identifier) = identifiers.zip {
if let Some(extensions) = identifier.extensions {
zip_extensions = extensions;
}
if let Some(mimetypes) = identifier.mimetypes {
zip_mimetypes = mimetypes;
}
}
}

// order in descending priority
let mut adapters: Vec<Arc<dyn FileAdapter>> = vec![];
if let Some(custom_adapters) = custom_adapters {
Expand All @@ -120,12 +249,36 @@ pub fn get_all_adapters(custom_adapters: Option<Vec<CustomAdapterConfig>>) -> Ad

let internal_adapters: Vec<Arc<dyn FileAdapter>> = vec![
Arc::new(PostprocPageBreaks::default()),
Arc::new(ffmpeg::FFmpegAdapter::new()),
Arc::new(zip::ZipAdapter::new()),
Arc::new(decompress::DecompressAdapter::new()),
Arc::new(mbox::MboxAdapter::new()),
Arc::new(tar::TarAdapter::new()),
Arc::new(sqlite::SqliteAdapter::new()),
Arc::new(ffmpeg::FFmpegAdapter {
extensions: ffmpeg_extensions,
mimetypes: ffmpeg_mimetypes,
}),
Arc::new(zip::ZipAdapter {
extensions: zip_extensions,
mimetypes: zip_mimetypes,
}),
Arc::new(decompress::DecompressAdapter {
extensions_gz: gz_extensions,
extensions_bz2: bz2_extensions,
extensions_xz: xz_extensions,
extensions_zst: zst_extensions,
mimetypes_gz: gz_mimetypes,
mimetypes_bz2: bz2_mimetypes,
mimetypes_xz: xz_mimetypes,
mimetypes_zst: zst_mimetypes,
}),
Arc::new(mbox::MboxAdapter {
extensions: mbox_extensions,
mimetypes: mbox_mimetypes,
}),
Arc::new(sqlite::SqliteAdapter {
extensions: sqlite_extensions,
mimetypes: sqlite_mimetypes,
}),
Arc::new(tar::TarAdapter {
extensions: tar_extensions,
mimetypes: tar_mimetypes,
}),
];
adapters.extend(
BUILTIN_SPAWNING_ADAPTERS
Expand All @@ -148,10 +301,12 @@ pub fn get_all_adapters(custom_adapters: Option<Vec<CustomAdapterConfig>>) -> Ad
* - "+a,b" means use default list but also a and b (a,b will be prepended to the list so given higher priority)
*/
pub fn get_adapters_filtered<T: AsRef<str>>(
custom_identifiers: Option<CustomIdentifiers>,
custom_adapters: Option<Vec<CustomAdapterConfig>>,
adapter_names: &[T],
) -> Result<Vec<Arc<dyn FileAdapter>>> {
let (def_enabled_adapters, def_disabled_adapters) = get_all_adapters(custom_adapters);
let (def_enabled_adapters, def_disabled_adapters) =
get_all_adapters(custom_identifiers, custom_adapters);
let adapters = if !adapter_names.is_empty() {
let adapters_map: HashMap<_, _> = def_enabled_adapters
.iter()
Expand Down
Loading

0 comments on commit 80cbb9d

Please sign in to comment.