Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add OSA algorithm #23

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/release-python.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@ jobs:
manylinux: auto
command: build
# container default is manylinux
args: --release -o dist --manifest-path crates/stringmetrics_py/Cargo.toml
args: --release -o dist --manifest-path stringmetrics-py/Cargo.toml
- name: Build musl wheels
uses: messense/maturin-action@v1
with:
target: x86_64-unknown-linux-musl
manylinux: musllinux_1_1
command: build
args: --release -o dist -i 3.7 3.8 3.9 3.10 --manifest-path crates/stringmetrics_py/Cargo.toml
args: --release -o dist -i 3.7 3.8 3.9 3.10 --manifest-path stringmetrics-py/Cargo.toml
- name: Upload wheels
uses: actions/upload-artifact@v2
with:
Expand All @@ -37,7 +37,7 @@ jobs:
- uses: messense/maturin-action@v1
with:
command: build
args: --release --no-sdist -o dist --manifest-path crates/stringmetrics_py/Cargo.toml
args: --release --no-sdist -o dist --manifest-path stringmetrics-py/Cargo.toml
- name: Upload wheels
uses: actions/upload-artifact@v2
with:
Expand All @@ -51,7 +51,7 @@ jobs:
- uses: messense/maturin-action@v1
with:
command: build
args: --release --no-sdist -o dist --universal2 --manifest-path crates/stringmetrics_py/Cargo.toml
args: --release --no-sdist -o dist --universal2 --manifest-path stringmetrics-py/Cargo.toml
- name: Upload wheels
uses: actions/upload-artifact@v2
with:
Expand Down
9 changes: 7 additions & 2 deletions stringmetrics/src/algorithms.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,21 @@
//! assert_eq!(levenshtein(a, b), 6);
//! ```

mod damerau_impl;
mod hamming_impl;
// mod damerau;
mod jaccard_impl;
mod lev_impl;
mod osa_impl;

pub use self::damerau_impl::DamerauWeights;
pub use self::hamming_impl::{hamming, hamming_iter};
// pub use self::damerau::damerau_levenshtein;
pub use self::jaccard_impl::{jaccard, jaccard_set};
pub use self::lev_impl::{
levenshtein, levenshtein_limit, levenshtein_limit_iter, levenshtein_weight,
levenshtein_weight_iter, try_levenshtein, try_levenshtein_iter, try_levenshtein_weight,
try_levenshtein_weight_iter, LevWeights,
};
pub use self::osa_impl::{
osa_distance, osa_limit, osa_limit_iter, osa_weight, osa_weight_iter, try_osa, try_osa_iter,
try_osa_weight, try_osa_weight_iter,
};
45 changes: 42 additions & 3 deletions stringmetrics/src/algorithms/damerau_impl.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,43 @@
// Using the "optimal string alignment distance" from wikipedia
pub fn damerau_levenshtein(_a: &str, _b: &str) -> u32 {
0
use crate::algorithms::lev_impl::WeightsSwap;
use std::mem;

/// Per-operation costs for edit-distance algorithms that allow transposition
/// in addition to insertion, deletion, and substitution.
///
/// Passed by reference to the weighted OSA functions such as `osa_weight`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DamerauWeights {
    /// Cost of an insertion
    pub insertion: u32,
    /// Cost of a deletion
    pub deletion: u32,
    /// Cost of a substitution
    pub substitution: u32,
    /// Cost of a transposition
    pub transposition: u32,
}

impl DamerauWeights {
    /// Build a `DamerauWeights` from individual operation costs.
    ///
    /// Arguments are, in order, the insertion, deletion, substitution, and
    /// transposition costs. Usable in `const` contexts.
    #[inline]
    pub const fn new(w_ins: u32, w_del: u32, w_sub: u32, w_tspn: u32) -> Self {
        let insertion = w_ins;
        let deletion = w_del;
        let substitution = w_sub;
        let transposition = w_tspn;

        Self {
            insertion,
            deletion,
            substitution,
            transposition,
        }
    }
}

impl WeightsSwap for DamerauWeights {
    /// Exchange the insertion and deletion costs in place. The substitution
    /// and transposition costs are left untouched.
    #[inline]
    fn swap(&mut self) {
        let Self {
            insertion,
            deletion,
            ..
        } = self;
        mem::swap(insertion, deletion);
    }
}

impl Default for DamerauWeights {
    /// Default weights: every operation costs 1.
    fn default() -> Self {
        const UNIT_COST: u32 = 1;

        Self {
            insertion: UNIT_COST,
            deletion: UNIT_COST,
            substitution: UNIT_COST,
            transposition: UNIT_COST,
        }
    }
}
6 changes: 3 additions & 3 deletions stringmetrics/src/algorithms/lev_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ pub use structures::*;
/// better to use [`levenshtein_limit`] to avoid unnecessary computation.
///
/// Behind the scenes, this wraps [`levenshtein_limit_iter`]. For details on
/// operation, see the [algorithms](crate::algorithms) page.
/// operation, see the [module-level documentation](crate).
///
/// # Example
///
Expand All @@ -34,7 +34,7 @@ pub use structures::*;
/// if you need that functionality, please use [`levenshtein_weight`].
#[inline]
pub fn levenshtein(a: &str, b: &str) -> u32 {
levenshtein_limit_iter(a.bytes(), b.bytes(), u32::MAX)
try_levenshtein_iter(a.bytes(), b.bytes(), u32::MAX).unwrap_or(u32::MAX)
}

/// Levenshtein distance computation with a limit
Expand All @@ -56,7 +56,7 @@ pub fn levenshtein(a: &str, b: &str) -> u32 {
/// ```
#[inline]
pub fn levenshtein_limit(a: &str, b: &str, limit: u32) -> u32 {
levenshtein_limit_iter(a.bytes(), b.bytes(), limit)
try_levenshtein_iter(a.bytes(), b.bytes(), limit).unwrap_or(limit)
}

/// The same algorithm as [`levenshtein_limit`] but returns an `Option` to
Expand Down
13 changes: 7 additions & 6 deletions stringmetrics/src/algorithms/lev_impl/implementation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
use super::{LevState, LevWeights};
use std::cmp::min;

/// The same algorithm as [`levenshtein_limit_iter`] but return an `Option` to
/// The same algorithm as [`levenshtein_limit_iter`](crate::levenshtein_limit_iter) but return an `Option` to
/// indicate if the limit is exceeded
///
/// Returns `Some(u32)` if a distance is found, `None` if a limit is hit
Expand All @@ -23,13 +23,14 @@ where
D: DoubleEndedIterator<Item = T> + Clone,
T: PartialEq,
{
// Identical implementation to levenshtein_weight_iter, just avoiding
// Identical implementation to levenshtein_weight_iter, just saving some ops
// from the weight calculations
let state = LevState::new(a.into_iter(), b.into_iter());
let LevState {
a_iter,
b_iter,
a_diff_len: a_len,
b_diff_len: b_len,
a_len,
b_len,
} = state;

// Only check b_len because if a_len is 0, the loop won't happen
Expand Down Expand Up @@ -132,8 +133,8 @@ where
let LevState {
a_iter,
b_iter,
a_diff_len: a_len,
b_diff_len: b_len,
a_len,
b_len,
} = state;
let LevWeights {
insertion: w_ins,
Expand Down
27 changes: 18 additions & 9 deletions stringmetrics/src/algorithms/lev_impl/structures.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@ use crate::iter::find_eq_end_items;
use std::iter::Skip;
use std::mem;

/// Weight sets whose direction-dependent costs can be exchanged in place.
///
/// Implemented by `LevWeights` and `DamerauWeights`, both of which exchange
/// their insertion and deletion costs. Used as the bound on the weights
/// argument of `LevState::new_weights`.
pub trait WeightsSwap {
/// Exchange the direction-dependent costs of `self` in place.
fn swap(&mut self);
}

/// A struct that holds the costs of insertion, deletion, and substitution. Used
/// for Levenshtein algorithms that require weight specifications.
#[derive(Debug, PartialEq, Eq, Clone)]
Expand All @@ -12,6 +17,7 @@ pub struct LevWeights {
}

impl LevWeights {
/// Create a new `LevWeights` object
#[inline]
pub const fn new(w_ins: u32, w_del: u32, w_sub: u32) -> Self {
Self {
Expand All @@ -20,10 +26,11 @@ impl LevWeights {
substitution: w_sub,
}
}

}
impl WeightsSwap for LevWeights {
// Swap insertion and deletion terms
#[inline]
pub fn swap(&mut self) {
fn swap(&mut self) {
mem::swap(&mut self.insertion, &mut self.deletion);
}
}
Expand All @@ -35,12 +42,14 @@ impl Default for LevWeights {
}
}

/// Representation of a string for lev parsing after stripping start & end
#[derive(Debug)]
pub struct LevState<D: DoubleEndedIterator> {
pub a_iter: Skip<D>,
pub b_iter: Skip<D>,
pub a_diff_len: u32,
pub b_diff_len: u32,
/// Lengths after trimming
pub a_len: u32,
pub b_len: u32,
}

impl<D: DoubleEndedIterator<Item = T> + Clone, T: PartialEq> LevState<D> {
Expand All @@ -50,8 +59,8 @@ impl<D: DoubleEndedIterator<Item = T> + Clone, T: PartialEq> LevState<D> {
Self {
a_iter: a_iter.skip(skip),
b_iter: b_iter.skip(skip),
a_diff_len: iter_info.a_diff_len(),
b_diff_len: iter_info.b_diff_len(),
a_len: iter_info.a_diff_len(),
b_len: iter_info.b_diff_len(),
}
}

Expand All @@ -67,7 +76,7 @@ impl<D: DoubleEndedIterator<Item = T> + Clone, T: PartialEq> LevState<D> {

/// Create a new structure and swap weights if needed
#[inline]
pub fn new_weights(a_iter: D, b_iter: D, weights: &mut LevWeights) -> Self {
pub fn new_weights<W: WeightsSwap>(a_iter: D, b_iter: D, weights: &mut W) -> Self {
let mut ret = Self::new_inner(a_iter, b_iter);
if ret.should_swap() {
ret.swap_inner();
Expand All @@ -79,12 +88,12 @@ impl<D: DoubleEndedIterator<Item = T> + Clone, T: PartialEq> LevState<D> {
/// We want the longer string in B so it's in the inner loop
#[inline]
pub const fn should_swap(&self) -> bool {
self.a_diff_len > self.b_diff_len
self.a_len > self.b_len
}

#[inline]
pub fn swap_inner(&mut self) {
mem::swap(&mut self.a_iter, &mut self.b_iter);
mem::swap(&mut self.a_diff_len, &mut self.b_diff_len);
mem::swap(&mut self.a_len, &mut self.b_len);
}
}
5 changes: 3 additions & 2 deletions stringmetrics/src/algorithms/lev_impl/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ fn test_levstate_new() {
let a = "aaxxxxxc";
let b = "aaabbbccc";
let state = LevState::new(a.bytes(), b.bytes());
assert_eq!(state.a_diff_len, 5);
assert_eq!(state.b_diff_len, 6);
assert_eq!(state.a_len, 5);
assert_eq!(state.b_len, 6);
}

#[test]
Expand All @@ -35,6 +35,7 @@ fn test_levenshtein_empty() {

#[test]
fn test_levenshtein_basic() {
assert_eq!(levenshtein("ab", "ba"), 2);
assert_eq!(levenshtein("abcd", "ab"), 2);
assert_eq!(levenshtein("ab", "abcd"), 2);
assert_eq!(levenshtein("abcd", "ad"), 2);
Expand Down
53 changes: 53 additions & 0 deletions stringmetrics/src/algorithms/osa_impl.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
mod implementation;
pub use implementation::*;

use crate::DamerauWeights;

/// Optimal string alignment (OSA) distance between `a` and `b`, compared
/// byte by byte, with no caller-supplied limit.
///
/// Goes through the weighted implementation with default weights (every
/// operation costs 1) and a limit of `u32::MAX`, rather than `try_osa_iter`.
/// If the underlying call yields `None`, `u32::MAX` is returned.
#[inline]
pub fn osa_distance(a: &str, b: &str) -> u32 {
    let unit_weights = DamerauWeights::default();
    let outcome = try_osa_weight_iter(a.bytes(), b.bytes(), u32::MAX, &unit_weights);
    outcome.unwrap_or(u32::MAX)
}

/// OSA distance between `a` and `b`, compared byte by byte, capped by `limit`.
///
/// Returns the value produced by `try_osa_iter`, or `limit` itself when that
/// call yields `None`.
#[inline]
pub fn osa_limit(a: &str, b: &str, limit: u32) -> u32 {
    let (a_bytes, b_bytes) = (a.bytes(), b.bytes());
    let outcome = try_osa_iter(a_bytes, b_bytes, limit);
    outcome.unwrap_or(limit)
}

/// Iterator-based counterpart of `osa_limit`.
///
/// Accepts any two inputs of the same type that convert into a cloneable,
/// double-ended iterator over comparable items. Returns the value produced by
/// `try_osa_iter`, or `limit` itself when that call yields `None`.
#[inline]
pub fn osa_limit_iter<I, T, D>(a: I, b: I, limit: u32) -> u32
where
    I: IntoIterator<IntoIter = D>,
    D: DoubleEndedIterator<Item = T> + Clone,
    T: PartialEq + Clone,
{
    let outcome = try_osa_iter(a, b, limit);
    outcome.unwrap_or(limit)
}

/// OSA distance between `a` and `b`, compared byte by byte, as an `Option`.
///
/// Forwards the byte iterators of both strings and `limit` to `try_osa_iter`
/// and returns its result unchanged.
///
/// NOTE(review): by analogy with the Levenshtein `try_` functions, `None`
/// presumably signals that `limit` was exceeded. Confirm against
/// `try_osa_iter`.
#[inline]
pub fn try_osa(a: &str, b: &str, limit: u32) -> Option<u32> {
    let (a_bytes, b_bytes) = (a.bytes(), b.bytes());
    try_osa_iter(a_bytes, b_bytes, limit)
}

/// Weighted OSA distance between `a` and `b`, compared byte by byte, capped
/// by `limit`.
///
/// Returns the value produced by `try_osa_weight_iter` for the given
/// `weights`, or `limit` itself when that call yields `None`.
#[inline]
pub fn osa_weight(a: &str, b: &str, limit: u32, weights: &DamerauWeights) -> u32 {
    let (a_bytes, b_bytes) = (a.bytes(), b.bytes());
    let outcome = try_osa_weight_iter(a_bytes, b_bytes, limit, weights);
    outcome.unwrap_or(limit)
}
/// Weighted OSA distance between `a` and `b`, compared byte by byte, as an
/// `Option`.
///
/// Forwards the byte iterators of both strings, `limit`, and `weights` to
/// `try_osa_weight_iter` and returns its result unchanged.
///
/// NOTE(review): by analogy with the Levenshtein `try_` functions, `None`
/// presumably signals that `limit` was exceeded. Confirm against
/// `try_osa_weight_iter`.
#[inline]
pub fn try_osa_weight(a: &str, b: &str, limit: u32, weights: &DamerauWeights) -> Option<u32> {
    let (a_bytes, b_bytes) = (a.bytes(), b.bytes());
    try_osa_weight_iter(a_bytes, b_bytes, limit, weights)
}

/// Iterator-based counterpart of `osa_weight`.
///
/// Accepts any two inputs of the same type that convert into a cloneable,
/// double-ended iterator over comparable items. Returns the value produced by
/// `try_osa_weight_iter` for the given `weights`, or `limit` itself when that
/// call yields `None`.
#[inline]
pub fn osa_weight_iter<I, T, D>(a: I, b: I, limit: u32, weights: &DamerauWeights) -> u32
where
    I: IntoIterator<IntoIter = D>,
    D: DoubleEndedIterator<Item = T> + Clone,
    T: PartialEq + Clone,
{
    let outcome = try_osa_weight_iter(a, b, limit, weights);
    outcome.unwrap_or(limit)
}

#[cfg(test)]
mod tests;
Loading