Skip to content

Commit

Permalink
boulder/draft: Add support for license matching when running boulder new
Browse files Browse the repository at this point in the history
  • Loading branch information
joebonrichie committed Feb 2, 2025
1 parent cb1207e commit 98671a3
Show file tree
Hide file tree
Showing 5 changed files with 263 additions and 3 deletions.
51 changes: 51 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ strum = { version = "0.26.3", features = ["derive"] }
thiserror = "2.0.3"
thread-priority = "1.1.0"
tokio = { version = "1.38.0", features = ["full"] }
tokio-stream = { version = "0.1.15", features = ["time"] }
tokio-util = { version = "0.7.11", features = ["io"] }
url = { version = "2.5.2", features = ["serde"] }
xxhash-rust = { version = "0.8.11", features = ["xxh3"] }
Expand Down
4 changes: 4 additions & 0 deletions boulder/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ thread-priority.workspace = true
tokio.workspace = true
url.workspace = true
mailparse.workspace = true
rayon.workspace = true
rapidfuzz = "0.5.0"
memmap2 = "0.9.5"
jwalk = "0.8.1"

[lints]
workspace = true
42 changes: 40 additions & 2 deletions boulder/src/draft.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use std::{io, path::PathBuf};

use fs_err as fs;
use itertools::Itertools;
use licenses::match_licences;
use moss::Dependency;
use thiserror::Error;
use tui::Styled;
Expand All @@ -19,6 +20,7 @@ use self::monitoring::Monitoring;
use self::upstream::Upstream;

mod build;
mod licenses;
mod metadata;
mod monitoring;
mod upstream;
Expand Down Expand Up @@ -62,6 +64,14 @@ impl Drafter {
// Analyze files to determine build system / collect deps
let build = build::analyze(&files).map_err(Error::AnalyzeBuildSystem)?;

let licenses = format_licenses(
match_licences(
&extract_root,
&PathBuf::from("/home/ninya/boulder-d-legacy/license-list-data/text"),
)
.unwrap_or_default(),
);

// Remove temp extract dir
fs::remove_dir_all(extract_root)?;

Expand Down Expand Up @@ -97,8 +107,8 @@ upstreams :
summary : UPDATE SUMMARY
description : |
UPDATE DESCRIPTION
license : UPDATE LICENSE
{options}{builddeps}{environment}{phases}
license :
{licenses}{options}{builddeps}{environment}{phases}
",
metadata.source.name,
metadata.source.version,
Expand All @@ -123,6 +133,30 @@ fn builddeps(deps: impl IntoIterator<Item = Dependency>) -> String {
}
}

/// Renders detected license identifiers as list lines for the generated recipe.
///
/// Each identifier becomes one ` - <id>` line. Lines are sorted so that every
/// entry containing `-only` comes after all other entries; within each of the
/// two groups the lines are ordered lexicographically.
///
/// Returns an empty string when `licenses` is empty, otherwise the joined
/// lines followed by a trailing newline.
fn format_licenses(licenses: Vec<String>) -> String {
    let mut lines: Vec<String> = licenses
        .into_iter()
        .map(|license| format!(" - {license}"))
        .collect();

    if lines.is_empty() {
        return String::new();
    }

    // HACK: Ensure -or-later for GNU licenses comes before -only
    // to match 90% of cases. We need to read the standard license
    // header to figure out the actual variant.
    //
    // The comparator must be a total order: compare the "-only" flag first
    // (false < true) and fall back to the text, so two "-only" entries are
    // ordered consistently no matter which side they appear on.
    lines.sort_by(|a, b| {
        a.contains("-only")
            .cmp(&b.contains("-only"))
            .then_with(|| a.cmp(b))
    });

    let mut formatted = lines.join("\n");
    formatted.push('\n');
    formatted
}

pub struct File<'a> {
pub path: PathBuf,
pub extract_root: &'a Path,
Expand Down Expand Up @@ -150,8 +184,12 @@ pub enum Error {
Upstream(#[from] upstream::Error),
#[error("monitoring")]
Monitoring(#[from] monitoring::Error),
#[error("licensing")]
Licenses(#[from] licenses::Error),
#[error("io")]
Io(#[from] io::Error),
#[error("walkdir")]
WalkDir(#[from] jwalk::Error),
}

#[cfg(test)]
Expand Down
168 changes: 168 additions & 0 deletions boulder/src/draft/licenses.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
// SPDX-FileCopyrightText: Copyright © 2020-2025 Serpent OS Developers
//
// SPDX-License-Identifier: MPL-2.0

use jwalk::WalkDir;
use rapidfuzz::distance::levenshtein;
use rayon::prelude::*;
use std::collections::HashSet;
use std::path::Path;
use std::path::PathBuf;
use tui::Styled;

/// Boxed, type-erased error used as the error type of the fallible
/// functions in this module.
pub type Error = Box<dyn std::error::Error>;

/// Walks `dir` and gathers the SPDX license files to compare against.
///
/// Entries that cannot be read, entries that are not regular files, and files
/// whose name contains `deprecated_` are skipped so that we never match
/// against deprecated licenses.
///
/// Returns `(file_names, full_paths)`: the bare file names and the full paths
/// of every retained file.
fn collect_spdx_licenses(dir: &Path) -> (HashSet<PathBuf>, HashSet<PathBuf>) {
    let mut file_names = HashSet::new();
    let mut full_paths = HashSet::new();

    for entry in WalkDir::new(dir).into_iter().filter_map(Result::ok) {
        if !entry.file_type().is_file() {
            continue;
        }

        // Names that are not valid UTF-8 are treated as an empty string and
        // therefore never count as deprecated.
        let is_deprecated = entry
            .file_name()
            .to_str()
            .unwrap_or_default()
            .contains("deprecated_");
        if is_deprecated {
            continue;
        }

        file_names.insert(PathBuf::from(entry.file_name()));
        full_paths.insert(entry.path());
    }

    (file_names, full_paths)
}

/// Scans `dir` (at most three levels deep) for files that look like licenses.
///
/// A file is considered a potential license when its lowercased name either
/// contains `copying` or `license`, or starts with the lowercased prefix of an
/// SPDX file name from `spdx_list` (the part before the first `-`, e.g.
/// `GPL-2.0-or-later` -> `gpl`). SPDX names without a `-` contribute no prefix.
///
/// Returns `(licenses, file_names)`: the full paths of the potential license
/// files, and the bare file names of every regular file encountered.
///
/// # Errors
///
/// Returns an error as soon as the directory walk yields a failed entry.
fn collect_dir_licenses(
    dir: &Path,
    spdx_list: &HashSet<PathBuf>,
) -> Result<(HashSet<PathBuf>, HashSet<PathBuf>), Error> {
    let patterns = ["copying", "license"];

    // The SPDX prefixes do not depend on the file being inspected, so compute
    // (and de-duplicate) them once instead of once per directory entry.
    let spdx_prefixes: HashSet<String> = spdx_list
        .iter()
        .filter_map(|license| {
            license
                .to_string_lossy()
                .split_once('-')
                .map(|(key, _)| key.to_lowercase())
        })
        .collect();

    // Match potential license files
    let mut licenses = HashSet::new();
    let mut hash_direntries = HashSet::new();

    for entry in WalkDir::new(dir).max_depth(3) {
        let entry = entry?;
        if !entry.file_type().is_file() {
            continue;
        }

        hash_direntries.insert(PathBuf::from(entry.file_name()));

        let file_name = entry.file_name().to_string_lossy().to_lowercase();
        let matches_pattern = patterns.iter().any(|&pattern| file_name.contains(pattern));
        let matches_spdx_prefix = spdx_prefixes
            .iter()
            .any(|prefix| file_name.starts_with(prefix.as_str()));

        if matches_pattern || matches_spdx_prefix {
            licenses.insert(entry.path());
        }
    }

    Ok((licenses, hash_direntries))
}

pub fn match_licences(dir: &Path, spdx_dir: &Path) -> Result<Vec<String>, Error> {
let (spdx_pure, spdx_paths) = collect_spdx_licenses(spdx_dir);
let (licenses, dir_entries) = collect_dir_licenses(dir, &spdx_pure)?;

let reuse_matches: Vec<_> = dir_entries
.intersection(&spdx_pure)
.map(|m| m.with_extension("").to_str().unwrap_or_default().to_owned())
.collect();

if !reuse_matches.is_empty() {
return Ok(reuse_matches);
}

if licenses.is_empty() {
println!("{} | Failed to find any licenses", "Warning".yellow());
return Ok(vec![]);
}

let confidence = 0.9;

let matches: Vec<_> = licenses
.par_iter()
.filter_map(|license| {
let license_content = std::fs::read_to_string(license).ok();
if license_content != None {

Check warning on line 105 in boulder/src/draft/licenses.rs

View workflow job for this annotation

GitHub Actions / clippy

[clippy] boulder/src/draft/licenses.rs#L105

warning: binary comparison to literal `Option::None` --> boulder/src/draft/licenses.rs:105:16 | 105 | if license_content != None { | ^^^^^^^^^^^^^^^^^^^^^^^ help: use `Option::is_some()` instead: `license_content.is_some()` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#partialeq_to_none = note: `#[warn(clippy::partialeq_to_none)]` on by default
Raw output
boulder/src/draft/licenses.rs:105:16:w:warning: binary comparison to literal `Option::None`
   --> boulder/src/draft/licenses.rs:105:16
    |
105 |             if license_content != None {
    |                ^^^^^^^^^^^^^^^^^^^^^^^ help: use `Option::is_some()` instead: `license_content.is_some()`
    |
    = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#partialeq_to_none
    = note: `#[warn(clippy::partialeq_to_none)]` on by default


__END__
Some(license_content)
} else {
println!("{} | Failed to parse {}", "Warning".yellow(), license.display());
None
}
})
.flat_map(|content| {
let sanitized = content
.unwrap_or_default()
.split_whitespace()
.collect::<Vec<_>>()
.join(" ");
let scorer = levenshtein::BatchComparator::new(sanitized.chars());
spdx_paths.par_iter().filter_map(move |spdx_license| {
// For GNU derivate licenses SPDX includes a copy of the general GNU license below the
// derivate license whereas downstream tarballs will typically only contain the derivate license.
// This ruins the algorithms, just truncate to the .len() plus an additional 5% (to account for subtle
// license variants) of the file we're comparing against to get around it.
// NOTE: Although only reading up to n lines/chars would be quicker it has difficulty differentiating
// between subtle differences e.g. Apache-2.0 vs Pixar or GFDL-1.2-* vs GFDL-1.3-*.
// TODO: How to match against multiple licences in one file? hybrid sliding window approach approach?
let truncated_canonical: String = std::fs::read_to_string(spdx_license)
.ok()?
.split_whitespace()
.collect::<Vec<_>>()
.join(" ")
.chars()
.take((sanitized.chars().count() as f64 * 1.05) as usize)
.collect();
let lev_sim = scorer.normalized_similarity_with_args(
truncated_canonical.chars(),
&levenshtein::Args::default().score_cutoff(confidence),
)?;
if lev_sim >= confidence {
println!(
"{} | Matched against {:?} (confidence {:.2}%)",
"License".green(),
spdx_license.with_extension("").file_name().unwrap_or_default(),
lev_sim * 100.0
);
Some(
spdx_license
.with_extension("")
.file_name()
.unwrap_or_default()
.to_str()
.unwrap_or_default()
.to_owned(),
)
} else {
None
}
})
})
.collect();

if matches.is_empty() {
println!("{} | Failed to match against any licenses", "Warning".yellow());
return Ok(vec![]);
}

Ok(matches)
}

0 comments on commit 98671a3

Please sign in to comment.