From fdaeb7a3e0f31d78fef5b01eb890f208c8d6f101 Mon Sep 17 00:00:00 2001 From: sunby Date: Mon, 20 Jan 2025 17:27:42 +0800 Subject: [PATCH 1/3] extend regex query to support json field Signed-off-by: sunby --- src/query/regex_query.rs | 79 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 76 insertions(+), 3 deletions(-) diff --git a/src/query/regex_query.rs b/src/query/regex_query.rs index cc5701744a..bb3c44df75 100644 --- a/src/query/regex_query.rs +++ b/src/query/regex_query.rs @@ -6,6 +6,7 @@ use tantivy_fst::Regex; use crate::error::TantivyError; use crate::query::{AutomatonWeight, EnableScoring, Query, Weight}; use crate::schema::Field; +use crate::Term; /// A Regex Query matches all of the documents /// containing a specific term that matches @@ -57,6 +58,7 @@ use crate::schema::Field; pub struct RegexQuery { regex: Arc, field: Field, + json_path: Option, } impl RegexQuery { @@ -72,11 +74,40 @@ impl RegexQuery { RegexQuery { regex: regex.into(), field, + json_path: None, } } + /// Creates a new RegexQuery from a given pattern with a json path + pub fn from_pattern_with_json_path( + regex_pattern: &str, + field: Field, + json_path: &str, + ) -> crate::Result { + let mut term = Term::from_field_json_path(field, json_path, false); + term.append_type_and_str(regex_pattern); + let regex_text = std::str::from_utf8(term.serialized_value_bytes()).map_err(|err| { + TantivyError::InvalidArgument(format!( + "Failed to convert json term value bytes to utf8 string: {err}" + )) + })?; + let regex = Regex::new(regex_text).unwrap(); + Ok(RegexQuery { + regex: regex.into(), + field, + json_path: Some(json_path.to_string()), + }) + } + fn specialized_weight(&self) -> AutomatonWeight { - AutomatonWeight::new(self.field, self.regex.clone()) + match &self.json_path { + Some(json_path) => AutomatonWeight::new_for_json_path( + self.field, + self.regex.clone(), + json_path.as_bytes(), + ), + None => AutomatonWeight::new(self.field, self.regex.clone()), + } } } @@ -94,8 +125,8 @@ mod test { use super::RegexQuery; use crate::collector::TopDocs; - use crate::schema::{Field, Schema, TEXT}; - use crate::{assert_nearly_equals, Index, IndexReader, IndexWriter}; + use crate::schema::{Field, Schema, STORED, TEXT}; + use crate::{assert_nearly_equals, Index, IndexReader, IndexWriter, TantivyDocument}; fn build_test_index() -> crate::Result<(IndexReader, Field)> { let mut schema_builder = Schema::builder(); @@ -188,4 +219,46 @@ mod test { res => panic!("unexpected result: {res:?}"), } } + + #[test] + pub fn test_regex_query_with_json_path() -> crate::Result<()> { + std::env::set_var("RUST_BACKTRACE", "1"); + let mut schema_builder = Schema::builder(); + let attributes_field = schema_builder.add_json_field("attributes", TEXT | STORED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + { + let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); + + let doc = TantivyDocument::parse_json( + &schema, + r#"{ + "attributes": { + "country": "japan" + } + }"#, + )?; + + index_writer.add_document(doc)?; + let doc = TantivyDocument::parse_json( + &schema, + r#"{ + "attributes": { + "country": "korea" + } + }"#, + )?; + + index_writer.add_document(doc)?; + index_writer.commit()?; + } + let reader = index.reader()?; + + let matching_one = + RegexQuery::from_pattern_with_json_path("jap[ao]n", attributes_field, "country")?; + let matching_zero = + RegexQuery::from_pattern_with_json_path("jap[A-Z]n", attributes_field, "country")?; + verify_regex_query(matching_one, matching_zero, reader); + Ok(()) + } } From b2a8370e04c0c852fcbe5f52c6380fca4d3dfbb2 Mon Sep 17 00:00:00 2001 From: sunby Date: Tue, 21 Jan 2025 10:35:05 +0800 Subject: [PATCH 2/3] add comments Signed-off-by: sunby --- src/query/regex_query.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/query/regex_query.rs b/src/query/regex_query.rs index bb3c44df75..7f052dcd64 100644 --- a/src/query/regex_query.rs +++ b/src/query/regex_query.rs @@ -84,6 +84,7 @@ impl RegexQuery { field: Field, json_path: &str, ) -> crate::Result { + // tantivy-fst does not support ^ and $ in regex pattern so it is valid to append regex pattern to the end of the json path let mut term = Term::from_field_json_path(field, json_path, false); term.append_type_and_str(regex_pattern); let regex_text = std::str::from_utf8(term.serialized_value_bytes()).map_err(|err| { From 386e4ac3e717d0cb9943388b6e9d17903e29d41e Mon Sep 17 00:00:00 2001 From: sunby Date: Thu, 23 Jan 2025 15:18:01 +0800 Subject: [PATCH 3/3] fix a bug Signed-off-by: sunby --- src/query/regex_query.rs | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/src/query/regex_query.rs b/src/query/regex_query.rs index 7f052dcd64..345ba44ad9 100644 --- a/src/query/regex_query.rs +++ b/src/query/regex_query.rs @@ -58,7 +58,7 @@ use crate::Term; pub struct RegexQuery { regex: Arc, field: Field, - json_path: Option, + json_path_bytes: Option>, } impl RegexQuery { @@ -74,7 +74,7 @@ impl RegexQuery { RegexQuery { regex: regex.into(), field, - json_path: None, + json_path_bytes: None, } } @@ -93,19 +93,26 @@ impl RegexQuery { )) })?; let regex = Regex::new(regex_text).unwrap(); - Ok(RegexQuery { - regex: regex.into(), - field, - json_path: Some(json_path.to_string()), - }) + + if let Some((json_path_bytes, _)) = term.value().as_json() { + Ok(RegexQuery { + regex: regex.into(), + field, + json_path_bytes: Some(json_path_bytes.to_vec()), + }) + } else { + Err(TantivyError::InvalidArgument(format!( + "The regex query requires a json path for a json term." + ))) + } } fn specialized_weight(&self) -> AutomatonWeight { - match &self.json_path { - Some(json_path) => AutomatonWeight::new_for_json_path( + match &self.json_path_bytes { + Some(json_path_bytes) => AutomatonWeight::new_for_json_path( self.field, self.regex.clone(), - json_path.as_bytes(), + json_path_bytes.as_slice(), ), None => AutomatonWeight::new(self.field, self.regex.clone()), } @@ -235,7 +242,7 @@ mod test { &schema, r#"{ "attributes": { - "country": "japan" + "country": {"name": "japan"} } }"#, )?; @@ -245,7 +252,7 @@ mod test { &schema, r#"{ "attributes": { - "country": "korea" + "country": {"name": "korea"} } }"#, )?; @@ -256,7 +263,7 @@ mod test { let reader = index.reader()?; let matching_one = - RegexQuery::from_pattern_with_json_path("jap[ao]n", attributes_field, "country")?; + RegexQuery::from_pattern_with_json_path("j.*", attributes_field, "country.name")?; let matching_zero = RegexQuery::from_pattern_with_json_path("jap[A-Z]n", attributes_field, "country")?; verify_regex_query(matching_one, matching_zero, reader);