From 350d7ff15b964f22610132b65deda9c0b4ec2c87 Mon Sep 17 00:00:00 2001 From: Raivis Dejus Date: Sat, 21 Oct 2023 10:11:31 +0300 Subject: [PATCH] Adding regex replacement feature --- README.md | 72 ++++++++++++++++++++++++++++++++----------------- src/replacer.rs | 29 ++++++++++++++++++++ src/rules.rs | 3 +++ 3 files changed, 80 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index c2a1367..6ccc7fc 100644 --- a/README.md +++ b/README.md @@ -155,30 +155,31 @@ cargo run --release -- -l en -d ../texts/ extract-file >> file.en.txt The following rules can be configured per language. Add a `.toml` file in the `rules` directory to enable a new locale. Note that the `replacements` get applied before any other rules are checked. -| Name | Description | Values | Default | -|--------|-----------------------|---------|---------| -| abbreviation_patterns | Regex defining abbreviations | Rust Regex Array | all abbreviations allowed -| allowed_symbols_regex | Regex of allowed symbols or letters. Each character gets matched against this pattern. | String Array | not used -| broken_whitespace | Array of broken whitespaces. This could for example disallow two spaces following each other | String Array | all types of whitespaces allowed -| disallowed_symbols | Use `allowed_symbols_regex` instead. Array of disallowed symbols or letters. Only used when allowed_symbols_regex is not set or is an empty String. | String Array | all symbols allowed -| disallowed_words | Array of disallowed words. Prefer the blocklist approach when possible. | String Array | all words allowed -| even_symbols | Symbols that always need an even count | Char Array | [] -| matching_symbols | Symbols that map to another | Array of matching configurations: each configuration is an Array of two values: `["match", "match"]`. See example below. 
| [] -| max_word_count | Maximum number of words in a sentence | integer | 14 -| may_end_with_colon | If a sentence can end with a : or not | boolean | false -| min_characters | Minimum of character occurrences | integer | 0 -| max_characters | Maximum of character occurrences | integer | MAX -| min_trimmed_length | Minimum length of string after trimming | integer | 3 -| min_word_count | Minimum number of words in a sentence | integer | 1 -| needs_letter_start | If a sentence needs to start with a letter | boolean | true -| needs_punctuation_end | If a sentence needs to end with a punctuation | boolean | false -| needs_uppercase_start | If a sentence needs to start with an uppercase | boolean | false -| other_patterns | Regex to disallow anything else | Rust Regex Array | all other patterns allowed -| quote_start_with_letter | If a quote needs to start with a letter | boolean | true -| remove_brackets_list | Removes (possibly nested) user defined brackets and content inside them `(anything [else])` from the sentence before replacements and checking other rules | Array of matching brackets: each configuration is an Array of two values: `["opening_bracket", "closing_bracket"]`. See example below. | [] -| replacements | Replaces abbreviations or other words according to configuration. This happens before any other rules are checked. | Array of replacement configurations: each configuration is an Array of two values: `["search", "replacement"]`. See example below. | nothing gets replaced -| segmenter | Segmenter to use for this language. See below for more information. | "python" | using `rust-punkt` by default -| stem_separator_regex | If given, splits words at the given characters to reach the stem words to check them again against the blacklist, e.g. prevents "Rust's" to pass if "Rust" is in the blacklist. | Simple regex of separators, e.g. 
for apostrophe `stem_separator_regex = "[']"` | "" +| Name | Description | Values | Default | +|-------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------|---------| +| abbreviation_patterns | Regex defining abbreviations | Rust Regex Array | all abbreviations allowed +| allowed_symbols_regex | Regex of allowed symbols or letters. Each character gets matched against this pattern. | String Array | not used +| broken_whitespace | Array of broken whitespaces. This could for example disallow two spaces following each other | String Array | all types of whitespaces allowed +| disallowed_symbols | Use `allowed_symbols_regex` instead. Array of disallowed symbols or letters. Only used when allowed_symbols_regex is not set or is an empty String. | String Array | all symbols allowed +| disallowed_words | Array of disallowed words. Prefer the blocklist approach when possible. | String Array | all words allowed +| even_symbols | Symbols that always need an even count | Char Array | [] +| matching_symbols | Symbols that map to another | Array of matching configurations: each configuration is an Array of two values: `["match", "match"]`. See example below. 
| [] +| max_word_count | Maximum number of words in a sentence | integer | 14 +| may_end_with_colon | If a sentence can end with a : or not | boolean | false +| min_characters | Minimum of character occurrences | integer | 0 +| max_characters | Maximum of character occurrences | integer | MAX +| min_trimmed_length | Minimum length of string after trimming | integer | 3 +| min_word_count | Minimum number of words in a sentence | integer | 1 +| needs_letter_start | If a sentence needs to start with a letter | boolean | true +| needs_punctuation_end | If a sentence needs to end with a punctuation | boolean | false +| needs_uppercase_start | If a sentence needs to start with an uppercase | boolean | false +| other_patterns | Regex to disallow anything else | Rust Regex Array | all other patterns allowed +| quote_start_with_letter | If a quote needs to start with a letter | boolean | true +| remove_brackets_list | Removes (possibly nested) user defined brackets and content inside them `(anything [else])` from the sentence before replacements and checking other rules | Array of matching brackets: each configuration is an Array of two values: `["opening_bracket", "closing_bracket"]`. See example below. | [] +| replacements | Replaces abbreviations or other words according to configuration. This happens before any other rules are checked. | Array of replacement configurations: each configuration is an Array of two values: `["search", "replacement"]`. See example below. | nothing gets replaced +| regex_replacement_list | Finds regex and makes replacements within found patterns. This happens before any other rules are checked. | Array of configurations: each configuration is an Array of three values: `["regex", "search", "replacement"]`. See example below. | nothing gets replaced +| segmenter | Segmenter to use for this language. See below for more information. 
| "python" | using `rust-punkt` by default +| stem_separator_regex | If given, splits words at the given characters to reach the stem words to check them again against the blacklist, e.g. prevents "Rust's" to pass if "Rust" is in the blacklist. | Simple regex of separators, e.g. for apostrophe `stem_separator_regex = "[']"` | "" ### Example for `matching_symbols` @@ -239,6 +240,29 @@ Input: I am foo test a test Output: I am hi a hi ``` +### Example for `regex_replacement_list` + +``` +regex_replacement_list = [ + # Split glued sentences + ["\\ [a-z]{3,}\\.[A-Z][a-z]{2,}\\ ", ".", ". "], + + # Split long sentences + ["\\b(?:\\S+\\s+){15,}\\S+[.!?]", ", but ", ". But "], +] +``` + +This will find words that glue two sentences and will add a space to un-glue them. +It will also split a long sentence into two shorter ones. + +``` +Input: A sentence.Glued to another. +Output: A sentence. Glued to another. + +Input: A first part of a long sentence that would be rejected, but in fact it could be used. +Output: A first part of a long sentence that would be rejected. But in fact it could be used. +``` + ## Using disallowed words In order to increase the quality of the final output, you might want to consider filtering out some words that are complex, too long or non-native. 
diff --git a/src/replacer.rs b/src/replacer.rs index 6a2c833..b37092d 100644 --- a/src/replacer.rs +++ b/src/replacer.rs @@ -28,6 +28,19 @@ pub fn replace_strings(rules: &Rules, raw: &str) -> String { } } + // regexp replacements + for regex_replacement in rules.regex_replacement_list.iter() { + if Value::as_array(regex_replacement).unwrap().len() == 3 { + let regex = Regex::new(&regex_replacement[0].as_str().unwrap()).unwrap(); + let search = regex_replacement[1].as_str().unwrap(); + let replacement = regex_replacement[2].as_str().unwrap(); + + result = regex.replace_all(&result, |caps: &regex::Captures| { + caps[0].replace(search, replacement) + }).to_string(); + } + } + result } @@ -168,4 +181,20 @@ mod test { assert_eq!(replace_strings(&rules, &String::from("Four: (content (and nested one)) should be removed.")), "Four: should be removed."); assert_eq!(replace_strings(&rules, &String::from("Five: (one) (two) and [three] 'and' should stay.")), "Five: and 'and' should stay."); } + + #[test] + fn test_regex_replacement() { + let rules = Rules { + regex_replacement_list: vec![ + Value::try_from([ + Value::try_from("\\ [a-z]{3,}\\.[A-Z][a-z]{2,}\\ ").unwrap(), + Value::try_from(".").unwrap(), + Value::try_from(". ").unwrap() + ]).unwrap(), + ], + ..Default::default() + }; + + assert_eq!(replace_strings(&rules, &String::from("A sentence.Glued to another.")), "A sentence. 
Glued to another."); + } } \ No newline at end of file diff --git a/src/rules.rs b/src/rules.rs index 8309ccd..fb23492 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -57,6 +57,7 @@ pub struct Rules { pub other_patterns: Array, pub stem_separator_regex: String, pub replacements: Array, + pub regex_replacement_list: Array, pub even_symbols: Array, pub matching_symbols: Array, } @@ -84,6 +85,7 @@ impl Default for Rules { other_patterns: vec![], stem_separator_regex: String::from(""), replacements: vec![], + regex_replacement_list: vec![], even_symbols: vec![], matching_symbols: vec![], } @@ -121,6 +123,7 @@ mod test { assert_eq!(rules.other_patterns, vec![]); assert_eq!(rules.stem_separator_regex, String::from("")); assert_eq!(rules.replacements, vec![]); + assert_eq!(rules.regex_replacement_list, vec![]); assert_eq!(rules.even_symbols, vec![]); assert_eq!(rules.matching_symbols, vec![]); }