From 0c1ef66c0493a392d65eece8ae97efae745267bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Tue, 1 Oct 2024 16:04:09 +0200 Subject: [PATCH] Add bwc layer for 'romanian' analyzer The 'romanian' language analyzer has been improved in Lucene 10 in two important ways. First, the snowball stemmer has been modified to work with s-comma and t-comma characters instead of only with the cedilla forms used when Romanian didn't have full Unicode support (https://github.com/snowballstem/snowball/pull/177). Second, the analyzer now contains a normalization step to map cedilla forms to forms with comma. In order to maintain backwards compatibility with existing indices, this change moves the Lucene 9 stemmer over to the analysis module as a deprecated variant and creates the analyzer for existing indices with the "old" stemmer and without the normalization step. New indices automatically run with the improved behaviour. --- .../common/LegacyRomanianStemmer.java | 744 ++++++++++++++++++ .../common/RomanianAnalyzerProvider.java | 46 +- .../common/RomanianAnalyzerTests.java | 80 ++ .../upgrades/FullClusterRestartIT.java | 106 +++ 4 files changed, 969 insertions(+), 7 deletions(-) create mode 100644 modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/LegacyRomanianStemmer.java create mode 100644 modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/RomanianAnalyzerTests.java diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/LegacyRomanianStemmer.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/LegacyRomanianStemmer.java new file mode 100644 index 0000000000000..944fc1dacd880 --- /dev/null +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/LegacyRomanianStemmer.java @@ -0,0 +1,744 @@ +/* + * @notice + * Generated by Snowball 2.0.0 - https://snowballstem.org/ + * + * Modifications copyright (C) 2024 Elasticsearch B.V. 
+ */ + +package org.elasticsearch.analysis.common; + +import org.tartarus.snowball.Among; + +/** +* This class implements the stemming algorithm defined by a snowball script. +* NOTE: This is the RomanianStemmer used in Lucene 9 +* +* @deprecated this class exists for backwards compatibility of indices created with Lucene 9 +*/ +@Deprecated +@SuppressWarnings("checkstyle:DescendantToken") +public class LegacyRomanianStemmer extends org.tartarus.snowball.SnowballStemmer { + + private static final java.lang.invoke.MethodHandles.Lookup methodObject = java.lang.invoke.MethodHandles.lookup(); + + private static final Among a_0[] = { new Among("", -1, 3), new Among("I", 0, 1), new Among("U", 0, 2) }; + + private static final Among a_1[] = { + new Among("ea", -1, 3), + new Among("a\u0163ia", -1, 7), + new Among("aua", -1, 2), + new Among("iua", -1, 4), + new Among("a\u0163ie", -1, 7), + new Among("ele", -1, 3), + new Among("ile", -1, 5), + new Among("iile", 6, 4), + new Among("iei", -1, 4), + new Among("atei", -1, 6), + new Among("ii", -1, 4), + new Among("ului", -1, 1), + new Among("ul", -1, 1), + new Among("elor", -1, 3), + new Among("ilor", -1, 4), + new Among("iilor", 14, 4) }; + + private static final Among a_2[] = { + new Among("icala", -1, 4), + new Among("iciva", -1, 4), + new Among("ativa", -1, 5), + new Among("itiva", -1, 6), + new Among("icale", -1, 4), + new Among("a\u0163iune", -1, 5), + new Among("i\u0163iune", -1, 6), + new Among("atoare", -1, 5), + new Among("itoare", -1, 6), + new Among("\u0103toare", -1, 5), + new Among("icitate", -1, 4), + new Among("abilitate", -1, 1), + new Among("ibilitate", -1, 2), + new Among("ivitate", -1, 3), + new Among("icive", -1, 4), + new Among("ative", -1, 5), + new Among("itive", -1, 6), + new Among("icali", -1, 4), + new Among("atori", -1, 5), + new Among("icatori", 18, 4), + new Among("itori", -1, 6), + new Among("\u0103tori", -1, 5), + new Among("icitati", -1, 4), + new Among("abilitati", -1, 1), + new Among("ivitati", 
-1, 3), + new Among("icivi", -1, 4), + new Among("ativi", -1, 5), + new Among("itivi", -1, 6), + new Among("icit\u0103i", -1, 4), + new Among("abilit\u0103i", -1, 1), + new Among("ivit\u0103i", -1, 3), + new Among("icit\u0103\u0163i", -1, 4), + new Among("abilit\u0103\u0163i", -1, 1), + new Among("ivit\u0103\u0163i", -1, 3), + new Among("ical", -1, 4), + new Among("ator", -1, 5), + new Among("icator", 35, 4), + new Among("itor", -1, 6), + new Among("\u0103tor", -1, 5), + new Among("iciv", -1, 4), + new Among("ativ", -1, 5), + new Among("itiv", -1, 6), + new Among("ical\u0103", -1, 4), + new Among("iciv\u0103", -1, 4), + new Among("ativ\u0103", -1, 5), + new Among("itiv\u0103", -1, 6) }; + + private static final Among a_3[] = { + new Among("ica", -1, 1), + new Among("abila", -1, 1), + new Among("ibila", -1, 1), + new Among("oasa", -1, 1), + new Among("ata", -1, 1), + new Among("ita", -1, 1), + new Among("anta", -1, 1), + new Among("ista", -1, 3), + new Among("uta", -1, 1), + new Among("iva", -1, 1), + new Among("ic", -1, 1), + new Among("ice", -1, 1), + new Among("abile", -1, 1), + new Among("ibile", -1, 1), + new Among("isme", -1, 3), + new Among("iune", -1, 2), + new Among("oase", -1, 1), + new Among("ate", -1, 1), + new Among("itate", 17, 1), + new Among("ite", -1, 1), + new Among("ante", -1, 1), + new Among("iste", -1, 3), + new Among("ute", -1, 1), + new Among("ive", -1, 1), + new Among("ici", -1, 1), + new Among("abili", -1, 1), + new Among("ibili", -1, 1), + new Among("iuni", -1, 2), + new Among("atori", -1, 1), + new Among("osi", -1, 1), + new Among("ati", -1, 1), + new Among("itati", 30, 1), + new Among("iti", -1, 1), + new Among("anti", -1, 1), + new Among("isti", -1, 3), + new Among("uti", -1, 1), + new Among("i\u015Fti", -1, 3), + new Among("ivi", -1, 1), + new Among("it\u0103i", -1, 1), + new Among("o\u015Fi", -1, 1), + new Among("it\u0103\u0163i", -1, 1), + new Among("abil", -1, 1), + new Among("ibil", -1, 1), + new Among("ism", -1, 3), + new 
Among("ator", -1, 1), + new Among("os", -1, 1), + new Among("at", -1, 1), + new Among("it", -1, 1), + new Among("ant", -1, 1), + new Among("ist", -1, 3), + new Among("ut", -1, 1), + new Among("iv", -1, 1), + new Among("ic\u0103", -1, 1), + new Among("abil\u0103", -1, 1), + new Among("ibil\u0103", -1, 1), + new Among("oas\u0103", -1, 1), + new Among("at\u0103", -1, 1), + new Among("it\u0103", -1, 1), + new Among("ant\u0103", -1, 1), + new Among("ist\u0103", -1, 3), + new Among("ut\u0103", -1, 1), + new Among("iv\u0103", -1, 1) }; + + private static final Among a_4[] = { + new Among("ea", -1, 1), + new Among("ia", -1, 1), + new Among("esc", -1, 1), + new Among("\u0103sc", -1, 1), + new Among("ind", -1, 1), + new Among("\u00E2nd", -1, 1), + new Among("are", -1, 1), + new Among("ere", -1, 1), + new Among("ire", -1, 1), + new Among("\u00E2re", -1, 1), + new Among("se", -1, 2), + new Among("ase", 10, 1), + new Among("sese", 10, 2), + new Among("ise", 10, 1), + new Among("use", 10, 1), + new Among("\u00E2se", 10, 1), + new Among("e\u015Fte", -1, 1), + new Among("\u0103\u015Fte", -1, 1), + new Among("eze", -1, 1), + new Among("ai", -1, 1), + new Among("eai", 19, 1), + new Among("iai", 19, 1), + new Among("sei", -1, 2), + new Among("e\u015Fti", -1, 1), + new Among("\u0103\u015Fti", -1, 1), + new Among("ui", -1, 1), + new Among("ezi", -1, 1), + new Among("\u00E2i", -1, 1), + new Among("a\u015Fi", -1, 1), + new Among("se\u015Fi", -1, 2), + new Among("ase\u015Fi", 29, 1), + new Among("sese\u015Fi", 29, 2), + new Among("ise\u015Fi", 29, 1), + new Among("use\u015Fi", 29, 1), + new Among("\u00E2se\u015Fi", 29, 1), + new Among("i\u015Fi", -1, 1), + new Among("u\u015Fi", -1, 1), + new Among("\u00E2\u015Fi", -1, 1), + new Among("a\u0163i", -1, 2), + new Among("ea\u0163i", 38, 1), + new Among("ia\u0163i", 38, 1), + new Among("e\u0163i", -1, 2), + new Among("i\u0163i", -1, 2), + new Among("\u00E2\u0163i", -1, 2), + new Among("ar\u0103\u0163i", -1, 1), + new Among("ser\u0103\u0163i", 
-1, 2), + new Among("aser\u0103\u0163i", 45, 1), + new Among("seser\u0103\u0163i", 45, 2), + new Among("iser\u0103\u0163i", 45, 1), + new Among("user\u0103\u0163i", 45, 1), + new Among("\u00E2ser\u0103\u0163i", 45, 1), + new Among("ir\u0103\u0163i", -1, 1), + new Among("ur\u0103\u0163i", -1, 1), + new Among("\u00E2r\u0103\u0163i", -1, 1), + new Among("am", -1, 1), + new Among("eam", 54, 1), + new Among("iam", 54, 1), + new Among("em", -1, 2), + new Among("asem", 57, 1), + new Among("sesem", 57, 2), + new Among("isem", 57, 1), + new Among("usem", 57, 1), + new Among("\u00E2sem", 57, 1), + new Among("im", -1, 2), + new Among("\u00E2m", -1, 2), + new Among("\u0103m", -1, 2), + new Among("ar\u0103m", 65, 1), + new Among("ser\u0103m", 65, 2), + new Among("aser\u0103m", 67, 1), + new Among("seser\u0103m", 67, 2), + new Among("iser\u0103m", 67, 1), + new Among("user\u0103m", 67, 1), + new Among("\u00E2ser\u0103m", 67, 1), + new Among("ir\u0103m", 65, 1), + new Among("ur\u0103m", 65, 1), + new Among("\u00E2r\u0103m", 65, 1), + new Among("au", -1, 1), + new Among("eau", 76, 1), + new Among("iau", 76, 1), + new Among("indu", -1, 1), + new Among("\u00E2ndu", -1, 1), + new Among("ez", -1, 1), + new Among("easc\u0103", -1, 1), + new Among("ar\u0103", -1, 1), + new Among("ser\u0103", -1, 2), + new Among("aser\u0103", 84, 1), + new Among("seser\u0103", 84, 2), + new Among("iser\u0103", 84, 1), + new Among("user\u0103", 84, 1), + new Among("\u00E2ser\u0103", 84, 1), + new Among("ir\u0103", -1, 1), + new Among("ur\u0103", -1, 1), + new Among("\u00E2r\u0103", -1, 1), + new Among("eaz\u0103", -1, 1) }; + + private static final Among a_5[] = { + new Among("a", -1, 1), + new Among("e", -1, 1), + new Among("ie", 1, 1), + new Among("i", -1, 1), + new Among("\u0103", -1, 1) }; + + private static final char g_v[] = { 17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 32, 0, 0, 4 }; + + private boolean B_standard_suffix_removed; + private int I_p2; + private int I_p1; + private int I_pV; 
+ + private boolean r_prelude() { + while (true) { + int v_1 = cursor; + lab0: { + golab1: while (true) { + int v_2 = cursor; + lab2: { + if (!(in_grouping(g_v, 97, 259))) { + break lab2; + } + bra = cursor; + lab3: { + int v_3 = cursor; + lab4: { + if (!(eq_s("u"))) { + break lab4; + } + ket = cursor; + if (!(in_grouping(g_v, 97, 259))) { + break lab4; + } + slice_from("U"); + break lab3; + } + cursor = v_3; + if (!(eq_s("i"))) { + break lab2; + } + ket = cursor; + if (!(in_grouping(g_v, 97, 259))) { + break lab2; + } + slice_from("I"); + } + cursor = v_2; + break golab1; + } + cursor = v_2; + if (cursor >= limit) { + break lab0; + } + cursor++; + } + continue; + } + cursor = v_1; + break; + } + return true; + } + + private boolean r_mark_regions() { + I_pV = limit; + I_p1 = limit; + I_p2 = limit; + int v_1 = cursor; + lab0: { + lab1: { + int v_2 = cursor; + lab2: { + if (!(in_grouping(g_v, 97, 259))) { + break lab2; + } + lab3: { + int v_3 = cursor; + lab4: { + if (!(out_grouping(g_v, 97, 259))) { + break lab4; + } + golab5: while (true) { + lab6: { + if (!(in_grouping(g_v, 97, 259))) { + break lab6; + } + break golab5; + } + if (cursor >= limit) { + break lab4; + } + cursor++; + } + break lab3; + } + cursor = v_3; + if (!(in_grouping(g_v, 97, 259))) { + break lab2; + } + golab7: while (true) { + lab8: { + if (!(out_grouping(g_v, 97, 259))) { + break lab8; + } + break golab7; + } + if (cursor >= limit) { + break lab2; + } + cursor++; + } + } + break lab1; + } + cursor = v_2; + if (!(out_grouping(g_v, 97, 259))) { + break lab0; + } + lab9: { + int v_6 = cursor; + lab10: { + if (!(out_grouping(g_v, 97, 259))) { + break lab10; + } + golab11: while (true) { + lab12: { + if (!(in_grouping(g_v, 97, 259))) { + break lab12; + } + break golab11; + } + if (cursor >= limit) { + break lab10; + } + cursor++; + } + break lab9; + } + cursor = v_6; + if (!(in_grouping(g_v, 97, 259))) { + break lab0; + } + if (cursor >= limit) { + break lab0; + } + cursor++; + } + } + I_pV = 
cursor; + } + cursor = v_1; + int v_8 = cursor; + lab13: { + golab14: while (true) { + lab15: { + if (!(in_grouping(g_v, 97, 259))) { + break lab15; + } + break golab14; + } + if (cursor >= limit) { + break lab13; + } + cursor++; + } + golab16: while (true) { + lab17: { + if (!(out_grouping(g_v, 97, 259))) { + break lab17; + } + break golab16; + } + if (cursor >= limit) { + break lab13; + } + cursor++; + } + I_p1 = cursor; + golab18: while (true) { + lab19: { + if (!(in_grouping(g_v, 97, 259))) { + break lab19; + } + break golab18; + } + if (cursor >= limit) { + break lab13; + } + cursor++; + } + golab20: while (true) { + lab21: { + if (!(out_grouping(g_v, 97, 259))) { + break lab21; + } + break golab20; + } + if (cursor >= limit) { + break lab13; + } + cursor++; + } + I_p2 = cursor; + } + cursor = v_8; + return true; + } + + private boolean r_postlude() { + int among_var; + while (true) { + int v_1 = cursor; + lab0: { + bra = cursor; + among_var = find_among(a_0); + if (among_var == 0) { + break lab0; + } + ket = cursor; + switch (among_var) { + case 1: + slice_from("i"); + break; + case 2: + slice_from("u"); + break; + case 3: + if (cursor >= limit) { + break lab0; + } + cursor++; + break; + } + continue; + } + cursor = v_1; + break; + } + return true; + } + + private boolean r_RV() { + if (!(I_pV <= cursor)) { + return false; + } + return true; + } + + private boolean r_R1() { + if (!(I_p1 <= cursor)) { + return false; + } + return true; + } + + private boolean r_R2() { + if (!(I_p2 <= cursor)) { + return false; + } + return true; + } + + private boolean r_step_0() { + int among_var; + ket = cursor; + among_var = find_among_b(a_1); + if (among_var == 0) { + return false; + } + bra = cursor; + if (!r_R1()) { + return false; + } + switch (among_var) { + case 1: + slice_del(); + break; + case 2: + slice_from("a"); + break; + case 3: + slice_from("e"); + break; + case 4: + slice_from("i"); + break; + case 5: { + int v_1 = limit - cursor; + lab0: { + if 
(!(eq_s_b("ab"))) { + break lab0; + } + return false; + } + cursor = limit - v_1; + } + slice_from("i"); + break; + case 6: + slice_from("at"); + break; + case 7: + slice_from("a\u0163i"); + break; + } + return true; + } + + private boolean r_combo_suffix() { + int among_var; + int v_1 = limit - cursor; + ket = cursor; + among_var = find_among_b(a_2); + if (among_var == 0) { + return false; + } + bra = cursor; + if (!r_R1()) { + return false; + } + switch (among_var) { + case 1: + slice_from("abil"); + break; + case 2: + slice_from("ibil"); + break; + case 3: + slice_from("iv"); + break; + case 4: + slice_from("ic"); + break; + case 5: + slice_from("at"); + break; + case 6: + slice_from("it"); + break; + } + B_standard_suffix_removed = true; + cursor = limit - v_1; + return true; + } + + private boolean r_standard_suffix() { + int among_var; + B_standard_suffix_removed = false; + while (true) { + int v_1 = limit - cursor; + lab0: { + if (!r_combo_suffix()) { + break lab0; + } + continue; + } + cursor = limit - v_1; + break; + } + ket = cursor; + among_var = find_among_b(a_3); + if (among_var == 0) { + return false; + } + bra = cursor; + if (!r_R2()) { + return false; + } + switch (among_var) { + case 1: + slice_del(); + break; + case 2: + if (!(eq_s_b("\u0163"))) { + return false; + } + bra = cursor; + slice_from("t"); + break; + case 3: + slice_from("ist"); + break; + } + B_standard_suffix_removed = true; + return true; + } + + private boolean r_verb_suffix() { + int among_var; + if (cursor < I_pV) { + return false; + } + int v_2 = limit_backward; + limit_backward = I_pV; + ket = cursor; + among_var = find_among_b(a_4); + if (among_var == 0) { + limit_backward = v_2; + return false; + } + bra = cursor; + switch (among_var) { + case 1: + lab0: { + int v_3 = limit - cursor; + lab1: { + if (!(out_grouping_b(g_v, 97, 259))) { + break lab1; + } + break lab0; + } + cursor = limit - v_3; + if (!(eq_s_b("u"))) { + limit_backward = v_2; + return false; + } + } + 
slice_del(); + break; + case 2: + slice_del(); + break; + } + limit_backward = v_2; + return true; + } + + private boolean r_vowel_suffix() { + ket = cursor; + if (find_among_b(a_5) == 0) { + return false; + } + bra = cursor; + if (!r_RV()) { + return false; + } + slice_del(); + return true; + } + + @Override + public boolean stem() { + int v_1 = cursor; + r_prelude(); + cursor = v_1; + r_mark_regions(); + limit_backward = cursor; + cursor = limit; + int v_3 = limit - cursor; + r_step_0(); + cursor = limit - v_3; + int v_4 = limit - cursor; + r_standard_suffix(); + cursor = limit - v_4; + int v_5 = limit - cursor; + lab0: { + lab1: { + int v_6 = limit - cursor; + lab2: { + if (!(B_standard_suffix_removed)) { + break lab2; + } + break lab1; + } + cursor = limit - v_6; + if (!r_verb_suffix()) { + break lab0; + } + } + } + cursor = limit - v_5; + int v_7 = limit - cursor; + r_vowel_suffix(); + cursor = limit - v_7; + cursor = limit_backward; + int v_8 = cursor; + r_postlude(); + cursor = v_8; + return true; + } + + @Override + public boolean equals(Object o) { + return o instanceof LegacyRomanianStemmer; + } + + @Override + public int hashCode() { + return LegacyRomanianStemmer.class.getName().hashCode(); + } +} diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/RomanianAnalyzerProvider.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/RomanianAnalyzerProvider.java index cf33a38abd634..6c28df83a6d36 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/RomanianAnalyzerProvider.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/RomanianAnalyzerProvider.java @@ -9,28 +9,60 @@ package org.elasticsearch.analysis.common; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.StopwordAnalyzerBase; +import org.apache.lucene.analysis.TokenStream; +import 
org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; import org.apache.lucene.analysis.ro.RomanianAnalyzer; +import org.apache.lucene.analysis.snowball.SnowballFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider; import org.elasticsearch.index.analysis.Analysis; -public class RomanianAnalyzerProvider extends AbstractIndexAnalyzerProvider { +public class RomanianAnalyzerProvider extends AbstractIndexAnalyzerProvider { - private final RomanianAnalyzer analyzer; + private final StopwordAnalyzerBase analyzer; RomanianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(name, settings); - analyzer = new RomanianAnalyzer( - Analysis.parseStopWords(env, settings, RomanianAnalyzer.getDefaultStopSet()), - Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET) - ); + CharArraySet stopwords = Analysis.parseStopWords(env, settings, RomanianAnalyzer.getDefaultStopSet()); + CharArraySet stemExclusionSet = Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET); + if (indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.UPGRADE_TO_LUCENE_10_0_0)) { + // since Lucene 10, this analyzer uses a modern unicode form and normalizes cedilla forms to forms with commas + analyzer = new RomanianAnalyzer(stopwords, stemExclusionSet); + } else { + // for older index versions we need the old behaviour without normalization + analyzer = new StopwordAnalyzerBase(Analysis.parseStopWords(env, settings, RomanianAnalyzer.getDefaultStopSet())) { + + protected Analyzer.TokenStreamComponents 
createComponents(String fieldName) { + final Tokenizer source = new StandardTokenizer(); + TokenStream result = new LowerCaseFilter(source); + result = new StopFilter(result, stopwords); + if (stemExclusionSet.isEmpty() == false) { + result = new SetKeywordMarkerFilter(result, stemExclusionSet); + } + result = new SnowballFilter(result, new LegacyRomanianStemmer()); + return new TokenStreamComponents(source, result); + } + + protected TokenStream normalize(String fieldName, TokenStream in) { + return new LowerCaseFilter(in); + } + }; + + } } @Override - public RomanianAnalyzer get() { + public StopwordAnalyzerBase get() { return this.analyzer; } } diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/RomanianAnalyzerTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/RomanianAnalyzerTests.java new file mode 100644 index 0000000000000..1af44bc71f35d --- /dev/null +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/RomanianAnalyzerTests.java @@ -0,0 +1,80 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.analysis.common; + +import org.apache.lucene.analysis.Analyzer; +import org.elasticsearch.cluster.metadata.IndexMetadata; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.IndexVersion; +import org.elasticsearch.index.IndexVersions; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.test.ESTokenStreamTestCase; +import org.elasticsearch.test.IndexSettingsModule; +import org.elasticsearch.test.index.IndexVersionUtils; + +import java.io.IOException; + +import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo; + +/** + * Verifies the behavior of Romanian analyzer. + */ +public class RomanianAnalyzerTests extends ESTokenStreamTestCase { + + public void testRomanianAnalyzerPostLucene10() throws IOException { + IndexVersion postLucene10Version = IndexVersionUtils.randomVersionBetween( + random(), + IndexVersions.UPGRADE_TO_LUCENE_10_0_0, + IndexVersion.current() + ); + Settings settings = ESTestCase.indexSettings(1, 1) + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put(IndexMetadata.SETTING_VERSION_CREATED, postLucene10Version) + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + Environment environment = new Environment(settings, null); + + RomanianAnalyzerProvider romanianAnalyzerProvider = new RomanianAnalyzerProvider( + idxSettings, + environment, + "my-analyzer", + Settings.EMPTY + ); + Analyzer analyzer = romanianAnalyzerProvider.get(); + assertAnalyzesTo(analyzer, "absenţa", new String[] { "absenț" }); + assertAnalyzesTo(analyzer, "cunoştinţă", new String[] { "cunoștinț" }); + } + + public void testRomanianAnalyzerPreLucene10() throws IOException { + IndexVersion preLucene10Version = IndexVersionUtils.randomVersionBetween( + random(), + IndexVersionUtils.getFirstVersion(), + 
IndexVersionUtils.getPreviousVersion(IndexVersions.UPGRADE_TO_LUCENE_10_0_0) + ); + Settings settings = ESTestCase.indexSettings(1, 1) + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put(IndexMetadata.SETTING_VERSION_CREATED, preLucene10Version) + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + Environment environment = new Environment(settings, null); + + RomanianAnalyzerProvider romanianAnalyzerProvider = new RomanianAnalyzerProvider( + idxSettings, + environment, + "my-analyzer", + Settings.EMPTY + ); + Analyzer analyzer = romanianAnalyzerProvider.get(); + assertAnalyzesTo(analyzer, "absenţa", new String[] { "absenţ" }); + assertAnalyzesTo(analyzer, "cunoştinţă", new String[] { "cunoştinţ" }); + } +} diff --git a/qa/full-cluster-restart/src/javaRestTest/java/org/elasticsearch/upgrades/FullClusterRestartIT.java b/qa/full-cluster-restart/src/javaRestTest/java/org/elasticsearch/upgrades/FullClusterRestartIT.java index ee18f8fc2ec4b..5bbf6d536f0f8 100644 --- a/qa/full-cluster-restart/src/javaRestTest/java/org/elasticsearch/upgrades/FullClusterRestartIT.java +++ b/qa/full-cluster-restart/src/javaRestTest/java/org/elasticsearch/upgrades/FullClusterRestartIT.java @@ -33,6 +33,7 @@ import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.mapper.DateFieldMapper; import org.elasticsearch.rest.action.admin.indices.RestPutIndexTemplateAction; +import org.elasticsearch.search.SearchFeatures; import org.elasticsearch.test.NotEqualMessageBuilder; import org.elasticsearch.test.XContentTestUtils; import org.elasticsearch.test.cluster.ElasticsearchCluster; @@ -1726,6 +1727,111 @@ public void testSystemIndexMetadataIsUpgraded() throws Exception { } } + /** + * This test ensures that search results on old indices using the "romanian" analyzer don't change + * after we introduce Lucene 10 + */ + public void testRomanianAnalyzerBWC() throws Exception { + var 
originalClusterLegacyRomanianAnalyzer = oldClusterHasFeature(SearchFeatures.LUCENE_10_0_0_UPGRADE) == false; + assumeTrue("Don't run this test if both versions already support stemming", originalClusterLegacyRomanianAnalyzer); + final String indexName = "test_romanian_stemmer"; + Settings idxSettings = indexSettings(1, 1).build(); + String cedillaForm = "absenţa"; + String commaForm = "absența"; + + String mapping = """ + { + "properties": { + "textfield" : { + "type": "text", + "analyzer": "romanian" + } + } + } + """; + + // query that uses the cedilla form of "t" + String query = """ + { + "query": { + "match": { + "textfield": "absenţa" + } + } + } + """; + + if (isRunningAgainstOldCluster()) { + createIndex(client(), indexName, idxSettings, mapping); + ensureGreen(indexName); + + assertOK( + client().performRequest( + newXContentRequest( + HttpMethod.POST, + "/" + indexName + "/" + "_doc/1", + (builder, params) -> builder.field("textfield", cedillaForm) + ) + ) + ); + assertOK( + client().performRequest( + newXContentRequest( + HttpMethod.POST, + "/" + indexName + "/" + "_doc/2", + // this doc uses the comma form + (builder, params) -> builder.field("textfield", commaForm) + ) + ) + ); + refresh(indexName); + + assertNumHits(indexName, 2, 1); + + Request searchRequest = new Request("POST", "/" + indexName + "/_search"); + searchRequest.setJsonEntity(query); + assertTotalHits(1, entityAsMap(client().performRequest(searchRequest))); + } else { + // old index should still only return one doc + Request searchRequest = new Request("POST", "/" + indexName + "/_search"); + searchRequest.setJsonEntity(query); + assertTotalHits(1, entityAsMap(client().performRequest(searchRequest))); + + String newIndexName = indexName + "_new"; + createIndex(client(), newIndexName, idxSettings, mapping); + ensureGreen(newIndexName); + + assertOK( + client().performRequest( + newXContentRequest( + HttpMethod.POST, + "/" + newIndexName + "/" + "_doc/1", + (builder, params) -> 
builder.field("textfield", cedillaForm) + ) + ) + ); + assertOK( + client().performRequest( + newXContentRequest( + HttpMethod.POST, + "/" + newIndexName + "/" + "_doc/2", + (builder, params) -> builder.field("textfield", commaForm) + ) + ) + ); + refresh(newIndexName); + + searchRequest = new Request("POST", "/" + newIndexName + "/_search"); + searchRequest.setJsonEntity(query); + assertTotalHits(2, entityAsMap(client().performRequest(searchRequest))); + + // searching both indices (old and new analysis version) we should get 1 hit from the old and 2 from the new index + searchRequest = new Request("POST", "/" + indexName + "," + newIndexName + "/_search"); + searchRequest.setJsonEntity(query); + assertTotalHits(3, entityAsMap(client().performRequest(searchRequest))); + } + } + /** * This test ensures that soft deletes are enabled a when upgrading a pre-8 cluster to 8.0+ */