Skip to content

Commit

Permalink
Add bwc layer for german and german2 stemmer
Browse files Browse the repository at this point in the history
  • Loading branch information
cbuescher committed Sep 26, 2024
1 parent e16bd8e commit d911d15
Show file tree
Hide file tree
Showing 39 changed files with 587 additions and 14 deletions.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,13 @@
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.sv.SwedishLightStemFilter;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.logging.DeprecationCategory;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.tartarus.snowball.ext.ArmenianStemmer;
import org.tartarus.snowball.ext.BasqueStemmer;
Expand Down Expand Up @@ -82,10 +86,14 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {

private static final TokenStream EMPTY_TOKEN_STREAM = new EmptyTokenStream();

private String language;
private final String language;
private final IndexVersion version;

private static final DeprecationLogger DEPRECATION_LOGGER = DeprecationLogger.getLogger(StemmerTokenFilterFactory.class);

StemmerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
super(name, settings);
this.version = indexSettings.getIndexVersionCreated();
this.language = Strings.capitalize(settings.get("language", settings.get("name", "porter")));
// check that we have a valid language by trying to create a TokenStream
create(EMPTY_TOKEN_STREAM).close();
Expand Down Expand Up @@ -165,9 +173,19 @@ public TokenStream create(TokenStream tokenStream) {

// German stemmers
} else if ("german".equalsIgnoreCase(language)) {
return new SnowballFilter(tokenStream, new GermanStemmer());
if (this.version.onOrAfter(IndexVersions.UPGRADE_TO_LUCENE_10_0_0)) {
return new SnowballFilter(tokenStream, new GermanStemmer());
} else {
// use pre-L10 GermanStemmer that doesn't normalize umlauts etc...
return new SnowballFilter(tokenStream, new LegacyGermanStemmer());
}
} else if ("german2".equalsIgnoreCase(language)) {
// TODO Lucene 10 upgrade: how about bw comp for users relying on german2 stemmer that is now folded into german stemmer?
DEPRECATION_LOGGER.warn(
DeprecationCategory.ANALYSIS,
"german2_stemmer_deprecation",
"The 'german2' stemmer has been deprecated and folged into the 'german' Stemmer. "
+ "Replace all usages of 'german2' with 'german'."
);
return new SnowballFilter(tokenStream, new GermanStemmer());
} else if ("light_german".equalsIgnoreCase(language) || "lightGerman".equalsIgnoreCase(language)) {
return new GermanLightStemFilter(tokenStream);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
public void testDefault() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

/**
* Base class to test {@link WordDelimiterTokenFilterFactory} and
* {@link WordDelimiterGraphTokenFilterFactory}.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class CJKFilterFactoryTests extends ESTokenStreamTestCase {
private static final String RESOURCE = "/org/elasticsearch/analysis/common/cjk_analysis.json";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
import java.io.StringReader;
import java.util.Arrays;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class CharGroupTokenizerFactoryTests extends ESTokenStreamTestCase {

public void testParseTokenChars() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
import java.nio.file.Files;
import java.nio.file.Path;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class CommonGramsTokenFilterFactoryTests extends ESTokenStreamTestCase {
public void testDefault() throws IOException {
Settings settings = Settings.builder()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import java.io.StringReader;
import java.text.ParseException;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.hamcrest.Matchers.containsString;

public class ESSolrSynonymParserTests extends ESTokenStreamTestCase {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import java.io.StringReader;
import java.text.ParseException;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.hamcrest.Matchers.containsString;

public class ESWordnetSynonymParserTests extends ESTokenStreamTestCase {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class EdgeNGramTokenFilterFactoryTests extends ESTokenStreamTestCase {

public void testDefault() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
import java.io.StringReader;
import java.util.Collections;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class EdgeNGramTokenizerTests extends ESTokenStreamTestCase {

private static IndexAnalyzers buildAnalyzers(IndexVersion version, String tokenizer) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
import org.apache.lucene.analysis.CharArraySet;
import org.elasticsearch.test.ESTokenStreamTestCase;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;

public class FingerprintAnalyzerTests extends ESTokenStreamTestCase {

public void testFingerprint() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@

import java.io.IOException;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class FlattenGraphTokenFilterFactoryTests extends ESTokenStreamTestCase {

public void testBasic() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.hamcrest.Matchers.instanceOf;

public class KeepFilterFactoryTests extends ESTokenStreamTestCase {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.hamcrest.Matchers.instanceOf;

public class KeepTypesFilterFactoryTests extends ESTokenStreamTestCase {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

import java.io.IOException;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
import static org.hamcrest.Matchers.instanceOf;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class LimitTokenCountFilterFactoryTests extends ESTokenStreamTestCase {
public void testDefault() throws IOException {
Settings settings = Settings.builder()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertStreamHasNumberOfTokens;

public class MinHashFilterFactoryTests extends ESTokenStreamTestCase {
public void testDefault() throws IOException {
int default_hash_count = 1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
import java.io.IOException;
import java.util.Collections;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;

public class MultiplexerTokenFilterTests extends ESTokenStreamTestCase {

public void testMultiplexingFilter() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class NGramTokenFilterFactoryTests extends ESTokenStreamTestCase {

public void testDefault() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
import java.io.StringReader;
import java.util.Arrays;

import static com.carrotsearch.randomizedtesting.RandomizedTest.scaledRandomIntBetween;
import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.hamcrest.Matchers.instanceOf;

public class NGramTokenizerFactoryTests extends ESTokenStreamTestCase {
Expand Down Expand Up @@ -183,6 +183,9 @@ public void testBackwardsCompatibilityEdgeNgramTokenFilter() throws Exception {
assertThat(edgeNGramTokenFilter, instanceOf(EdgeNGramTokenFilter.class));
}
}
assertWarnings(
"The [side] parameter is deprecated and will be removed. Use a [reverse] before and after the [edge_ngram] instead."
);
}

/*`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import java.io.IOException;
import java.io.StringReader;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class PathHierarchyTokenizerFactoryTests extends ESTokenStreamTestCase {

public void testDefaults() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
import java.util.Arrays;
import java.util.regex.Pattern;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.checkRandomData;

/**
* Verifies the behavior of PatternAnalyzer.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import org.elasticsearch.test.ESTokenStreamTestCase;
import org.elasticsearch.test.IndexSettingsModule;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.elasticsearch.test.ESTestCase.createTestAnalysis;
import static org.hamcrest.Matchers.containsString;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

import java.io.IOException;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class PatternReplaceTokenFilterTests extends ESTokenStreamTestCase {

public void testNormalizer() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import java.io.IOException;
import java.util.Collections;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import java.io.IOException;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.hamcrest.Matchers.instanceOf;

public class RemoveDuplicatesFilterFactoryTests extends ESTokenStreamTestCase {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

import java.util.Collections;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.elasticsearch.test.ESTokenStreamTestCase;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;

public class SnowballAnalyzerTests extends ESTokenStreamTestCase {

public void testEnglish() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
import java.util.List;
import java.util.Locale;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class StemmerOverrideTokenFilterFactoryTests extends ESTokenStreamTestCase {
@Rule
public ExpectedException expectedException = ExpectedException.none();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
Expand All @@ -28,7 +29,7 @@
import java.io.IOException;
import java.io.StringReader;

import static com.carrotsearch.randomizedtesting.RandomizedTest.scaledRandomIntBetween;
import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_VERSION_CREATED;
import static org.hamcrest.Matchers.instanceOf;

Expand Down Expand Up @@ -106,15 +107,38 @@ public void testMultipleLanguagesThrowsException() throws IOException {
}

public void testGermanVsGerman2Stemmer() throws IOException {
Analyzer analyzer = createGermanStemmer("german");
assertAnalyzesTo(analyzer, "Buecher Bücher", new String[] { "Buech", "Buch" });

analyzer = createGermanStemmer("german2");
assertAnalyzesTo(analyzer, "Buecher Bücher", new String[] { "Buch", "Buch"});
{
IndexVersion v = IndexVersionUtils.randomVersionBetween(
random(),
IndexVersionUtils.getFirstVersion(),
IndexVersionUtils.getPreviousVersion(IndexVersions.UPGRADE_TO_LUCENE_10_0_0)
);
Analyzer analyzer = createGermanStemmer("german", v);
assertAnalyzesTo(analyzer, "Buecher Bücher", new String[] { "Buech", "Buch" });

analyzer = createGermanStemmer("german2", v);
assertAnalyzesTo(analyzer, "Buecher Bücher", new String[] { "Buch", "Buch" });
}
{
IndexVersion v = IndexVersionUtils.randomVersionBetween(
random(),
IndexVersions.UPGRADE_TO_LUCENE_10_0_0,
IndexVersion.current()
);
Analyzer analyzer = createGermanStemmer("german", v);
assertAnalyzesTo(analyzer, "Buecher Bücher", new String[] { "Buch", "Buch" });

analyzer = createGermanStemmer("german2", v);
assertAnalyzesTo(analyzer, "Buecher Bücher", new String[] { "Buch", "Buch" });
assertWarnings(
"The 'german2' stemmer has been deprecated and folged into the 'german' Stemmer. "
+ "Replace all usages of 'german2' with 'german'."
);
}
}

public Analyzer createGermanStemmer(String variant) throws IOException {
IndexVersion v = IndexVersionUtils.randomVersion(random());
public Analyzer createGermanStemmer(String variant, IndexVersion v) throws IOException {

Settings settings = Settings.builder()
.put("index.analysis.filter.my_german.type", "stemmer")
.put("index.analysis.filter.my_german.language", variant)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import org.elasticsearch.test.ESTokenStreamTestCase;
import org.elasticsearch.test.IndexSettingsModule;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;
import static org.elasticsearch.test.ESTestCase.createTestAnalysis;

public class StopAnalyzerTests extends ESTokenStreamTestCase {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

import java.io.IOException;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class TrimTokenFilterTests extends ESTokenStreamTestCase {

public void testNormalizer() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@
import java.io.StringReader;
import java.util.Collections;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class WordDelimiterGraphTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase {
public WordDelimiterGraphTokenFilterFactoryTests() {
super("word_delimiter_graph");
Expand Down
Loading

0 comments on commit d911d15

Please sign in to comment.