-
Notifications
You must be signed in to change notification settings - Fork 46
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
WIP Prevent wildcard expressions for stopwords in simple expressions
- add stopwords to registry - add stopwords_case_insensitive option - support function for getting the stopwords - cache to optimize the parsing of stopwords.txt This transforms the (term AND term*) expression for stopwords, removing the wildcard expression. Such an expression would never match any documents, because Solr won't remove the wildcard term, while the stopword itself will be missing from the index. This workaround removes the wildcard term with no side effects, as stopwords would be ignored by Solr anyway. Both case-sensitive and case-insensitive stopword processing are supported; which one applies depends on the Solr schema, and the option must be set accordingly.
- Loading branch information
Showing
7 changed files
with
354 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
import re | ||
|
||
from collective.solr.utils import getConfig | ||
|
||
# Captures the leading run of ASCII / Latin-1 letters at the start of a line.
# The group may be empty (the pattern always matches); anything from the first
# non-letter character onward is left out of the capture.
reLine = re.compile(r"^([A-Za-zÀ-ÖØ-öø-ÿ]*)") | ||
|
||
# Module-level cache maintained by getStopWords():
#   raw                  - the stopwords text that was parsed last
#   raw_case_insensitive - the case-insensitivity flag it was parsed with
#   cooked               - the resulting list of stopwords (None until first use)
raw = None | ||
raw_case_insensitive = None | ||
cooked = None | ||
|
||
|
||
def parseStopwords(stopwords, stopwords_case_insensitive):
    """Extract the list of stopwords from the text of a stopwords file.

    Each line contributes the leading run of letters captured by the
    module-level ``reLine`` pattern; lines whose capture is empty are
    skipped.

    :param stopwords: the stopwords text, one candidate word per line.
    :param stopwords_case_insensitive: when true, every word is lower-cased
        before being added to the result.
    :returns: a list of words in file order (duplicates are kept).
    """
    words = []
    for line in stopwords.splitlines():
        # reLine allows an empty capture, so match() never returns None here.
        word = reLine.match(line).group(1)
        if not word:
            continue
        if stopwords_case_insensitive:
            word = word.lower()
        words.append(word)
    return words
|
||
|
||
def getStopWords(config):
    """Return the parsed stopword list for ``config``, using a module cache.

    The parsed list is kept in the module-level ``cooked`` variable and is
    rebuilt only when the stopwords text or the case-insensitivity flag
    differs from what was used for the cached result.

    :param config: configuration object providing the ``stopwords`` and
        ``stopwords_case_insensitive`` attributes; when falsy, the result of
        ``getConfig()`` is used instead.
    :returns: the list of stopwords produced by ``parseStopwords``.
    """
    global raw
    global cooked
    global raw_case_insensitive
    config = config or getConfig()
    # The attribute may exist but be unset (None); treat that as "no stopwords"
    # rather than failing inside parseStopwords.
    stopwords = getattr(config, "stopwords", "") or ""
    stopwords_case_insensitive = getattr(config, "stopwords_case_insensitive", False)
    if (
        cooked is None
        # Compare by value: an equal string held in a different object must
        # not invalidate the cache and trigger a needless re-parse.
        or raw != stopwords
        or raw_case_insensitive != stopwords_case_insensitive
    ):
        raw = stopwords
        raw_case_insensitive = stopwords_case_insensitive
        cooked = parseStopwords(raw, stopwords_case_insensitive)
    return cooked
|
||
|
||
def isStopWord(term, config):
    """Tell whether ``term`` is one of the configured stopwords.

    :param term: the search term to check.
    :param config: configuration object providing ``stopwords`` and
        ``stopwords_case_insensitive``; when falsy, the result of
        ``getConfig()`` is used, matching what ``getStopWords`` does.
    :returns: True if the term (lower-cased when the configuration is
        case-insensitive) is in the stopword list, otherwise False.
    """
    # Resolve the configuration once so the case flag is read from the same
    # object getStopWords() uses; otherwise a falsy ``config`` would yield a
    # lower-cased stopword list but an un-lowered term.
    config = config or getConfig()
    stopwords_case_insensitive = getattr(config, "stopwords_case_insensitive", False)
    candidate = term.lower() if stopwords_case_insensitive else term
    return candidate in getStopWords(config)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.