From 0e5785dc559c9603de0d289c6687e32e4c1d843b Mon Sep 17 00:00:00 2001
From: yetra
Date: Wed, 30 Mar 2022 10:35:25 +0200
Subject: [PATCH] Add lemmatization option for normalizing loaded documents

---
 pke/base.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pke/base.py b/pke/base.py
index 14d9df32..f2c6b4ca 100644
--- a/pke/base.py
+++ b/pke/base.py
@@ -58,7 +58,8 @@ def load_document(self, input, language=None, stoplist=None,
             stoplist (list): custom list of stopwords, defaults to
                 pke.lang.stopwords[language].
             normalization (str): word normalization method, defaults to
-                'stemming'. Other possible value is 'none'
+                'stemming'. Other possible values are 'lemmatization'
+                for using lemmas as stems and 'none'
                 for using word surface forms instead of stems/lemmas.
             spacy_model (spacy.lang): preloaded spacy model when input is a
                 string.
@@ -119,6 +120,10 @@ def load_document(self, input, language=None, stoplist=None,
             for i, sentence in enumerate(self.sentences):
                 self.sentences[i].stems = [stemmer.stem(w).lower() for w in sentence.words]
 
+        elif self.normalization == 'lemmatization':
+            for i, sentence in enumerate(self.sentences):
+                self.sentences[i].stems = sentence.meta['lemmas']
+
         else:
             for i, sentence in enumerate(self.sentences):
                 self.sentences[i].stems = [w.lower() for w in sentence.words]