# unitex_tagger.py
import os
import re
from collections import defaultdict

import yaml
from unidecode import unidecode
from unitex.config import UnitexConfig
from unitex.processor import UnitexProcessor
from unitex.resources import load_persistent_alphabet


class TextExtractor(UnitexProcessor):
    """
    The TextExtractor class applies Unitex preprocessing and grammars
    to a text, and gives access to the matches and to the sentences
    that contain them.

    This class inherits from the UnitexProcessor class
    (https://forge.uclouvain.be/watrin/python-unitex). It uses the
    parent import, configuration and processing methods, and adds new
    ones that extract the full sentences where a match occurs.
    """
    def __init__(self, config_file):
        """
        Args:
            config_file (str): A .yaml file containing the parameters
                used to process the text.
        """
# Imports and applies configuration file.
self.config = self._configure(config_file)
        super().__init__(self.config)  # Initialises the parent processor.
# Imports alphabets.
alph = self.config['resources']['alphabet']
self.alphabet_sorted = self.config['resources']['alphabet-sorted']
self.alphabet = load_persistent_alphabet(alph)
# Path to the .snt file.
self.text_path = None
self.dir = None
        # Content of the .snt file.
self.processed_text = None
# Initial text file content.
self.text = None
# Path to the index file generated by locate.
self.index = None
# All processed sentences.
self.all_sentences = None
        # Table (initialised here, not used in this module).
        self.table = None

    def _configure(self, config_file):
        """
        Loads the configuration file into a UnitexConfig object.

        Args:
            config_file (str): A .yaml file containing the parameters
                used to process the text.

        Returns:
            A UnitexConfig object.
        """
        with open(config_file, "r") as f:
            options = yaml.load(f, yaml.Loader)
        config = UnitexConfig()
        config.load(options)
        return config
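    # A minimal sketch of the YAML layout this module relies on,
    # inferred from the keys accessed in __init__ and tag_text (the
    # real python-unitex configuration has more sections; the paths
    # below are hypothetical):
    #
    #   resources:
    #     alphabet: "resources/Alphabet.txt"
    #     alphabet-sorted: "resources/Alphabet_sort.txt"
    #   tools:
    #     concord:
    #       output: null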
    def _get_path(self):
        """
        Derives the working directory ("<name>_snt", next to the text
        file) from the current text path.
        """
        directory, filename = os.path.split(self.text_path)
        name, _extension = os.path.splitext(filename)
        self.dir = os.path.join(directory, "%s_snt" % name)
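    # Example (hypothetical path): with self.text_path set to
    # "corpus/report.snt", self.dir becomes "corpus/report_snt",
    # the directory where Unitex writes its intermediate files.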
    def import_text(self, text):
        """
        Imports the content of a text file.

        Opens a text file in the object and applies all the Unitex
        processing steps, i.e. segmentation into sentences,
        normalisation of the text, tokenisation and application of
        the dictionaries.

        Args:
            text (str): Path to the text file to process.
        """
        # mode flags:
        #   's': segment (apply Sentence.fst2) - not used, done by hand.
        #   'r': replace (apply Replace.fst2)
        #   't': tokenize
        #   'l': lexicalize (apply dictionaries)
        self.open(text, mode="srtl", tagged=False)
        # Extracts the processed text: the parent class stores the
        # path to the generated .snt file in a private attribute.
        processed = self._UnitexProcessor__snt
        self.text_path = processed
        self._get_path()
with open(processed, 'r') as fp:
self.processed_text = fp.read()
        # Splits the text into sentences on the Unitex {S} delimiter.
sentences = self.processed_text.split('{S}')
self.all_sentences = [elt for elt in sentences if elt != ""]
# Extracts initial text.
with open(text, 'r') as fq:
self.text = fq.read()
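    # Minimal usage sketch (hypothetical paths):
    #   extractor = TextExtractor("config/unitex-example.yaml")
    #   extractor.import_text("corpus/report.txt")
    #   print(len(extractor.all_sentences))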
class TextTagger(TextExtractor):
    """
    Tags a text with a Unitex grammar and extracts the tagged terms
    from the resulting HTML.
    """

    def __init__(self, config_file):
        super().__init__(config_file)
        self.regexps = None
        self.tagged_sentences = None
        self.tagged_path = None
        self.extracted = None
        self.initial = None
        self.matched = []
        self.terms = None
        self.terms_dict = None
    def _clean_images(self):
        """
        Strips the <span> elements that tagging may have inserted
        inside <img> blocks, keeping their content, so that the image
        markup stays valid.
        """
        img_clean = re.compile(r'(<img(.|\s)*?)(?=<(img|p|/div))')
        spans_remove = r'<span(.*?)>(.*?)</span>'
        without_token = self.text.replace('{S}', '')
        images = img_clean.findall(without_token)
        for image in images:
            tmp_img = image[0]
            # Replaces each span by its bare content within the block.
            new_image = re.sub(spans_remove, r'\2', image[0])
            without_token = without_token.replace(tmp_img, new_image)
        return without_token
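    # Example of the clean-up (hypothetical snippet):
    #   before: <img src="p1.png"><span class="x">alt</span><p>
    #   after:  <img src="p1.png">alt<p>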
    def _markup_pages_indices(self, html_string):
        """
        Numbers the page <div> elements: each '<div id="page0' tag is
        rewritten, in order, as '<div id="page-0', '<div id="page-1',
        and so on.
        """
        pages_tag = re.compile('((<div id="page)0)')
        current_page = 0
        for tag in pages_tag.findall(html_string):
            incremented_tag = f'{tag[1]}-{current_page}'
            current_page += 1
            # Replaces only the first remaining occurrence each time.
            html_string = pages_tag.sub(incremented_tag, html_string, 1)
        return html_string
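    # Example (hypothetical input): two page divs sharing the raw id,
    #   <div id="page0">...<div id="page0">...
    # become
    #   <div id="page-0">...<div id="page-1">...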
    def tag_text(self, grammar):
        """
        Tags the text using the given grammar.

        Args:
            grammar (str): Path to the compiled grammar (.fst2) used
                to tag the text.
        """
        self.tagged_path = os.path.join(self.dir, "tagged.txt")
        self.config["tools"]["concord"]["output"] = self.tagged_path
        self.tag(grammar, self.tagged_path)
        self.import_text(self.tagged_path)
        cleaned_images = self._clean_images()
        marked_pages = self._markup_pages_indices(cleaned_images)
        marked_pages = self.extract_tags(marked_pages)
        # Unescapes the Unitex-escaped punctuation and transliterates
        # to ASCII before writing the tagged HTML.
        output = unidecode(
            marked_pages.replace(r'\.', '.').replace(r'\,', ','))
        with open('tagged_pdf.html', 'w') as fp:
            fp.write(output)
        return output
    def extract_tags(self, html_string):
        """
        Extracts the tagged terms and numbers their occurrences.

        Terms are tagged with HTML span elements of the form
        <span class="CLASS highlight">TERM</span>. Each such span gets
        a unique id ("CLASS-N"), and the matches are stored in
        self.matched as:
            {class: {term (lowercased): [occurrence indices]}}
        Returns the HTML string with the ids added.
        """
        highlight_re = re.compile(
            r'(<span class="(.*?) highlight")(>(.*?)</span>)')
        # Unescapes the Unitex-escaped punctuation before matching.
        html_string = html_string.replace(r'\.', '.').replace(r'\,', ',')
        matches = highlight_re.findall(html_string)
        counter = {elt[1]: 0 for elt in matches}
        idx_dict = {elt[1]: {} for elt in matches}

        def mark_matches(match):
            # Adds a quoted, per-class numbered id to the span.
            html = (f'{match.group(1)} id="{match.group(2)}'
                    f'-{counter[match.group(2)]}"{match.group(3)}')
            term = match.group(4).lower()
            # Records the occurrence index of this term.
            if term in idx_dict[match.group(2)]:
                idx_dict[match.group(2)][term].append(
                    counter[match.group(2)])
            else:
                idx_dict[match.group(2)][term] = [
                    counter[match.group(2)]]
            counter[match.group(2)] += 1
            return html

        html_string = highlight_re.sub(mark_matches, html_string)
        self.matched = idx_dict
        return html_string
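    # Example of the markup step (hypothetical snippet):
    #   before: <span class="disease highlight">Asthma</span>
    #   after:  <span class="disease highlight" id="disease-0">Asthma</span>
    # with self.matched == {'disease': {'asthma': [0]}}.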
    def _get_terms(self):
        """
        Extracts the tagged terms with their occurrence indices,
        grouped by class.

        Format:
            {class: [
                {term: str, indices: [idx_1, idx_2, ..., idx_n]},
            ]
            }
        """
        final_format = defaultdict(list)
        # self.matched maps each class to {term: [occurrence indices]}
        # (see extract_tags).
        for category, terms in self.matched.items():
            for term, indices in terms.items():
                # Deduplicates and sorts the occurrence indices.
                final_format[category].append({
                    'term': term.strip(),
                    'indices': sorted(set(indices)),
                })
        self.terms = final_format
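    # Example of self.terms after a call (hypothetical values):
    #   {'disease': [{'term': 'asthma', 'indices': [0, 3]}]}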
if __name__ == "__main__":
    my_text = "full_pdf.html"
    my_grammar = "config/graphs/highlight.fst2"
    my_config = "config/unitex-example.yaml"
    tagger = TextTagger(my_config)
    tagger.import_text(my_text)
    tagger.tag_text(my_grammar)
    # Builds and prints the extracted terms grouped by class.
    tagger._get_terms()
    print(dict(tagger.terms))