Source code for bp_text.text

"""This module implements the text class.

A text is at first a string containing information in a given language.  This
text module uses spaCy to split the text into sentences and tokenize them.
It uses various algorithms to e.g. detect parts of speech or entities which can
later be used for analysis or text-production.

Created: 2025-04-24
Author: Ruben Philipp <me@rubenphilipp.com>

$$ Last modified:  16:46:43 Wed May  7 2025 CEST

"""

from functools import lru_cache
import os
import subprocess
import sys

import spacy

from . import language

# from flair.tokenization import SegtokTokenizer
# from flair.splitter import SegtokSentenceSplitter
# from flair.nn import Classifier

################################################################################
### spacy

#: Maps a language code (as passed to ``Text(lang=...)``) to the name of the
#: spaCy model that is loaded for it by default.
LANG_SPACY_MODELS = dict(
    en='en_core_web_sm',
    de='de_core_news_sm',
    fr='fr_core_news_sm',
    es='es_core_news_sm',
    it='it_core_news_sm',
    pt='pt_core_news_sm',
    nl='nl_core_news_sm',
    pl='pl_core_news_sm',
    sv='sv_core_news_sm',
)

#: Helper to cache the spacy models

[docs]
@lru_cache(maxsize=10)  # adjust based on number of language models you use
def get_nlp(model_name: str):
    """Load and return a spaCy language model, caching the result.

    Results are memoized in an LRU cache holding up to 10 entries, so asking
    for the same model again does not load it a second time.

    If ``spacy.load`` raises ``OSError`` (treated here as "model not
    installed"), a download is attempted with ``-m spacy download`` and the
    load is retried once.  Any error raised by that second load propagates to
    the caller.

    :param model_name: The name of the spacy model (e.g. "en_core_web_sm").
    :type model_name: string
    :return: The loaded spaCy model, or ``False`` when ``model_name`` is
        empty or ``None``.
    """
    # Guard clause: no model name means there is nothing to load.
    if not model_name:
        return False

    try:
        return spacy.load(model_name)
    except OSError:
        print(f"Model '{model_name}' not found. "
              + "Attempting to download...")
        # Run the download with the interpreter that is executing this code,
        # so the model is installed into the same environment we load from.
        # A bare "python" may be absent from PATH or point somewhere else.
        interpreter = sys.executable or "python"
        subprocess.run([interpreter, "-m", "spacy", "download",
                        model_name])
        return spacy.load(model_name)



################################################################################


[docs]
class Text:
    """This is a class implementation of a Text object.  A text holds a natural
    language text as a string and additionally contains segmented and analysed
    data derived from the text.  The text is tokenized and analysed (e.g. for
    parts of speech or entities, depending on the loaded model) using spaCy.
    The model is selected from ``LANG_SPACY_MODELS`` by the text's language.

    Note: The `doc` contains the actual segmented text.

    :param text: The text to be used as a basis for the analysis.
    :type text: string
    :param lang: The primary language of the text as a ISO 639-1 code.
       Default = "en"
    :type lang: string

    """

    def __init__(self,
                 text = "",
                 lang = "en"):
        """Store the text and language, then run the initial analysis."""
        self._text = text
        self._lang = lang
        # Stays None until update() succeeds.
        self._doc = None

        self.update()

    def __call__(self):
        """Calling the object returns the raw text instead of the annotated
        `doc`.
        """
        return self.text

    @property
    def text(self):
        """Getter/setter for text (string).

        Changing the text also causes re-generation of the sentence analyses.
        Assigning a value that is not a string prints an error and leaves
        both the text and the analysis untouched.
        """
        return self._text

    @text.setter
    def text(self, val):
        if not isinstance(val, str):
            print("Error: value for text is not a String.")
            # Nothing changed, so there is no reason to re-run the analysis.
            return

        self._text = val
        self.update()

    @property
    def doc(self):
        """Getter for the doc (i.e. the tokenized and analysed elements
        of the text). Read-only.
        """
        return self._doc

    def update(self):
        """Update the instance.

        This method also performs the text segmentation and analysis: it
        looks up the spaCy model for the instance's language, runs it on the
        current text and stores the result in `doc`.

        :return: ``True`` on success.  ``False`` if the text is not a string
            or no model is configured for the language; in both cases an
            error is printed and `doc` is left unchanged.
        :rtype: bool
        """
        # sanity checks
        if not isinstance(self._text, str):
            print("Error: Text.text is not a string.")
            return False

        ## load proper model
        # .get() yields None for an unknown language, which get_nlp() turns
        # into False.
        model_name = LANG_SPACY_MODELS.get(self._lang)

        ## perform analysis
        nlp = get_nlp(model_name)
        # Identity check against the sentinel instead of an equality test on
        # an arbitrary model object.
        if nlp is False:
            print("Text.update(): ERROR. No spaCy model for language "
                  + f"'{self._lang}' in LANG_SPACY_MODELS. ")
            return False

        self._doc = nlp(self._text)

        return True




################################################################################
### EOF text.py