Source code for bp_text.text

"""This module implements the text class.

A text is at first a string containing information in a given language.  This
text module uses spaCy to split the text into sentences and tokenize them.
It uses various algorithms to e.g. detect parts of speech or entities which can
be later used for analysis or text-production.

Created: 2025-04-24
Author: Ruben Philipp <me@rubenphilipp.com>

$$ Last modified:  16:46:43 Wed May  7 2025 CEST

"""

from functools import lru_cache
import os

import spacy
import subprocess

from . import language

# from flair.tokenization import SegtokTokenizer
# from flair.splitter import SegtokSentenceSplitter
# from flair.nn import Classifier

################################################################################
### spacy

#: Default spaCy model name for each supported language, keyed by the
#: language code that is passed around as ``lang`` (e.g. ``"en"``).
LANG_SPACY_MODELS = {
    "en": "en_core_web_sm",
    "de": "de_core_news_sm",
    "fr": "fr_core_news_sm",
    "es": "es_core_news_sm",
    "it": "it_core_news_sm",
    "pt": "pt_core_news_sm",
    "nl": "nl_core_news_sm",
    "pl": "pl_core_news_sm",
    "sv": "sv_core_news_sm",
}

#: Helper to cache the spacy models
@lru_cache(maxsize=10)  # adjust based on number of language models you use
def get_nlp(model_name: str):
    """Load and return a spaCy language model, caching the result.

    Results are memoised with an LRU cache of size 10, so repeated requests
    for the same model name return the already-loaded object instead of
    loading it again.

    If ``spacy.load`` raises ``OSError`` (the model is not installed), a
    download is attempted with ``<interpreter> -m spacy download <model>``
    and the load is retried once.  The retry is not guarded: whatever the
    second ``spacy.load`` raises propagates to the caller.

    :param model_name: The name of the spacy model (e.g. "en_core_web_sm").
    :type model_name: string
    :returns: The loaded spaCy model, or ``False`` if `model_name` is empty
        or ``None``.
    """
    if not model_name:
        return False

    try:
        return spacy.load(model_name)
    except OSError:
        # Local import: keeps this function self-contained without touching
        # the module's import block.
        import sys

        print(f"Model '{model_name}' not found. " +
              "Attempting to download...")
        # Run the download with the interpreter that is executing this code,
        # so the model is installed into the environment spaCy is loaded
        # from.  A bare "python" may resolve to a different installation
        # (or to nothing at all); it is only kept as a fallback for the case
        # where sys.executable is empty.
        interpreter = sys.executable or "python"
        subprocess.run([interpreter, "-m", "spacy", "download", model_name])
        return spacy.load(model_name)
################################################################################
class Text:
    """This is a class implementation of a Text object.

    A text holds a natural language text as a string and additionally
    contains segmented and analysed data derived from the text.

    The text is processed with a spaCy model.  The model is selected by
    looking up `lang` in ``LANG_SPACY_MODELS`` and loaded through
    ``get_nlp``.  The result of running that model on the text is stored in
    `doc`.

    Note: The `doc` contains the actual segmented text.

    :param text: The text to be used as a basis for the analysis.
    :type text: string
    :param lang: The primary language of the text as a ISO 639-1 code.
        Default = "en"
    :type lang: string
    """

    def __init__(self, text: str = "", lang: str = "en"):
        """Store the text and language, then run the analysis once."""
        self._text = text
        self._lang = lang
        self._doc = None
        self.update()

    def __call__(self):
        """Calling the object returns the raw text instead of the annotated
        `doc`.
        """
        return self.text

    @property
    def text(self):
        """Getter/setter for text (string).

        Assigning a string replaces the text and re-generates the analysis
        via :meth:`update`.  Assigning anything else prints an error and
        leaves both the text and the `doc` untouched.
        """
        return self._text

    @text.setter
    def text(self, val):
        if not isinstance(val, str):
            print("Error: value for text is not a String.")
            # Nothing changed, so there is nothing to re-analyse.
            return
        self._text = val
        self.update()

    @property
    def doc(self):
        """Getter for the doc (i.e. the tokenized and analysed elements of
        the text).  Read-only.
        """
        return self._doc

    def update(self) -> bool:
        """Update the instance.

        This method also performs the text segmentation and analysis.

        :returns: ``True`` when the analysis ran and `doc` was replaced.
            ``False`` when the stored text is not a string or when no spaCy
            model is available for the language; in that case `doc` keeps
            its previous value.
        """
        # sanity checks
        if not isinstance(self._text, str):
            print("Error: Text.text is not a string.")
            return False

        ## load proper model
        model_name = LANG_SPACY_MODELS.get(self._lang)
        nlp = get_nlp(model_name)
        # get_nlp signals "no model" with the literal False; compare by
        # identity rather than relying on the pipeline object's equality.
        if nlp is False:
            print("Text.update(): ERROR. No spaCy model for language " +
                  f"'{self._lang}' in LANG_SPACY_MODELS. ")
            return False

        ## perform analysis
        self._doc = nlp(self._text)
        return True
################################################################################ ### EOF text.py