Source code for bp_text.page

"""
This module implements the page class.

Created: 2025-03-28
Author: Ruben Philipp <me@rubenphilipp.com>

$$ Last modified:  00:35:26 Tue Apr 29 2025 CEST
"""

from abc import ABC, abstractmethod


import roman
import langcodes

from . import language
from . import utilities
from . import text

################################################################################

class Page(ABC):
    """Abstract base class for a page.

    Note: The `text` attribute holds a :py:class:`Text` object containing
    tokenized text derived from the `raw_text`. It is rebuilt by
    :py:meth:`update`.

    :param page_num: The page number (zero-based) of the page in the related
        file.
    :type page_num: int
    :param page_label: The actual page label of the page. Its value and
        meaning differs from the page_num as it is related to the actual page
        numbering e.g. in a document. Thus, it could also be a roman numeral
        or be counted from a starting index different from 0.
    :type page_label: string
    :param data: Holds page data.
    :type data: undefined
    :param raw_text: Holds the actual raw text of the page, extracted from
        the data.
    :type raw_text: string
    :param lang: The language code of the primary language in ISO 639-1
        form. When `raw_text` is not empty, this value is replaced by the
        detected language during initialisation (see :py:meth:`update`).
    :type lang: string
    :param verbose: Print additional information during performance when
        True. Default = False
    :type verbose: boolean
    """

    def __init__(self, page_num=None, page_label=None, data=None,
                 raw_text="", lang="", verbose=False):
        """Store the page attributes and build the derived state.

        See the class docstring for the meaning of the parameters.
        """
        ## the page number / index
        self._page_num = page_num
        ## the page number (number) label
        ## this might differ from the actual page number e.g.
        ## when sections of a document are labeled with roman
        ## numerals
        self._page_label = page_label
        ## additional data
        self._data = data
        ## the slot for the Text object (filled by update())
        self._text = None
        ## the raw text
        self._raw_text = raw_text
        ## the primary language of the page's content
        ## NB: stored as given here, i.e. without passing through the
        ## `lang` setter and its standardization
        self._lang = lang
        ## verbose
        self._verbose = verbose
        ## detect the language and create the Text object
        self.update()

    ########################################

    @property
    def page_num(self):
        """The page number (index) of the page."""
        return self._page_num

    @page_num.setter
    def page_num(self, val):
        self._page_num = val

    @property
    def page_label(self):
        """The page label, which may differ from the page number."""
        return self._page_label

    @page_label.setter
    def page_label(self, val):
        self._page_label = val

    @property
    def data(self):
        """Additional page data."""
        return self._data

    @data.setter
    def data(self, val):
        self._data = val

    @property
    def raw_text(self):
        """The raw text of the page.

        Assigning a string replaces the raw text and calls
        :py:meth:`update`. Any other value is rejected with a printed error
        message and leaves the instance untouched.
        """
        return self._raw_text

    @raw_text.setter
    def raw_text(self, val):
        if not isinstance(val, str):
            print("Error: value for raw_text is not a String.")
            ## nothing changed, so there is nothing to rebuild; skipping
            ## update() also keeps a manually set language intact
            return
        self._raw_text = val
        self.update()

    @property
    def text(self):
        """Getter for the Text (read-only).
        """
        return self._text

    @property
    def lang(self):
        """The language code of the page's primary language.

        Non-empty values are passed through `langcodes.standardize_tag`
        when set; the empty string is stored unchanged.
        """
        return self._lang

    @lang.setter
    def lang(self, val):
        if val != "":
            self._lang = langcodes.standardize_tag(val)
        else:
            self._lang = ""

    @property
    def verbose(self):
        """Verbose setter/getter (bool)
        """
        return self._verbose

    @verbose.setter
    def verbose(self, val):
        self._verbose = val

    ########################################

    def update(self):
        """Updates the instance.

        Re-detects the language of the raw text (overwriting `lang` when the
        raw text is not empty) and then creates a new Text object from the
        raw text and the current language.
        """
        ## detect and update language
        self.detect_lang(set_lang=True)
        ## create the text object
        if self._verbose:
            print("Page.update(): Initializing text object. ")
        self._text = text.Text(self._raw_text, lang=self.lang)

    def detect_lang(self, set_lang=True):
        """Detect the primary language of text in the page.

        :param set_lang: When true, automatically set the language attribute
            of the page. Default = True.
        :type set_lang: boolean
        :return: False when the raw text is empty (the language attribute is
            then left unchanged). Otherwise the name of the detected
            language's ISO 639-1 code, or "en" when the detector does not
            return a language.
        :rtype: string or boolean
        """
        ## nothing to detect: bail out before a detector is requested
        if self._raw_text == "":
            return False
        detector = language.LanguageDetector().detector
        detected = detector.detect_language_of(self._raw_text)
        # TODO: this fallback needs to be inspected further
        # RP  Tue Apr 29 00:35:25 2025
        if detected is None:
            # fallback to en
            langcode = "en"
        else:
            langcode = detected.iso_code_639_1.name
        if set_lang:
            ## the setter standardizes the tag
            self.lang = langcode
        return langcode

    def count_words(self):
        """Counts the words in the text.

        Words are the whitespace-separated chunks of the raw text.

        :return: The number of words in the text.
        :rtype: integer
        """
        return len(self._raw_text.split())
################################################################################ ## EOF page.py