Source code for bp_text.pool

"""This module implements functionality for a (text) pool.

A text pool is a collection of annotated/tokenized, text-holding objects (e.g.
PdfFiles, TxtFiles) and can be generated from a BibTexDatabase object.  Its main
purpose is to facilitate interacting with a corpus of texts and the metadata
provided by the BibTex entries.

Created: 2025-04-25
Author: Ruben Philipp <me@rubenphilipp.com>

$$ Last modified:  22:36:31 Mon Apr 28 2025 CEST

"""

from random import randrange

from .pdf import PdfFile
from .txt import TxtFile

################################################################################

########################################
## get_data_func functions
## 2025-04-26

def random_data(ob):
    """Data getter that picks an index into `ob.data` at random.

    The result is drawn uniformly from the valid (zero-based) indices of
    `ob.data`.  See :py:func:`PoolItem.get_data` for details on how data getter
    functions are used.

    :param ob: The object whose `data` attribute should be indexed.
    :return: A random index into `ob.data`.
    :rtype: int
    """
    item_count = len(ob.data)
    return randrange(0, item_count)
def cycle_data(ob):
    """Data getter that walks through `ob.data` one index per call.

    Returns the index currently stored in `ob.next_data` and then advances
    `ob.next_data` by one, wrapping around to 0 after the last element of
    `ob.data`.  See :py:func:`PoolItem.get_data` for details on how data getter
    functions are used.

    :param ob: The object holding the `data` list and the `next_data` index.
    :return: The index that was stored in `ob.next_data` before advancing.
    :rtype: int
    """
    current = ob.next_data
    ## advance the pointer (wrapping at the end of the data list)
    ob.next_data = (current + 1) % len(ob.data)
    return current
########################################
class PoolItem:
    """This class implements a PoolItem.

    PoolItems are containers for metadata (e.g. retrieved from a BibTeX entry in
    a BibTexDatabase) and text holding objects (in the `data` attr), e.g.
    :py:class:`PdfFile` objects.  They are meant to be placed into a
    :py:class:`Pool`.

    :param key: A (unique) key.  This is most likely a BibTeX citekey.
    :type key: string
    :param meta: A dict holding metadata, most likely derived from a BibTeX
      entry.  Default = None, which creates a new, empty dict for this item.
    :type meta: dict
    :param data: A list holding one or more text holding objects (e.g. a
      :py:class:`PdfFile`).  Default = None, which creates a new, empty list
      for this item.
    :type data: list
    :param default_get_data_func: A default data getter for this item.  Either
      an integer which is an index to an element in `data`, or a function
      taking the `PoolItem` as its argument and returning such an index.
      Default = None, which stores a function that always returns 0 (i.e. the
      first element of `data`).  NB: :py:func:`get_data` does not read this
      attribute on its own; pass it as the `index` argument to apply it.
    :type default_get_data_func: int, function or None
    """

    def __init__(self, key, meta=None, data=None, default_get_data_func=None):
        self.key = key
        ## create fresh containers per instance; a literal `{}` / `[]` default
        ## would be shared by all items created without these arguments
        self.meta = {} if meta is None else meta
        self.data = [] if data is None else data
        ## the index of the next data object
        self._next_data = 0
        self.default_get_data_func = default_get_data_func

    ########################################

    @property
    def key(self):
        """Getter/setter for the key.

        The setter only accepts strings.  For any other value an error message
        is printed and the key is left untouched.
        """
        return self._key

    @key.setter
    def key(self, val):
        if isinstance(val, str):
            self._key = val
        else:
            ## NOTE(review): unlike the other setters this one does not raise.
            ## when this happens during __init__, `_key` is never set and
            ## reading `key` later fails with an AttributeError.
            print(f"PoolItem: ERROR '{val}' is not a valid key (must be " +
                  "a string.")

    @property
    def next_data(self):
        """Getter/setter for the next_data id.

        This is an index (zero-based) to the next element that should be
        retrieved from the data list when using :py:func:`get_data`.  (int)

        The setter wraps the value around the length of the data list.
        """
        return self._next_data

    @next_data.setter
    def next_data(self, val):
        self._next_data = val % len(self._data)

    @property
    def meta(self):
        """Getter/setter for the meta dict.

        :raises ValueError: When set to something other than a dict.
        """
        return self._meta

    @meta.setter
    def meta(self, val):
        if isinstance(val, dict):
            self._meta = val
        else:
            raise ValueError("PoolItem.meta expects a dict.")

    @property
    def data(self):
        """Getter/setter for the data list.

        :raises ValueError: When set to something other than a list of
          :py:class:`PdfFile` / :py:class:`TxtFile` objects.
        """
        return self._data

    @data.setter
    def data(self, val):
        if (
                isinstance(val, list)
                and all(isinstance(x, (PdfFile, TxtFile)) for x in val)
        ):
            self._data = val
        else:
            raise ValueError("PoolItem.data must be a list of PdfFile or " +
                             "TxtFile objects. ")

    @property
    def default_get_data_func(self):
        """Getter/setter for the default data getter.

        The setter accepts a function, an integer (both stored as they are) or
        None (stored as a function which always returns 0).

        :raises ValueError: When set to any other value.
        """
        return self._default_get_data_func

    @default_get_data_func.setter
    def default_get_data_func(self, val):
        if callable(val):
            self._default_get_data_func = val
        elif val is None:
            self._default_get_data_func = (lambda ignore: 0)
        elif isinstance(val, int):
            self._default_get_data_func = val
        else:
            raise ValueError("PoolItem.default_get_data_func must be None, " +
                             "an integer or a function. ")

    def get_data(self, index=0):
        """This function returns a single data object from the data list instead
        of the data list itself.

        The `index` argument specifies which element should be returned.  By
        default, it always returns the first item of the `data` list.  There
        are two pre-defined functions which could also be used as `index`:
        :py:func:`random_data` and :py:func:`cycle_data`.

        Example::

          # this is an example using a function instead of an integer.  the
          # function cycles through the data by using the pre-defined
          # `cycle_data` function.
          # NB: `pitm` here is a `PoolItem` object
          pitm.get_data(cycle_data)

        :param index: Either an integer which is a (zero-based) index to an
          element in `data`, or a function which must be a function taking the
          `PoolItem` as its argument and must return an index to the element of
          `data` which should be retrieved.  `None` falls back to index 0.
          The resulting index is wrapped around the length of `data`.
          Default = 0.
        :type index: int, function or None
        :return: The selected element of `data`.
        :raises ValueError: When `index` is neither an int, a function, nor
          None.
        :raises ZeroDivisionError: When `data` is empty (the index is wrapped
          via modulo of the data length).
        """
        if isinstance(index, int):
            next_index = index
        elif callable(index):
            # get value from function
            next_index = index(self)
        elif index is None:
            # fallback to index 0
            next_index = 0
        else:
            raise ValueError("PoolItem.get_data(): index is neither of type " +
                             "int nor a function.")
        next_index = next_index % len(self.data)
        return self.data[next_index]
################################################################################
class Pool:
    """Implementation of the Pool class.

    A (text) Pool is a collection of annotated/tokenized, text-holding objects
    (e.g.  PdfFiles, TxtFiles) and can be generated from a BibTexDatabase
    object.  Its main purpose is to facilitate interacting with a corpus of
    texts and the metadata provided by the BibTex entries.

    The `data` of the pool is a `dict` of :py:class:`PoolItem` objects.  The
    keys of the `dict` are typically (e.g. when the `Pool` is created from a
    :py:class:`BibTexDatabase`) citation keys.

    :param data: A `dict` with an initial set of :py:class:`PoolItem` objects.
      Default = None, which creates a new, empty dict for this pool.
    :type data: dict
    """

    def __init__(self, data=None):
        ## create a fresh dict per instance; a literal `{}` default would be
        ## shared by all pools created without arguments
        self.data = {} if data is None else data

    @property
    def data(self):
        """Getter/setter for the data of the `Pool`.  This is a `dict` with
        :py:class:`PoolItem` objects.  Keys are usually citekeys.

        :raises ValueError: When set to something other than a dict whose
          values are all :py:class:`PoolItem` instances.
        """
        return self._data

    @data.setter
    def data(self, val):
        ## the type check short-circuits, thus `.values()` is only touched
        ## when `val` actually is a dict
        if not (
                isinstance(val, dict)
                and all(isinstance(x, PoolItem) for x in val.values())
        ):
            raise ValueError("Pool: data must be a dict. all values need " +
                             "to be PoolItem instances.")
        self._data = val

    def get(self, key):
        """Get a :py:class:`PoolItem` from the pool by its key.

        :param key: The key of the item (usually a citekey).
        :return: The item stored under `key`, or None when there is no such
          key in the pool.
        """
        return self.data.get(key)
################################################################################ ### EOF pool.py