"""This module implements the TextFragment class.
A text fragment is an item that contains (in the data slot) a token or any other
text data, combined with any other relevant data. This is esp. a `key`, which
is most often a BibTeX citekey, and some other meta-data, most likely retrieved
from the `meta` slot of a :py:class:`Pool` instance.
Created: 2025-05-06
Author: Ruben Philipp <me@rubenphilipp.com>
$$ Last modified: 16:49:30 Wed May 7 2025 CEST
"""
from spacy.tokens import Token, Span, Doc
################################################################################
class TextFragment:
    """This class implements a TextFragment.

    A text fragment is an item that contains (in the data slot) a token or any
    other text data, combined with any other relevant data. This is esp. a
    `key`, which is most often a BibTeX citekey, and some other meta-data, most
    likely retrieved from the `meta` slot of a :py:class:`Pool` instance.

    :param key: A (unique) key. This is most likely a BibTeX citekey.
    :type key: string
    :param page_label: An optional page label. When set, it is appended to the
      citation generated by :py:meth:`format_org`.
    :type page_label: string or None
    :param meta: A dict holding metadata, most likely derived from a BibTeX
      entry. When omitted (or None), a fresh empty dict is created for this
      instance.
    :type meta: dict or None
    :param data: Any (text) data associated with this item. This is most likely
      a `str` or a spaCy `Token`, `Span` or `Doc` object.
    :type data: any
    """

    def __init__(self,
                 key,
                 page_label=None,
                 meta=None,
                 data=None):
        self._key = key
        self._page_label = page_label
        # A `{}` literal as the default would be created once and shared by
        # every instance constructed without `meta`; build a new dict instead.
        self._meta = {} if meta is None else meta
        self._data = data

    ########################################

    @property
    def key(self):
        """Getter/setter for the key.
        """
        return self._key

    @key.setter
    def key(self, val):
        self._key = val

    @property
    def page_label(self):
        """Getter/setter for the page label (str).
        """
        return self._page_label

    @page_label.setter
    def page_label(self, val):
        self._page_label = val

    @property
    def meta(self):
        """Getter/setter for the meta dict.

        :raises ValueError: When the assigned value is not a dict.
        """
        return self._meta

    @meta.setter
    def meta(self, val):
        if not isinstance(val, dict):
            raise ValueError("TextFragment.meta expects a dict.")
        self._meta = val

    @property
    def data(self):
        """Getter/setter for the data.
        """
        return self._data

    @data.setter
    def data(self, val):
        self._data = val

    ########################################

    def format_org(self,
                   cite=True,
                   force_cite=False):
        """This returns a formatted string with the text contained in `data` in
        org-mode syntax.

        :param cite: When True, an org-cite citation will be appended to the
          generated string. Default = True
        :type cite: boolean
        :param force_cite: When True (and `cite` = True), also force a citation
          for text elements that normally don't "require" a citation
          (i.e. tokens tagged PUNCT or SPACE).
          Default = False
        :type force_cite: boolean
        :returns: A string with the text of `data` formatted in org-mode
          syntax, or None (after printing a notice) when `data` is neither a
          `str` nor a spaCy `Token`, `Span` or `Doc`.
        """
        # Only a Token carries a part-of-speech tag; for every other kind of
        # data this stays None, so the citation is never suppressed.
        pos = None
        if isinstance(self.data, str):
            text = self.data
        elif isinstance(self.data, Token):
            text = self.data.text
            pos = self.data.pos_
        elif isinstance(self.data, (Span, Doc)):
            text = self.data.text
        else:
            print("TextFragment.format_org: The data does not contain valid "
                  + "text.")
            return None
        # Punctuation and whitespace tokens are only cited on explicit request.
        wants_cite = force_cite or pos not in ("PUNCT", "SPACE")
        if cite and self.key and wants_cite:
            if self.page_label:
                text += f"[cite:@{self.key} {self.page_label}]"
            else:
                text += f"[cite:@{self.key}]"
        return text
################################################################################
def textfragments_to_org(fragment_list,
                         cite=True,
                         force_cite=False):
    """This function takes a `list` of :py:class:`TextFragment` objects and
    returns a `string` formatted in org-mode syntax, (optionally) including
    org-cite references.

    Fragments whose `format_org` yields None (i.e. fragments without usable
    text) are left out of the result instead of aborting the conversion.

    :param fragment_list: The :py:class:`TextFragment` objects.
    :type fragment_list: A list with :py:class:`TextFragment` objects.
    :param cite: When True, an org-cite citation will be appended to the
      generated string. Default = True
    :type cite: boolean
    :param force_cite: When True (and `cite` = True), also force a citation for
      text elements that normally don't "require" a citation (e.g. PUNCT).
      Default = False
    :type force_cite: boolean
    :returns: The concatenated org-mode string; an empty string for an empty
      list.
    :raises ValueError: When `fragment_list` is not a list consisting solely of
      :py:class:`TextFragment` objects.
    """
    if not (isinstance(fragment_list, list)
            and all(isinstance(ele, TextFragment) for ele in fragment_list)):
        raise ValueError("textfragments_to_org: fragment_list must be a list "
                         + "of TextFragment objects.")
    ## render every fragment first and keep only those that produced text;
    ## format_org returns None for data it cannot turn into a string, and
    ## concatenating that to a str would raise a TypeError
    rendered = []
    for frag in fragment_list:
        text = frag.format_org(cite=cite,
                               force_cite=force_cite)
        if text is not None:
            rendered.append((frag, text))
    parts = []
    last = len(rendered) - 1
    for i, (frag, text) in enumerate(rendered):
        parts.append(text)
        if i == last:
            break
        ## add a space if this is not the last fragment and the next fragment's
        ## token is not punctuation (and this token is not whitespace itself)
        ## always adds a space if either data is something else than a
        ## spacy Token
        next_frag = rendered[i + 1][0]
        if (isinstance(next_frag.data, Token)
                and isinstance(frag.data, Token)):
            if not next_frag.data.is_punct and not frag.data.is_space:
                parts.append(" ")
        else:
            parts.append(" ")
    return "".join(parts)
################################################################################
## EOF textfragment.py