"""
This module implements functionality for TXT files.
Created: 2025-03-29
Author: Ruben Philipp <me@rubenphilipp.com>
$$ Last modified: 11:04:38 Wed May 7 2025 CEST
"""
import os
import sys
import langcodes
from . import language
from . import utilities
from . import text
from .file import File
from .page import Page
################################################################################
class TxtPage(Page):
    """This is a class implementation of a TXT page. Usually TXT files (.txt)
    only contain a single page. Anyway, esp. in order to comply with the
    structure of `PdfFile` objects, `TxtFile` objects also use (usually) one
    `TxtPage` to store the (analyzed/tokenized) contents.

    The `data` attribute (read-only) is an alias to the `raw_text` attribute of
    the respective page while the `text` attribute contains the
    analyzed/tokenized text.

    :param page_num: The page number (zero-based) of the page in the related
      file.
    :type page_num: int
    :param page_label: The actual page label of the page in the file.
      This will be used e.g. for citations in generated text.
    :type page_label: string
    :param raw_text: Holds the actual raw text of the page, extracted from the
      data.
    :type raw_text: string
    :param lang: The language code of the primary language in the ISO-639-1 form
      (e.g. "de" or "en").
    :type lang: string
    :param verbose: Print additional information during performance when True.
      Default = True
    :type verbose: boolean
    """

    ## NB: this must be spelled `__init__`. A method named `__init` is
    ## name-mangled to `_TxtPage__init` and never runs on instantiation.
    def __init__(self,
                 page_num = None,
                 page_label = None,
                 raw_text = "",
                 lang = "",
                 verbose = True):
        ## a TXT page has no separate data payload; `data` is served by the
        ## read-only property below, hence `None` is handed to the superclass
        super().__init__(page_num = page_num,
                         page_label = page_label,
                         data = None,
                         raw_text = raw_text,
                         lang = lang,
                         verbose = verbose)

    ########################################

    @property
    def data(self):
        """Getter (alias) for the `raw_text` (read-only).
        """
        return self._raw_text
################################################################################
class TxtFile(File):
    """Implementation of the text-file (txt) class.

    Note: The `data` attribute holds a list of (usually one) `TxtPage`
    object(s). This is intentionally analogous to :py:class:`PdfPage`.

    Note: `update()` runs on construction and whenever `file` or `data` is
    set. It re-reads the file and, when language detection succeeds,
    overwrites `lang` with the detected language.

    Example::

        ## instantiate the text file object and read its contents
        text = txt.TxtFile("something.txt")
        ## get the primary language
        print(text.lang)
        ## => "en"

    :param file: The path to the text file.
    :type file: string
    :param lang: The language of the text file (e.g. "en", "de" etc.).
    :type lang: string
    """

    def __init__(self,
                 file: str,
                 lang = ""):
        ## the primary language (stored as given; replaced by `update()` when
        ## detection succeeds)
        self._lang = lang
        ## the raw text (empty for now)
        self._raw_text = None
        self._data = None
        ########################################
        super().__init__(file, self._data)
        ########################################
        self.update()

    @File.file.setter
    def file(self, val):
        """Set the file path via the superclass's setter, then re-read the
        file.
        """
        # call superclass's setter
        super(TxtFile, self.__class__).file.fset(self, val)
        self.update()

    @property
    def lang(self):
        """Getter/setter for the language.

        The setter normalizes the value with `langcodes.standardize_tag`.
        """
        return self._lang

    @lang.setter
    def lang(self, val):
        ## a property setter's return value is discarded, so nothing is
        ## returned here
        self._lang = langcodes.standardize_tag(val)

    @property
    def data(self):
        """Getter/setter for the data (i.e. the txtfile content).

        Setting the data calls the superclass's setter and then `update()`,
        which re-reads the file and rebuilds the list of `TxtPage` objects.
        """
        return self._data

    ## NB: `@data.setter` (not `@File.data.setter`) so that the getter defined
    ## right above is kept; the latter would rebind `data` to a property that
    ## uses the superclass's getter instead.
    @data.setter
    def data(self, val):
        # call superclass's setter
        super(TxtFile, self.__class__).data.fset(self, val)
        self.update()

    ########################################

    def update(self):
        """Updates the instance.

        Reads the file into the raw text, detects the primary language and
        rebuilds `data` as a list holding a single `TxtPage`.

        :return: The instance itself, or False (after printing an error) when
          the file does not exist.
        """
        if not os.path.isfile(self._file):
            print(f"Error: The file {self._file} does not exist. ")
            return False
        ## set data
        with open(self.file, "r") as f:
            self._raw_text = f.read()
        ## set language
        ##
        ## `get_primary_lang` returns False when nothing could be detected
        ## (e.g. for an empty file). In that case the current language is kept
        ## instead of handing a non-string to the `lang` setter.
        detected = self.get_primary_lang()
        if detected:
            self.lang = detected
        ## now, we create a list with one TxtPage object
        ##
        ## maybe, there will be an option to further split a TXT file into
        ## multiple pages, but not now
        ## RP  Mon May  5 23:41:43 2025
        self._data = [TxtPage(page_num = None,
                              page_label = None,
                              raw_text = self._raw_text,
                              lang = self._lang,
                              verbose = self._verbose)]
        return self

    def get_primary_lang(self):
        """Detect the primary language of the raw text.

        This method does not modify the instance; the caller decides what to
        do with the result.

        :return: The name of the detected language's ISO-639-1 code, or False
          (after printing an error) when there is no text or no language
          could be determined.
        """
        if not self._raw_text:
            print("Error: Cannot detect language. No data!")
            return False
        detector = language.LanguageDetector().detector
        detected = detector.detect_language_of(self._raw_text)
        ## NOTE(review): assumes a lingua-style detector, whose
        ## `detect_language_of` may return None when no language can be
        ## determined -- confirm against `language.LanguageDetector`.
        if detected is None:
            print("Error: Cannot detect language. No language determined!")
            return False
        return detected.iso_code_639_1.name
################################################################################
## EOF txt.py