"""
This module implements functionality for PDF files.
Created: 2025-03-27
Author: Ruben Philipp <me@rubenphilipp.com>
$$ Last modified: 11:05:21 Wed May 7 2025 CEST
"""
import os
import sys
import re
from pathlib import Path
from abc import ABC, abstractmethod
from lingua import Language, LanguageDetectorBuilder
from pypdf import PageObject
from pypdf import PdfReader
from pypdf.errors import PdfReadError
from pdf2image import convert_from_path
import pytesseract
import roman
import langcodes
from .file import File
from .page import Page
from . import language
from . import utilities
from . import text
################################################################################
[docs]
class PdfPage(Page):
    """This is a class implementation for a PDF page. A PDF page holds a
    reference to a page in a PDF document, usually related to a
    :py:class:`bp_text.pdf.PdfFile` object.

    The `data` attribute is also capable of holding a pypdf.PageObject
    (optional), while the `text` attribute contains the analyzed/tokenized
    text.

    Please note that the pypdf.PageObject instances in the `data` attribute
    will **not** be (re-)stored when (un-)pickling the PdfPage.

    :param page_num: The page number (zero-based) of the page in the related
      file.
    :type page_num: int
    :param page_label: The actual page label of the page in the PDF file.
      The actual PDF page number/label is defined in the PDF header and could
      differ from the `page_num` (e.g. by varying the start index or by being
      a roman instead of an arabic numeral).
    :type page_label: string
    :param data: A `pypdf.PageObject`. Default = None
    :type data: A `pypdf.PageObject`
    :param raw_text: Holds the actual raw text of the page, extracted from the
      data.
    :type raw_text: string
    :param lang: The language code of the primary language in the ISO-639-1
      form (e.g. "de" or "en").
    :type lang: string
    :param file: An optional (back-)reference to a PdfFile object.
    :type file: A :py:class:`PdfFile` object.
    :param verbose: Print additional information during performance when True.
      Default = True
    :type verbose: boolean
    """

    def __init__(self,
                 page_num = None,
                 page_label = None,
                 ## here, data holds a pypdf.PageObject (or None)
                 data = None,
                 raw_text = "",
                 lang = "",
                 ## can include a reference to a PdfFile object
                 file = None,
                 verbose = True):
        """Constructor method.

        The `file` back-reference is stored as given (it is not routed
        through the validating `file` setter); all other arguments are
        forwarded positionally to the parent constructor.
        """
        self._file = file
        super(PdfPage, self).__init__(page_num,
                                      page_label,
                                      data,
                                      raw_text,
                                      lang,
                                      verbose)
        ## call this again to perform tests
        self.data = data

    ########################################
    # remove data when pickling
    # RP Tue Apr 29 12:30:09 2025
    def __getstate__(self):
        """Remove unpicklable PageObject before pickling.

        :return: A shallow copy of the instance dict with `_data` set to
          None. The live instance is left untouched.
        :rtype: dict
        """
        state = self.__dict__.copy()
        state['_data'] = None  # Remove pypdf.PageObject
        return state

    ########################################

    @property
    def data(self):
        """
        Read/write property for the data of the object.
        """
        return self._data

    @data.setter
    def data(self, val):
        """
        Set the `data` value of the object.

        Only `None` or a `pypdf.PageObject` is accepted. Any other value is
        rejected with a printed error message and the previously stored
        value is kept.
        """
        ## test if data is a pypdf PageObject
        if val is not None and not isinstance(val, PageObject):
            # Both parts of the message need to be interpolated; previously
            # only the first literal was an f-string, so "{type(val)}" was
            # printed verbatim.
            print(f"Error: The value for data is not a pypdf.PageObject, but "
                  + f"a {type(val)}")
            # Make sure the attribute exists even when the very first
            # assignment is rejected, so reading `data` never raises.
            if not hasattr(self, "_data"):
                self._data = None
            # NOTE: the return value of a property setter is discarded by
            # attribute assignment; it is kept for direct `fset` callers.
            return False
        self._data = val

    @property
    def file(self):
        """
        Read/write property. The (back-)reference to the related
        :py:class:`PdfFile` object (or None).
        """
        return self._file

    @file.setter
    def file(self, val):
        """
        Set the `file` back-reference. Only :py:class:`PdfFile` instances
        are accepted; anything else is rejected with a printed error and
        the previous reference is kept.
        """
        if isinstance(val, PdfFile):
            self._file = val
        else:
            print("Error: val is not of type PdfFile. ")
            return False

    ########################################
########################################
################################################################################
[docs]
class PdfFile(File):
    """
    This is a class implementation of a PDF file. A PDF file object is related
    to an actual PDF file (e.g. retrieved from a database entry). Its methods
    facilitate e.g. the retrieval of data/text from the pages.

    The `data` attribute holds the :py:class:`PdfPage` objects as a `list`.

    Examples::

      ## load a PDF file and extract the content from its pages
      pdfFile = pdf.PdfFile("bajohr2024a.pdf", auto_extract=True)
      ## get the primary language
      print(pdfFile.lang)
      ## => "de"
      ## get the label from the second page (in this case a roman numeral)
      pdfFile.data[1].page_label
      ## => "II"
      ## get the text from the third page
      pdfFile.data[2].text

    :param file: The filepath.
    :type file: string
    :param auto_extract: Automatically extract the text from all pages in the
      file when instantiating the object? This also automatically creates
      :py:class:`PdfPage` objects for each page. Default = True
    :type auto_extract: boolean
    :param use_ocr: Use OCR by default for text extraction? Default = False
    :type use_ocr: boolean
    :param fallback_to_ocr: If text extraction without OCR yields little text,
      fallback to OCR? Default = True
    :type fallback_to_ocr: boolean
    :param ocr_dpi: The DPI amount for OCR. Default = 300
    :type ocr_dpi: integer
    :param ocr_default_lang: The default language for OCR. Default = "eng"
    :type ocr_default_lang: string
    :param verbose: Print additional information during performance when True.
      Default = True
    :type verbose: boolean
    """

    def __init__(self,
                 file: str,
                 auto_extract = True,
                 use_ocr = False,
                 fallback_to_ocr = True,
                 ocr_dpi = 300,
                 ocr_default_lang = 'eng',
                 verbose=True):
        """Constructor method.

        All private slots are initialized *before* the parent constructor
        is called, so that they already exist should the parent assign the
        file through the (overridden) `file` property, which calls
        :py:meth:`update`.
        """
        ## The pypdf.PdfReader object
        self._reader = None
        ## The number tree of the PDF
        ## cf. https://www.w3.org/WAI/GL/WCAG20-TECHS/PDF17.html
        self._number_tree = None
        ## The PDF primary language
        self._lang = ""
        ## Automatically extract data
        self._auto_extract = auto_extract
        ## Extraction args
        self._use_ocr = use_ocr
        self._fallback_to_ocr = fallback_to_ocr
        self._ocr_dpi = ocr_dpi
        self._ocr_default_lang = ocr_default_lang
        ########################################
        super(PdfFile, self).__init__(file = file,
                                      data = None,
                                      verbose = verbose)
        ########################################
        # NOTE(review): if File.__init__ assigns via the `file` property,
        # update() has already run once at this point -- confirm.
        self.update()

    ########################################
    # remove pypdf objects before pickling
    # RP Tue Apr 29 11:54:22 2025
    def __getstate__(self):
        """Customize pickling: remove unpicklable attributes.

        The reader and the number tree are dropped from the returned state.
        The pages in `_data` are deliberately *not* modified here: the state
        is only a shallow copy, so clearing `page.data` would wipe the
        PageObjects of the live instance as a side effect of pickling.
        Each :py:class:`PdfPage` already strips its own PageObject in
        `PdfPage.__getstate__` when it is pickled.

        :return: A shallow copy of the instance dict without pypdf objects.
        :rtype: dict
        """
        state = self.__dict__.copy()
        state['_reader'] = None  # Exclude reader
        state['_number_tree'] = None
        return state

    def __setstate__(self, state):
        """Customize unpickling: restore state and update reader.

        Only the reader is recreated; the number tree and the PageObjects
        of the pages stay None after unpickling.
        """
        self.__dict__.update(state)
        if self._file:
            self.set_reader()  # Recreate PdfReader TODO

    ########################################

    @File.file.setter
    def file(self, val):
        """Set the file via the parent's setter, then refresh the instance."""
        super(PdfFile, self.__class__).file.fset(self, val)
        self.update()

    @property
    def lang(self):
        """The language.
        """
        return self._lang

    @lang.setter
    def lang(self, val):
        """Set the primary language (standardized via `langcodes`) and
        derive the OCR default language (alpha3) from it.
        """
        # fallback to "en" if no lang is set
        # TODO: is there a better solution?
        # RP Tue Apr 29 00:55:31 2025
        if not val:
            val = "en"
        self._lang = langcodes.standardize_tag(val)
        ## also set OCR default lang (alpha3)
        self._ocr_default_lang = langcodes.get(self._lang) \
                                          .to_alpha3()
        return self._lang

    @property
    def data(self):
        """The data.

        This is a list of :py:class:`PdfPage` objects, provided the data has
        been extracted (cf. `auto_extract`).
        """
        return self._data

    # Use the property defined right above. `@File.data.setter` would build
    # a new property from the *parent's* getter and silently discard the
    # getter (and its docstring) defined in this class.
    @data.setter
    def data(self, val):
        """Set the data without further validation."""
        self._data = val

    @property
    def reader(self):
        """The `pypdf.PdfReader` object (read-only).
        """
        return self._reader

    @property
    def auto_extract(self):
        """Do auto-extraction?
        """
        return self._auto_extract

    @auto_extract.setter
    def auto_extract(self, val):
        """Set the auto-extraction flag; non-boolean values are rejected
        with a printed error and the previous value is kept.
        """
        if isinstance(val, bool):
            self._auto_extract = val
        else:
            print(f"Error: '{val}' is not of type Boolean")
            return False

    ########################################

    def set_reader(self):
        """This method sets the reader slot to the file.

        This was previously done in the update method, but since pypdf objects
        (just as the reader) cannot be pickled, we separate this process here
        in order to be at least able to reconstruct the reader when unpickling
        a PdfFile object.

        :return: True when the reader has been created, False when the file
          does not exist or is not a readable PDF.
        :rtype: boolean
        """
        if not (self._file and os.path.isfile(self._file)):
            # the file does not exist
            return False
        try:
            self._reader = PdfReader(self._file)
        except PdfReadError:
            print(f"Error: Invalid PDF file {self._file}")
            return False
        return True

    def update(self):
        """Updates the instance.

        (Re-)creates the reader, loads the page-label number tree,
        optionally extracts the text from all pages and determines the
        primary language when none is set yet.

        :return: The instance itself, or False when the file could not be
          read.
        """
        ## (re-)initialize the reader object
        if not self.set_reader():
            print(f"PdfFile.update(): Error: The file '{self._file}' does not "
                  + "exist or could not be read.")
            return False
        ## Initialize the number tree
        self._number_tree = self._reader.trailer['/Root'] \
                                        .get('/PageLabels')
        ## auto-extract
        # NOTE(review): extract_text() is not defined in this part of the
        # file; presumably it is provided elsewhere -- confirm.
        if self._auto_extract:
            self._data = self.extract_text()
        ## set (primary) language if not given
        if not self.lang and self.data:
            self.lang = self.get_primary_lang()
        return self

    @staticmethod
    def _letter_label(number):
        """Return the uppercase letter label for a 1-based number
        (A..Z, then AA..ZZ, then AAA..ZZZ, ...); "" for numbers < 1.
        """
        if number < 1:
            return ""
        repeat, offset = divmod(number - 1, 26)
        return chr(ord('A') + offset) * (repeat + 1)

    @staticmethod
    def _format_label_number(style, number):
        """Format the numeric portion of a page label for a PDF numbering
        style (`/D`, `/R`, `/r`, `/A`, `/a`); "" for no/unknown style.
        """
        if style == '/D':  # Decimal
            return str(number)
        if style == '/R':  # Uppercase Roman
            return roman.toRoman(number).upper()
        if style == '/r':  # Lowercase Roman
            return roman.toRoman(number).lower()
        if style == '/A':  # Uppercase letters
            return PdfFile._letter_label(number)
        if style == '/a':  # Lowercase letters
            return PdfFile._letter_label(number).lower()
        # no style given: the label consists of the prefix only
        return ""

    def get_page_label(self, page_num):
        """
        Returns the label (i.e. the page number according to the PDF number
        tree) of a pdf page by index (page_num, zero-based).

        When there is no number tree, no `/Nums` array, a malformed array or
        no label range covering the page, the one-based page number is
        returned as a string instead.

        :param page_num: The page number (zero-based) the label should be
          retrieved from.
        :type page_num: integer
        :return: The page label.
        :rtype: string
        """
        default_label = str(page_num + 1)
        if not self._number_tree:
            # no number tree, use page numstring instead
            return default_label
        label_tuples = self._number_tree.get_object().get('/Nums')
        if not label_tuples:
            # edge case: no label mapping defined
            # RP Tue Apr 29 00:52:36 2025
            return default_label
        if len(label_tuples) % 2 != 0:
            print("Error: Label number tree is malformed.")
            return default_label
        ## `/Nums` is a flat array of (start index, label dict) pairs; the
        ## range that applies is the one with the greatest start index that
        ## does not exceed page_num.
        match_start = None
        match_entry = None
        for i in range(0, len(label_tuples), 2):
            start_index = label_tuples[i]
            if start_index > page_num:
                continue
            if match_start is None or start_index >= match_start:
                match_start = start_index
                match_entry = label_tuples[i + 1]
        if match_entry is None:
            # the page precedes the first labelled range
            return default_label
        label_dict = match_entry.get_object()
        ## keys of PDF dictionaries carry a leading slash (`/P`, not `P`)
        prefix = label_dict.get('/P', '')
        start_number = label_dict.get('/St', 1)
        style = label_dict.get('/S')
        number = start_number + (page_num - match_start)
        return prefix + self._format_label_number(style, number)

    def get_primary_lang(self):
        """
        Get the primary language of a PDF.

        This is the language occurring on most pages; on a tie, the
        language which appears first in page order wins.

        :return: The language of most pages, or False when there are no
          pages.
        """
        pages = self.data
        if not pages:
            print("Error: Cannot detect language. No data!")
            return False
        ## count the pages per language (dicts keep insertion order)
        counts = {}
        for page in pages:
            counts[page.lang] = counts.get(page.lang, 0) + 1
        ## max() returns the first maximal key
        return max(counts, key=counts.get)

    def get_page(self, page_index):
        """Returns the `PdfPage` object for the page at index (zero-based).

        Negative indices count from the end, as with lists.

        :return: The page, or False when there is no data or the index is
          out of range.
        """
        pages = self.data
        if not pages:
            return False
        if -len(pages) <= page_index < len(pages):
            return pages[page_index]
        return False
################################################################################
## EOF pdf.py