Source code for bp_text.pdf

"""
This module implements functionality for PDF files. 

Created: 2025-03-27
Author: Ruben Philipp <me@rubenphilipp.com>

$$ Last modified:  11:05:21 Wed May  7 2025 CEST
"""

import os
import sys
import re
from pathlib import Path
from abc import ABC, abstractmethod

from lingua import Language, LanguageDetectorBuilder

from pypdf import PageObject
from pypdf import PdfReader
from pypdf.errors import PdfReadError
from pdf2image import convert_from_path
import pytesseract
import roman
import langcodes

from .file import File
from .page import Page
from . import language
from . import utilities
from . import text


################################################################################


[docs]
class PdfPage(Page):
    """This is a class implementation for a PDF page.  A PDF page holds is a
    reference to a page in a PDF document, usually related to a
    :py:class:`bp_text.pdf.PdfFile` object.

    The `data` attribute is also capable of holding a pypdf.PageObject
    (optional), while the `text` attribute contains the analyzed/tokenized text.
    
    Please note that the pypdf.PageObject instances in the `data` attribute will
    **not** be (re-)stored when (un-)pickling the PdfPage. 

    :param page_num: The page number (zero-based) of the page in the related
        file.
    :type page_num: int
    :param page_label: The actual page label of the page in the PDF file.
        The actual PDF page number/label is defined in the PDF header and could
        differ from the `page_num` (e.g. by using a different start index or
        a roman instead of an arabic numeral).
    :type page_label: string
    :param data: A `pypdf.PageObject`. Default = None
    :type data: A `pypdf.PageObject`
    :param raw_text: Holds the actual raw text of the page, extracted from the
       data. 
    :type raw_text: string
    :param lang: The language code of the primary language in the ISO-639-1 form
       (e.g. "de" or "en"). 
    :type lang: string
    :param file: An optional (back-)reference to a PdfFile object.
    :type file: A :py:class:`PdfFile` object.
    :param verbose: Print additional information during performance when True.
       Default = True
    :type verbose: boolean

    """

[docs]
    def __init__(self,
                 page_num = None,
                 page_label = None,
                 ## here, data holds a pypdf.PageObject (or None)
                 data = None,
                 raw_text = "",
                 lang = "",
                 ## can include a reference to a PdfFile object
                 file = None,
                 verbose = True):
        """Constructor method.

        Stores the optional back-reference to the owning file, forwards the
        remaining arguments (positionally) to the base-class constructor and
        finally assigns `data` through its property setter.
        """
        ## the back-reference is written directly, i.e. without going through
        ## the type check of the `file` setter
        self._file = file
        base_args = (page_num, page_label, data, raw_text, lang, verbose)
        super().__init__(*base_args)
        ## assign once more via the property so that the PageObject type test
        ## of the `data` setter is performed
        self.data = data


    ########################################
    # remove data when pickling
    # RP  Tue Apr 29 12:30:09 2025

    def __getstate__(self):
        """Return the state used for pickling, without the page data.

        The result is a shallow copy of the instance dictionary in which the
        `_data` entry (the pypdf.PageObject) is replaced by None.  The
        instance itself is left untouched.
        """
        return {**self.__dict__, '_data': None}


    ########################################

    @property
    def data(self):
        """
        Read/write property for the data of the object.

        The value is either None or a `pypdf.PageObject`.
        """
        return self._data

    @data.setter
    def data(self, val):
        """
        Set the `data` value of the object.

        Only None or a `pypdf.PageObject` is accepted.  For any other value
        an error message naming the offending type is printed and the
        currently stored data is left unchanged.
        """
        ## test if data is a pypdf PageObject
        if val is not None and not isinstance(val, PageObject):
            ## the part containing the placeholder has to be an f-string,
            ## otherwise "{type(val)}" is printed literally
            print("Error: The value for data is not a pypdf.PageObject, but "
                  + f"a {type(val)}")
            return False
        self._data = val

    @property
    def file(self):
        """
        The (back-)reference to the related `PdfFile` object (read/write).
        """
        return self._file

    @file.setter
    def file(self, val):
        """
        Store `val` as the related file.  Values that are not a `PdfFile`
        are rejected with a printed error and the attribute is not changed.
        """
        if not isinstance(val, PdfFile):
            print("Error: val is not of type PdfFile. ")
            return False
        self._file = val

    ########################################


[docs]
    def extract_text(self, update_text = True):
        """
        Run the direct text extraction of the underlying page object.

        :param update_text: When True, the extracted text is also stored in
            the `_raw_text` attribute of this page.
        :type update_text: boolean
        :return: The extracted text, or False (after printing an error) when
            the page holds no data.
        :rtype: string

        """
        page_object = self.data
        if not page_object:
            print("Error: No data.")
            return False

        extracted = page_object.extract_text()
        if update_text:
            self._raw_text = extracted
        return extracted





################################################################################


[docs]
class PdfFile(File):
    """
    This is a class implementation of a PDF file.  A PDF file object is related
    to an actual PDF file (e.g. retrieved from a database entry).  Its methods
    facilitate e.g. the retrieval of data/text from the pages.

    The `data` attribute holds the :py:class:`PdfPage` objects as a `list`. 

    Examples::

        ## load a PDF file and extract the content from its pages
        pdfFile = pdf.PdfFile("bajohr2024a.pdf", auto_extract=True)
        
        ## get the primary language
        print(pdfFile.lang)
        ## => "de"
        
        ## get the label from the second page (in this case a roman numeral)
        pdfFile.data[1].page_label
        ## => "II"

        ## get the text from the third page
        pdfFile.data[2].text
        

    :param file: The filepath.
    :type file: string
    :param auto_extract: Automatically extract the text from all pages in the
        file when instantiating the object? This also automatically creates
        :py:class:`PdfPage` objects for each page. Default = True
    :type auto_extract: boolean
    :param use_ocr: Use OCR by default for text extraction? Default = False
    :type use_ocr: boolean
    :param fallback_to_ocr: If text extraction without OCR yields little text,
        fallback to OCR? Default = True
    :type fallback_to_ocr: boolean
    :param ocr_dpi: The DPI amount for OCR. Default = 300
    :type ocr_dpi: integer
    :param ocr_default_lang: The default language for OCR. Default = "eng"
    :type ocr_default_lang: string
    :param verbose: Print additional information during performance when True.
       Default = True
    :type verbose: boolean
    
    """

[docs]
    def __init__(self,
                 file: str,
                 auto_extract = True,
                 use_ocr = False,
                 fallback_to_ocr = True,
                 ocr_dpi = 300,
                 ocr_default_lang = 'eng',
                 verbose=True):
        """Constructor method.

        Initializes the private attributes, calls the base-class constructor
        and then runs :py:meth:`update` to load the file.
        """
        ## NOTE(review): all private attributes are assigned before the
        ## base-class constructor runs -- presumably because that constructor
        ## may assign `file` and thereby trigger `update()`; confirm against
        ## File.__init__ before reordering.
        ## Extraction settings
        self._auto_extract = auto_extract
        self._use_ocr = use_ocr
        self._fallback_to_ocr = fallback_to_ocr
        self._ocr_dpi = ocr_dpi
        self._ocr_default_lang = ocr_default_lang
        ## The pypdf.PdfReader object
        self._reader = None
        ## The number tree of the PDF
        ## cf. https://www.w3.org/WAI/GL/WCAG20-TECHS/PDF17.html
        self._number_tree = None
        ## The PDF primary language
        self._lang = ""
        ########################################
        super().__init__(file = file, data = None, verbose = verbose)
        ########################################
        self.update()


    ########################################
    # remove pypdf objects before pickling
    # RP  Tue Apr 29 11:54:22 2025

    def __getstate__(self):
        """Customize pickling: remove unpicklable attributes.

        Returns a shallow copy of the instance dictionary in which the pypdf
        reader and the number tree are replaced by None.

        The pages in `_data` are deliberately left alone.  The copy is
        shallow, so the list holds the very same page objects as the live
        instance; setting `page.data = None` here would wipe the PageObject
        references of the object being pickled.  Each `PdfPage` already drops
        its PageObject in its own `__getstate__` when it is serialized.
        """
        state = self.__dict__.copy()
        state['_reader'] = None  # Exclude reader
        state['_number_tree'] = None
        return state

    def __setstate__(self, state):
        """Customize unpickling: restore the attributes and, when a file path
        is stored, re-create the reader for it."""
        vars(self).update(state)
        if not self._file:
            return
        self.set_reader()  # Recreate PdfReader TODO
    

    ########################################
        
    @File.file.setter
    def file(self, val):
        """Set the file and refresh the instance.

        The assignment itself is delegated to the setter of the `file`
        property found on the class(es) after `PdfFile` in the method
        resolution order; afterwards :py:meth:`update` is run.
        """
        ## look up the inherited property object and call its setter
        ## explicitly, as this method replaces the setter on PdfFile
        super(PdfFile, self.__class__).file.fset(self, val)
        ## re-initialize reader, number tree and (optionally) the page data
        self.update()


    @property
    def lang(self):
        """The primary language tag of the file (read/write).
        """
        return self._lang

    @lang.setter
    def lang(self, val):
        """Set the language.

        The value is standardized via `langcodes.standardize_tag`.  The
        default OCR language is updated as well, using the alpha3 form of
        the standardized tag.
        """
        # fallback to "en" if no lang is set
        # TODO: is there a better solution?
        # RP  Tue Apr 29 00:55:31 2025
        tag = val or "en"
        self._lang = langcodes.standardize_tag(tag)
        ## also set OCR default lang (alpha3)
        alpha3 = langcodes.get(self._lang).to_alpha3()
        self._ocr_default_lang = alpha3
        return self._lang

    @property
    def data(self):
        """The data.
        This is a list of :py:class:`PdfPage` objects, provided the data has
        been extracted (cf. `auto_extract`).
        """
        return self._data

    ## NOTE(review): the setter is derived from `File.data`, not from the
    ## `data` property defined directly above.  If `File.data` is a plain
    ## `property`, the resulting attribute uses the getter inherited from
    ## `File` and the getter above is discarded -- confirm whether
    ## `@data.setter` was intended.
    @File.data.setter
    def data(self, val):
        ## stores the value as is; no type check is performed here
        self._data = val

    @property
    def reader(self):
        """The `pypdf.PdfReader` object (read-only).

        Returns the value of the `_reader` attribute, which is None until a
        reader has been created (cf. :py:meth:`set_reader`).
        """
        return self._reader

    @property
    def auto_extract(self):
        """Whether text is extracted automatically on update (read/write).
        """
        return self._auto_extract

    @auto_extract.setter
    def auto_extract(self, val):
        """Accept booleans only; anything else is reported via a printed
        error and leaves the current setting unchanged."""
        if not isinstance(val, bool):
            print(f"Error: '{val}' is not of type Boolean")
            return False
        self._auto_extract = val

    ########################################


[docs]
    def set_reader(self):
        """Create the `pypdf.PdfReader` for the file and store it.

        This was previously done in the update method, but since pypdf objects
        (just as the reader) cannot be pickled, the step is kept separate so
        that the reader can be reconstructed when unpickling a PdfFile object.

        :return: True when the reader was created.  False when no file is
            set, the file does not exist, or pypdf raised a `PdfReadError`
            (in which case an error is printed).
        :rtype: boolean
        """
        path = self._file
        if not path or not os.path.isfile(path):
            # no file given or the file does not exist
            return False

        try:
            self._reader = PdfReader(path)
        except PdfReadError:
            print(f"Error: Invalid PDF file {self._file}")
            return False
        return True

            


[docs]
    def update(self):
        """Re-initialize the instance from its file.

        Re-creates the reader, reads the `/PageLabels` entry of the document
        catalog into the number tree, extracts the text when `auto_extract`
        is enabled and determines the primary language if none is set yet and
        data is available.

        :return: The instance itself, or False (after printing an error) when
            the reader could not be created.
        """
        ## (re-)initialize the reader object
        if not self.set_reader():
            print(f"PdfFile.update(): Error: The file '{self._file}' does not "
                  + "exist or could not be read.")
            return False

        ## Initialize the number tree
        catalog = self._reader.trailer['/Root']
        self._number_tree = catalog.get('/PageLabels')

        ## auto-extract
        if self._auto_extract:
            self._data = self.extract_text()

        ## set (primary) language if not given
        lang_missing = not self.lang
        if lang_missing and self.data:
            self.lang = self.get_primary_lang()
        return self



[docs]
    def extract_text_without_ocr(self):
        """
        Extract the text of all pages directly via pypdf.

        :return: A list with one `PdfPage` object per page.  Pages whose text
            extraction raises an exception get an empty raw text.  When the
            page list of the reader cannot be accessed (KeyError), an error
            is printed and an empty string is returned instead.
        """
        if self._verbose:
            print(f"NO OCR: Processing {self._file}:")

        # workaround for now (using try/catch)
        # TODO: need to update to pypdf
        # RP  Tue Apr 29 01:13:42 2025
        try:
            pdf_pages = self.reader.pages
        except KeyError:
            print("Error with PDF...")
            return ""

        result = []
        for idx, pdf_page in enumerate(pdf_pages):
            if self._verbose:
                print(f"NO OCR: Processing page {idx+1}/"
                      + f"{len(self.reader.pages)}...")

            # since some pdf files might contain corrupted or encrypted data,
            # a failing page must not abort the whole extraction
            # RP  Tue Apr 29 14:01:01 2025
            try:
                raw = pdf_page.extract_text()
            except Exception as err:
                print(f"Failed to extract text from page {idx}: {err}. ")
                raw = ""

            result.append(PdfPage(lang = self.lang,
                                  raw_text = raw,
                                  data = pdf_page,
                                  file = self,
                                  page_num = idx,
                                  page_label = self.get_page_label(idx)))

        return result



[docs]
    def extract_text_with_ocr(self):
        """
        Extract the text of all pages via Tesseract OCR.

        The file is converted to images (using the `ocr_dpi` setting) and
        each image is passed to pytesseract with the default OCR language.

        :return: A list with one `PdfPage` object per page.  The pages carry
            no `data`.  If any exception occurs, it is printed and an empty
            list is returned.
        """
        try:
            # convert pdf to images
            if self._verbose:
                print(f"OCR: Processing {self._file}:")
                print("OCR: Converting pages to images...")

            rendered = convert_from_path(self._file,
                                         dpi=self._ocr_dpi)

            ocr_pages = []
            for idx, picture in enumerate(rendered):
                if self._verbose:
                    print(f"OCR: Processing page {idx+1}/{len(rendered)}...")

                recognized = pytesseract.image_to_string(
                    picture,
                    lang = self._ocr_default_lang)
                ocr_pages.append(PdfPage(page_num = idx,
                                         page_label = self.get_page_label(idx),
                                         raw_text = recognized,
                                         file = self,
                                         lang = self.lang))

            return ocr_pages
        except Exception as err:
            print(f"Error extracting text with OCR: {err}")
            return []




[docs]
    def extract_text(self):
        """
        Extract the text of the file, either directly or via OCR.

        When `use_ocr` is set, OCR is used right away.  Otherwise the direct
        extraction is tried first; if `fallback_to_ocr` is set and the result
        is empty or contains fewer than 20 words in total, OCR is used
        instead.

        :return: The result of the extraction method that was used (normally
            a list of `PdfPage` objects), or an empty list when the file does
            not exist.
        """
        if not os.path.exists(self.file):
            print(f"PDF file not found: {self.file}")
            return []

        if self._use_ocr:
            return self.extract_text_with_ocr()

        # Try direct extraction first
        direct = self.extract_text_without_ocr()

        ## get the sum of words in result
        word_total = sum(page.count_words() for page in direct)

        # Fall back to OCR if needed
        too_little = not direct or word_total < 20
        if self._fallback_to_ocr and too_little:
            print("Direct extraction yielded little text, "
                  +"falling back to OCR")
            return self.extract_text_with_ocr()

        return direct



[docs]
    def get_page_label(self, page_num):
        """
        Returns the label (i.e. the page number according to the PDF number
        tree) of a pdf page by index (page_num, zero-based).

        The `/Nums` array of the number tree is read as a flat sequence of
        (start index, label dictionary) pairs.  The last pair whose start
        index is not greater than `page_num` determines the label, which is
        the prefix (`/P`) followed by the numeric part.  The numeric part
        starts at `/St` (default 1) for the first page of the range and is
        rendered according to the style `/S`:

        - `/D`: decimal arabic numerals
        - `/R` and `/r`: uppercase and lowercase roman numerals
        - `/A` and `/a`: uppercase and lowercase letters (A to Z for the
          first 26 pages, AA to ZZ for the next 26, and so on)
        - no style: the label consists of the prefix only

        :param page_num: The page number (zero-based) the label should be
            retrieved from.
        :type page_num: integer
        :return: The page label.  The one-based page number (as a string) is
            returned when there is no number tree, no `/Nums` array, a
            malformed `/Nums` array (an error is printed in this case), or
            no range that covers the page.
        :rtype: string

        """
        default_label = str(page_num + 1)

        if not self._number_tree:
            # no number tree, use page numstring instead
            return default_label

        label_tuples = self._number_tree.get_object().get('/Nums')
        if not label_tuples:
            # edge case: no label mapping defined
            # RP  Tue Apr 29 00:52:36 2025
            return default_label

        if len(label_tuples) % 2 != 0:
            print("Error: Label number tree is malformed.")
            return default_label

        ## find the last range that starts at or before the page
        matched = None
        for i in range(0, len(label_tuples), 2):
            if page_num >= label_tuples[i]:
                matched = i
        if matched is None:
            return default_label

        start_index = label_tuples[matched]
        label_dict = label_tuples[matched + 1].get_object()

        ## the keys of PDF dictionaries carry a leading slash
        prefix = label_dict.get('/P', '')
        style = label_dict.get('/S')
        number = label_dict.get('/St', 1) + (page_num - start_index)

        if style == '/D': # Decimal
            numeral = str(number)
        elif style == '/R': # Uppercase Roman
            numeral = roman.toRoman(number).upper()
        elif style == '/r': # Lowercase Roman
            numeral = roman.toRoman(number).lower()
        elif style in ('/A', '/a'): # Letters
            ## one letter for 1..26, the same letter doubled for 27..52 etc.
            letter = chr(ord('A') + (number - 1) % 26)
            numeral = letter * ((number - 1) // 26 + 1)
            if style == '/a':
                numeral = numeral.lower()
        else:
            ## no numbering style: the label is the prefix alone
            numeral = ""

        return prefix + numeral

    


[docs]
    def get_primary_lang(self):
        """
        Get the primary language of a PDF.

        The primary language is the `lang` value that occurs on the most
        pages.  If several languages occur equally often, the one that
        appears first in the page order is returned.

        :return: The most frequent page language, or False (after printing
            an error) when there is no page data (None, an empty string or
            an empty list).
        """
        ## an empty list must be rejected as well, otherwise there is no
        ## language to pick from
        if not self._data:
            print("Error: Cannot detect language. No data!")
            return False

        ## count the pages per language in a single pass
        counts = {}
        for page in self.data:
            counts[page.lang] = counts.get(page.lang, 0) + 1

        ## max() returns the first maximal key in insertion order, so ties
        ## resolve to the language seen first
        return max(counts, key=counts.get)



[docs]
    def get_page(self, page_index):
        """Returns the `PdfPage` object for the page at index (zero-based).

        Negative indices count from the end of the page list, as with any
        Python sequence.

        :param page_index: The index of the page.
        :type page_index: integer
        :return: The page at `page_index`, or False when there is no page
            data or the index lies outside of the page list.
        """
        pages = self.data
        ## no data at all (e.g. nothing has been extracted yet)
        if not pages:
            return False
        ## reject indices outside the list in both directions instead of
        ## raising an IndexError for out-of-range negative values
        if -len(pages) <= page_index < len(pages):
            return pages[page_index]
        return False



    

        
################################################################################
## EOF pdf.py