Edit on GitHub

leaf_focus.app

Main application.

  1"""Main application."""
  2
  3from __future__ import annotations
  4
  5import dataclasses
  6import datetime
  7import logging
  8import pathlib
  9
 10from beartype import beartype, typing
 11
 12from leaf_focus import utils
 13from leaf_focus.ocr import keras_ocr
 14from leaf_focus.ocr import model as ocr_model
 15from leaf_focus.pdf import model as pdf_model
 16from leaf_focus.pdf import xpdf
 17from leaf_focus.utils import ValidatePathMethod
 18
 19
 20logger = logging.getLogger(__name__)
 21
 22
@beartype
@dataclasses.dataclass
class AppArgs:
    """Arguments for running the application.

    Only ``input_pdf`` and ``output_dir`` are required; every other field has
    a default value.
    """

    input_pdf: pathlib.Path
    """path to the pdf file"""

    output_dir: pathlib.Path
    """path to the output directory to save text files"""

    # NOTE(review): None presumably means "start from the first page" - confirm.
    first_page: int | None = None
    """the first pdf page to process"""

    # NOTE(review): None presumably means "continue to the last page" - confirm.
    last_page: int | None = None
    """the last pdf page to process"""

    save_page_images: bool = False
    """save each page of the pdf to a separate image"""

    run_ocr: bool = False
    """run OCR over each page of the pdf"""

    # NOTE(review): not read by App in this module; presumably consumed by the
    # caller that configures logging - confirm.
    log_level: str | None = None
    """the log level"""
 48
 49
 50@beartype
 51class App:
 52    """The main application."""
 53
 54    def __init__(self, exe_dir: pathlib.Path) -> None:
 55        """Create a new instance of the application.
 56
 57        Args:
 58            exe_dir: The path to the directory containing the executable files.
 59        """
 60        if not exe_dir or not exe_dir.exists() or not exe_dir.is_dir():
 61            msg = f"The path '{exe_dir or ''}' is not a directory."
 62            raise NotADirectoryError(msg)
 63        self._exe_dir = exe_dir
 64
 65    def run(self, app_args: AppArgs) -> bool:
 66        """Run the application.
 67
 68        Args:
 69            app_args: The application arguments.
 70
 71        Returns:
 72            bool: True if the text extraction succeeded, otherwise false.
 73        """
 74        timestamp_start = datetime.datetime.now(tz=datetime.timezone.utc)
 75        logger.info("Starting leaf-focus")
 76
 77        input_pdf = utils.validate_path(
 78            "input pdf",
 79            app_args.input_pdf,
 80            ValidatePathMethod.MUST_EXIST,
 81        )
 82        app_args.input_pdf = input_pdf
 83
 84        output_dir = utils.validate_path(
 85            "output directory",
 86            app_args.output_dir,
 87            ValidatePathMethod.NO_OPINION,
 88        )
 89        app_args.output_dir = output_dir
 90
 91        # create the output directory
 92        if not output_dir.is_dir():
 93            logger.warning("Creating output directory '%s'.", output_dir)
 94            output_dir.mkdir(exist_ok=True, parents=True)
 95        else:
 96            logger.info("Using output directory '%s'.", output_dir)
 97
 98        # run the pdf text extraction
 99        xpdf_prog = xpdf.XpdfProgram(self._exe_dir)
100
101        # pdf file info
102        self.pdf_info(xpdf_prog, app_args)
103
104        # pdf embedded text
105        self.pdf_text(xpdf_prog, app_args)
106
107        # pdf page image
108        xpdf_image = None
109        if app_args.save_page_images or app_args.run_ocr:
110            xpdf_image = self.pdf_images(xpdf_prog, app_args)
111
112        # pdf page image ocr
113        if app_args.run_ocr and xpdf_image:
114            list(self.pdf_ocr(xpdf_image, app_args))
115
116        timestamp_finish = datetime.datetime.now(tz=datetime.timezone.utc)
117        program_duration = timestamp_finish - timestamp_start
118        logger.info("Finished (duration %s)", program_duration)
119        return True
120
121    def pdf_info(
122        self,
123        prog: xpdf.XpdfProgram,
124        app_args: AppArgs,
125    ) -> pdf_model.XpdfInfoResult:
126        """Get the pdf file information.
127
128        Args:
129            prog: The program to run.
130            app_args: The application arguments.
131
132        Returns:
133            pdf_model.XpdfInfoResult: The result from the program.
134        """
135        xpdf_info_args = pdf_model.XpdfInfoArgs(
136            include_metadata=True,
137            first_page=app_args.first_page,
138            last_page=app_args.last_page,
139        )
140        return prog.info(app_args.input_pdf, app_args.output_dir, xpdf_info_args)
141
142    def pdf_text(
143        self,
144        prog: xpdf.XpdfProgram,
145        app_args: AppArgs,
146    ) -> pdf_model.XpdfTextResult:
147        """Get the text embedded in the pdf.
148
149        Args:
150            prog: The program to run.
151            app_args: The application arguments.
152
153        Returns:
154            pdf_model.XpdfTextResult: The result from the program.
155        """
156        xpdf_text_args = pdf_model.XpdfTextArgs(
157            line_end_type=pdf_model.XpdfTextArgs.get_line_ending(),
158            use_original_layout=True,
159            first_page=app_args.first_page,
160            last_page=app_args.last_page,
161        )
162        return prog.text(app_args.input_pdf, app_args.output_dir, xpdf_text_args)
163
164    def pdf_images(
165        self,
166        prog: xpdf.XpdfProgram,
167        app_args: AppArgs,
168    ) -> pdf_model.XpdfImageResult:
169        """Get each page in the pdf as a separate image.
170
171        Args:
172            prog: The program to run.
173            app_args: The application arguments.
174
175        Returns:
176            pdf_model.XpdfImageResult: The result from the program.
177        """
178        xpdf_image_args = pdf_model.XpdfImageArgs(use_grayscale=True)
179        xpdf_image = prog.image(
180            app_args.input_pdf,
181            app_args.output_dir,
182            xpdf_image_args,
183        )
184        return xpdf_image
185
186    def pdf_ocr(
187        self,
188        xpdf_image: pdf_model.XpdfImageResult,
189        app_args: AppArgs,
190    ) -> typing.Generator[ocr_model.KerasOcrResult, typing.Any, None]:
191        """Recognise text on the pdf page images.
192
193        Args:
194            xpdf_image: The result from the pdf image program.
195            app_args: The application arguments.
196
197        Returns:
198            typing.Generator[ocr_model.KerasOcrResult, typing.Any, None]: Yield text
199                recognition results for each pdf page image.
200        """
201        keras_ocr_prog = keras_ocr.OpticalCharacterRecognition()
202        for xpdf_image_file in xpdf_image.output_files:
203            yield keras_ocr_prog.recognise_text(xpdf_image_file, app_args.output_dir)
logger = <Logger leaf_focus.app (WARNING)>
@beartype
@dataclasses.dataclass
class AppArgs:
@beartype
@dataclasses.dataclass
class AppArgs:
    """Arguments for running the application.

    Only ``input_pdf`` and ``output_dir`` are required; every other field has
    a default value.
    """

    input_pdf: pathlib.Path
    """path to the pdf file"""

    output_dir: pathlib.Path
    """path to the output directory to save text files"""

    # NOTE(review): None presumably means "start from the first page" - confirm.
    first_page: int | None = None
    """the first pdf page to process"""

    # NOTE(review): None presumably means "continue to the last page" - confirm.
    last_page: int | None = None
    """the last pdf page to process"""

    save_page_images: bool = False
    """save each page of the pdf to a separate image"""

    run_ocr: bool = False
    """run OCR over each page of the pdf"""

    # NOTE(review): not read by App in this module; presumably consumed by the
    # caller that configures logging - confirm.
    log_level: str | None = None
    """the log level"""

Arguments for running the application.

AppArgs( input_pdf: pathlib.Path, output_dir: pathlib.Path, first_page: int | None = None, last_page: int | None = None, save_page_images: bool = False, run_ocr: bool = False, log_level: str | None = None)
input_pdf: pathlib.Path

path to the pdf file

output_dir: pathlib.Path

path to the output directory to save text files

first_page: int | None = None

the first pdf page to process

last_page: int | None = None

the last pdf page to process

save_page_images: bool = False

save each page of the pdf to a separate image

run_ocr: bool = False

run OCR over each page of the pdf

log_level: str | None = None

the log level

@beartype
class App:
 51@beartype
 52class App:
 53    """The main application."""
 54
 55    def __init__(self, exe_dir: pathlib.Path) -> None:
 56        """Create a new instance of the application.
 57
 58        Args:
 59            exe_dir: The path to the directory containing the executable files.
 60        """
 61        if not exe_dir or not exe_dir.exists() or not exe_dir.is_dir():
 62            msg = f"The path '{exe_dir or ''}' is not a directory."
 63            raise NotADirectoryError(msg)
 64        self._exe_dir = exe_dir
 65
 66    def run(self, app_args: AppArgs) -> bool:
 67        """Run the application.
 68
 69        Args:
 70            app_args: The application arguments.
 71
 72        Returns:
 73            bool: True if the text extraction succeeded, otherwise false.
 74        """
 75        timestamp_start = datetime.datetime.now(tz=datetime.timezone.utc)
 76        logger.info("Starting leaf-focus")
 77
 78        input_pdf = utils.validate_path(
 79            "input pdf",
 80            app_args.input_pdf,
 81            ValidatePathMethod.MUST_EXIST,
 82        )
 83        app_args.input_pdf = input_pdf
 84
 85        output_dir = utils.validate_path(
 86            "output directory",
 87            app_args.output_dir,
 88            ValidatePathMethod.NO_OPINION,
 89        )
 90        app_args.output_dir = output_dir
 91
 92        # create the output directory
 93        if not output_dir.is_dir():
 94            logger.warning("Creating output directory '%s'.", output_dir)
 95            output_dir.mkdir(exist_ok=True, parents=True)
 96        else:
 97            logger.info("Using output directory '%s'.", output_dir)
 98
 99        # run the pdf text extraction
100        xpdf_prog = xpdf.XpdfProgram(self._exe_dir)
101
102        # pdf file info
103        self.pdf_info(xpdf_prog, app_args)
104
105        # pdf embedded text
106        self.pdf_text(xpdf_prog, app_args)
107
108        # pdf page image
109        xpdf_image = None
110        if app_args.save_page_images or app_args.run_ocr:
111            xpdf_image = self.pdf_images(xpdf_prog, app_args)
112
113        # pdf page image ocr
114        if app_args.run_ocr and xpdf_image:
115            list(self.pdf_ocr(xpdf_image, app_args))
116
117        timestamp_finish = datetime.datetime.now(tz=datetime.timezone.utc)
118        program_duration = timestamp_finish - timestamp_start
119        logger.info("Finished (duration %s)", program_duration)
120        return True
121
122    def pdf_info(
123        self,
124        prog: xpdf.XpdfProgram,
125        app_args: AppArgs,
126    ) -> pdf_model.XpdfInfoResult:
127        """Get the pdf file information.
128
129        Args:
130            prog: The program to run.
131            app_args: The application arguments.
132
133        Returns:
134            pdf_model.XpdfInfoResult: The result from the program.
135        """
136        xpdf_info_args = pdf_model.XpdfInfoArgs(
137            include_metadata=True,
138            first_page=app_args.first_page,
139            last_page=app_args.last_page,
140        )
141        return prog.info(app_args.input_pdf, app_args.output_dir, xpdf_info_args)
142
143    def pdf_text(
144        self,
145        prog: xpdf.XpdfProgram,
146        app_args: AppArgs,
147    ) -> pdf_model.XpdfTextResult:
148        """Get the text embedded in the pdf.
149
150        Args:
151            prog: The program to run.
152            app_args: The application arguments.
153
154        Returns:
155            pdf_model.XpdfTextResult: The result from the program.
156        """
157        xpdf_text_args = pdf_model.XpdfTextArgs(
158            line_end_type=pdf_model.XpdfTextArgs.get_line_ending(),
159            use_original_layout=True,
160            first_page=app_args.first_page,
161            last_page=app_args.last_page,
162        )
163        return prog.text(app_args.input_pdf, app_args.output_dir, xpdf_text_args)
164
165    def pdf_images(
166        self,
167        prog: xpdf.XpdfProgram,
168        app_args: AppArgs,
169    ) -> pdf_model.XpdfImageResult:
170        """Get each page in the pdf as a separate image.
171
172        Args:
173            prog: The program to run.
174            app_args: The application arguments.
175
176        Returns:
177            pdf_model.XpdfImageResult: The result from the program.
178        """
179        xpdf_image_args = pdf_model.XpdfImageArgs(use_grayscale=True)
180        xpdf_image = prog.image(
181            app_args.input_pdf,
182            app_args.output_dir,
183            xpdf_image_args,
184        )
185        return xpdf_image
186
187    def pdf_ocr(
188        self,
189        xpdf_image: pdf_model.XpdfImageResult,
190        app_args: AppArgs,
191    ) -> typing.Generator[ocr_model.KerasOcrResult, typing.Any, None]:
192        """Recognise text on the pdf page images.
193
194        Args:
195            xpdf_image: The result from the pdf image program.
196            app_args: The application arguments.
197
198        Returns:
199            typing.Generator[ocr_model.KerasOcrResult, typing.Any, None]: Yield text
200                recognition results for each pdf page image.
201        """
202        keras_ocr_prog = keras_ocr.OpticalCharacterRecognition()
203        for xpdf_image_file in xpdf_image.output_files:
204            yield keras_ocr_prog.recognise_text(xpdf_image_file, app_args.output_dir)

The main application.

App(exe_dir: pathlib.Path)
55    def __init__(self, exe_dir: pathlib.Path) -> None:
56        """Create a new instance of the application.
57
58        Args:
59            exe_dir: The path to the directory containing the executable files.
60        """
61        if not exe_dir or not exe_dir.exists() or not exe_dir.is_dir():
62            msg = f"The path '{exe_dir or ''}' is not a directory."
63            raise NotADirectoryError(msg)
64        self._exe_dir = exe_dir

Create a new instance of the application.

Arguments:
  • exe_dir: The path to the directory containing the executable files.
def run(self, app_args: AppArgs) -> bool:
 66    def run(self, app_args: AppArgs) -> bool:
 67        """Run the application.
 68
 69        Args:
 70            app_args: The application arguments.
 71
 72        Returns:
 73            bool: True if the text extraction succeeded, otherwise false.
 74        """
 75        timestamp_start = datetime.datetime.now(tz=datetime.timezone.utc)
 76        logger.info("Starting leaf-focus")
 77
 78        input_pdf = utils.validate_path(
 79            "input pdf",
 80            app_args.input_pdf,
 81            ValidatePathMethod.MUST_EXIST,
 82        )
 83        app_args.input_pdf = input_pdf
 84
 85        output_dir = utils.validate_path(
 86            "output directory",
 87            app_args.output_dir,
 88            ValidatePathMethod.NO_OPINION,
 89        )
 90        app_args.output_dir = output_dir
 91
 92        # create the output directory
 93        if not output_dir.is_dir():
 94            logger.warning("Creating output directory '%s'.", output_dir)
 95            output_dir.mkdir(exist_ok=True, parents=True)
 96        else:
 97            logger.info("Using output directory '%s'.", output_dir)
 98
 99        # run the pdf text extraction
100        xpdf_prog = xpdf.XpdfProgram(self._exe_dir)
101
102        # pdf file info
103        self.pdf_info(xpdf_prog, app_args)
104
105        # pdf embedded text
106        self.pdf_text(xpdf_prog, app_args)
107
108        # pdf page image
109        xpdf_image = None
110        if app_args.save_page_images or app_args.run_ocr:
111            xpdf_image = self.pdf_images(xpdf_prog, app_args)
112
113        # pdf page image ocr
114        if app_args.run_ocr and xpdf_image:
115            list(self.pdf_ocr(xpdf_image, app_args))
116
117        timestamp_finish = datetime.datetime.now(tz=datetime.timezone.utc)
118        program_duration = timestamp_finish - timestamp_start
119        logger.info("Finished (duration %s)", program_duration)
120        return True

Run the application.

Arguments:
  • app_args: The application arguments.
Returns:

bool: True once every requested step has completed; this method has no code path that returns False.

def pdf_info( self, prog: leaf_focus.pdf.xpdf.XpdfProgram, app_args: AppArgs) -> leaf_focus.pdf.model.XpdfInfoResult:
122    def pdf_info(
123        self,
124        prog: xpdf.XpdfProgram,
125        app_args: AppArgs,
126    ) -> pdf_model.XpdfInfoResult:
127        """Get the pdf file information.
128
129        Args:
130            prog: The program to run.
131            app_args: The application arguments.
132
133        Returns:
134            pdf_model.XpdfInfoResult: The result from the program.
135        """
136        xpdf_info_args = pdf_model.XpdfInfoArgs(
137            include_metadata=True,
138            first_page=app_args.first_page,
139            last_page=app_args.last_page,
140        )
141        return prog.info(app_args.input_pdf, app_args.output_dir, xpdf_info_args)

Get the pdf file information.

Arguments:
  • prog: The program to run.
  • app_args: The application arguments.
Returns:

pdf_model.XpdfInfoResult: The result from the program.

def pdf_text( self, prog: leaf_focus.pdf.xpdf.XpdfProgram, app_args: AppArgs) -> leaf_focus.pdf.model.XpdfTextResult:
143    def pdf_text(
144        self,
145        prog: xpdf.XpdfProgram,
146        app_args: AppArgs,
147    ) -> pdf_model.XpdfTextResult:
148        """Get the text embedded in the pdf.
149
150        Args:
151            prog: The program to run.
152            app_args: The application arguments.
153
154        Returns:
155            pdf_model.XpdfTextResult: The result from the program.
156        """
157        xpdf_text_args = pdf_model.XpdfTextArgs(
158            line_end_type=pdf_model.XpdfTextArgs.get_line_ending(),
159            use_original_layout=True,
160            first_page=app_args.first_page,
161            last_page=app_args.last_page,
162        )
163        return prog.text(app_args.input_pdf, app_args.output_dir, xpdf_text_args)

Get the text embedded in the pdf.

Arguments:
  • prog: The program to run.
  • app_args: The application arguments.
Returns:

pdf_model.XpdfTextResult: The result from the program.

def pdf_images( self, prog: leaf_focus.pdf.xpdf.XpdfProgram, app_args: AppArgs) -> leaf_focus.pdf.model.XpdfImageResult:
165    def pdf_images(
166        self,
167        prog: xpdf.XpdfProgram,
168        app_args: AppArgs,
169    ) -> pdf_model.XpdfImageResult:
170        """Get each page in the pdf as a separate image.
171
172        Args:
173            prog: The program to run.
174            app_args: The application arguments.
175
176        Returns:
177            pdf_model.XpdfImageResult: The result from the program.
178        """
179        xpdf_image_args = pdf_model.XpdfImageArgs(use_grayscale=True)
180        xpdf_image = prog.image(
181            app_args.input_pdf,
182            app_args.output_dir,
183            xpdf_image_args,
184        )
185        return xpdf_image

Get each page in the pdf as a separate image.

Arguments:
  • prog: The program to run.
  • app_args: The application arguments.
Returns:

pdf_model.XpdfImageResult: The result from the program.

def pdf_ocr( self, xpdf_image: leaf_focus.pdf.model.XpdfImageResult, app_args: AppArgs) -> Generator[leaf_focus.ocr.model.KerasOcrResult, typing.Any, None]:
187    def pdf_ocr(
188        self,
189        xpdf_image: pdf_model.XpdfImageResult,
190        app_args: AppArgs,
191    ) -> typing.Generator[ocr_model.KerasOcrResult, typing.Any, None]:
192        """Recognise text on the pdf page images.
193
194        Args:
195            xpdf_image: The result from the pdf image program.
196            app_args: The application arguments.
197
198        Returns:
199            typing.Generator[ocr_model.KerasOcrResult, typing.Any, None]: Yield text
200                recognition results for each pdf page image.
201        """
202        keras_ocr_prog = keras_ocr.OpticalCharacterRecognition()
203        for xpdf_image_file in xpdf_image.output_files:
204            yield keras_ocr_prog.recognise_text(xpdf_image_file, app_args.output_dir)

Recognise text on the pdf page images.

Arguments:
  • xpdf_image: The result from the pdf image program.
  • app_args: The application arguments.
Returns:

typing.Generator[ocr_model.KerasOcrResult, typing.Any, None]: Yield text recognition results for each pdf page image.