leaf_focus.app
Main application.
1"""Main application.""" 2 3from __future__ import annotations 4 5import dataclasses 6import datetime 7import logging 8import pathlib 9 10from beartype import beartype, typing 11 12from leaf_focus import utils 13from leaf_focus.ocr import keras_ocr 14from leaf_focus.ocr import model as ocr_model 15from leaf_focus.pdf import model as pdf_model 16from leaf_focus.pdf import xpdf 17from leaf_focus.utils import ValidatePathMethod 18 19 20logger = logging.getLogger(__name__) 21 22 23@beartype 24@dataclasses.dataclass 25class AppArgs: 26 """Arguments for running the application.""" 27 28 input_pdf: pathlib.Path 29 """path to the pdf file""" 30 31 output_dir: pathlib.Path 32 """path to the output directory to save text files""" 33 34 first_page: int | None = None 35 """the first pdf page to process""" 36 37 last_page: int | None = None 38 """the last pdf page to process""" 39 40 save_page_images: bool = False 41 """save each page of the pdf to a separate image""" 42 43 run_ocr: bool = False 44 """run OCR over each page of the pdf""" 45 46 log_level: str | None = None 47 """the log level""" 48 49 50@beartype 51class App: 52 """The main application.""" 53 54 def __init__(self, exe_dir: pathlib.Path) -> None: 55 """Create a new instance of the application. 56 57 Args: 58 exe_dir: The path to the directory containing the executable files. 59 """ 60 if not exe_dir or not exe_dir.exists() or not exe_dir.is_dir(): 61 msg = f"The path '{exe_dir or ''}' is not a directory." 62 raise NotADirectoryError(msg) 63 self._exe_dir = exe_dir 64 65 def run(self, app_args: AppArgs) -> bool: 66 """Run the application. 67 68 Args: 69 app_args: The application arguments. 70 71 Returns: 72 bool: True if the text extraction succeeded, otherwise false. 
73 """ 74 timestamp_start = datetime.datetime.now(tz=datetime.timezone.utc) 75 logger.info("Starting leaf-focus") 76 77 input_pdf = utils.validate_path( 78 "input pdf", 79 app_args.input_pdf, 80 ValidatePathMethod.MUST_EXIST, 81 ) 82 app_args.input_pdf = input_pdf 83 84 output_dir = utils.validate_path( 85 "output directory", 86 app_args.output_dir, 87 ValidatePathMethod.NO_OPINION, 88 ) 89 app_args.output_dir = output_dir 90 91 # create the output directory 92 if not output_dir.is_dir(): 93 logger.warning("Creating output directory '%s'.", output_dir) 94 output_dir.mkdir(exist_ok=True, parents=True) 95 else: 96 logger.info("Using output directory '%s'.", output_dir) 97 98 # run the pdf text extraction 99 xpdf_prog = xpdf.XpdfProgram(self._exe_dir) 100 101 # pdf file info 102 self.pdf_info(xpdf_prog, app_args) 103 104 # pdf embedded text 105 self.pdf_text(xpdf_prog, app_args) 106 107 # pdf page image 108 xpdf_image = None 109 if app_args.save_page_images or app_args.run_ocr: 110 xpdf_image = self.pdf_images(xpdf_prog, app_args) 111 112 # pdf page image ocr 113 if app_args.run_ocr and xpdf_image: 114 list(self.pdf_ocr(xpdf_image, app_args)) 115 116 timestamp_finish = datetime.datetime.now(tz=datetime.timezone.utc) 117 program_duration = timestamp_finish - timestamp_start 118 logger.info("Finished (duration %s)", program_duration) 119 return True 120 121 def pdf_info( 122 self, 123 prog: xpdf.XpdfProgram, 124 app_args: AppArgs, 125 ) -> pdf_model.XpdfInfoResult: 126 """Get the pdf file information. 127 128 Args: 129 prog: The program to run. 130 app_args: The application arguments. 131 132 Returns: 133 pdf_model.XpdfInfoResult: The result from the program. 
134 """ 135 xpdf_info_args = pdf_model.XpdfInfoArgs( 136 include_metadata=True, 137 first_page=app_args.first_page, 138 last_page=app_args.last_page, 139 ) 140 return prog.info(app_args.input_pdf, app_args.output_dir, xpdf_info_args) 141 142 def pdf_text( 143 self, 144 prog: xpdf.XpdfProgram, 145 app_args: AppArgs, 146 ) -> pdf_model.XpdfTextResult: 147 """Get the text embedded in the pdf. 148 149 Args: 150 prog: The program to run. 151 app_args: The application arguments. 152 153 Returns: 154 pdf_model.XpdfTextResult: The result from the program. 155 """ 156 xpdf_text_args = pdf_model.XpdfTextArgs( 157 line_end_type=pdf_model.XpdfTextArgs.get_line_ending(), 158 use_original_layout=True, 159 first_page=app_args.first_page, 160 last_page=app_args.last_page, 161 ) 162 return prog.text(app_args.input_pdf, app_args.output_dir, xpdf_text_args) 163 164 def pdf_images( 165 self, 166 prog: xpdf.XpdfProgram, 167 app_args: AppArgs, 168 ) -> pdf_model.XpdfImageResult: 169 """Get each page in the pdf as a separate image. 170 171 Args: 172 prog: The program to run. 173 app_args: The application arguments. 174 175 Returns: 176 pdf_model.XpdfImageResult: The result from the program. 177 """ 178 xpdf_image_args = pdf_model.XpdfImageArgs(use_grayscale=True) 179 xpdf_image = prog.image( 180 app_args.input_pdf, 181 app_args.output_dir, 182 xpdf_image_args, 183 ) 184 return xpdf_image 185 186 def pdf_ocr( 187 self, 188 xpdf_image: pdf_model.XpdfImageResult, 189 app_args: AppArgs, 190 ) -> typing.Generator[ocr_model.KerasOcrResult, typing.Any, None]: 191 """Recognise text on the pdf page images. 192 193 Args: 194 xpdf_image: The result from the pdf image program. 195 app_args: The application arguments. 196 197 Returns: 198 typing.Generator[ocr_model.KerasOcrResult, typing.Any, None]: Yield text 199 recognition results for each pdf page image. 
200 """ 201 keras_ocr_prog = keras_ocr.OpticalCharacterRecognition() 202 for xpdf_image_file in xpdf_image.output_files: 203 yield keras_ocr_prog.recognise_text(xpdf_image_file, app_args.output_dir)
logger =
<Logger leaf_focus.app (WARNING)>
@beartype
@dataclasses.dataclass
class
AppArgs:
@beartype
@dataclasses.dataclass
class AppArgs:
    """Options that control one run of the application."""

    input_pdf: pathlib.Path
    """Path to the pdf file."""

    output_dir: pathlib.Path
    """Path to the output directory where text files are saved."""

    first_page: int | None = None
    """First pdf page to process (defaults to None)."""

    last_page: int | None = None
    """Last pdf page to process (defaults to None)."""

    save_page_images: bool = False
    """Whether to save each page of the pdf to a separate image."""

    run_ocr: bool = False
    """Whether to run OCR over each page of the pdf."""

    log_level: str | None = None
    """The log level (defaults to None)."""
Arguments for running the application.
@beartype
class
App:
@beartype
class App:
    """Coordinate the pdf info, text, page image and OCR steps for one pdf."""

    def __init__(self, exe_dir: pathlib.Path) -> None:
        """Remember the directory that holds the executable files.

        Args:
            exe_dir: The path to the directory containing the executable files.

        Raises:
            NotADirectoryError: If ``exe_dir`` is falsy or is not an existing
                directory.
        """
        # is_dir() is already False for a missing path, so it also covers
        # the existence check.
        if exe_dir and exe_dir.is_dir():
            self._exe_dir = exe_dir
            return

        msg = f"The path '{exe_dir or ''}' is not a directory."
        raise NotADirectoryError(msg)

    def run(self, app_args: AppArgs) -> bool:
        """Run every processing step for one pdf file.

        The input and output paths are validated and written back onto
        ``app_args``, and the output directory is created when it is not
        already a directory. The pdf info and embedded text steps always run.
        Page images are produced when ``save_page_images`` or ``run_ocr`` is
        set, and OCR runs over those images when ``run_ocr`` is set.

        Args:
            app_args: The application arguments. Its ``input_pdf`` and
                ``output_dir`` attributes are replaced by the validated paths.

        Returns:
            bool: True once all steps have finished. This method returns no
                other value; exceptions raised by a step are not caught here.
        """
        started_at = datetime.datetime.now(tz=datetime.timezone.utc)
        logger.info("Starting leaf-focus")

        # Store the validated paths on app_args so the later steps use them.
        app_args.input_pdf = utils.validate_path(
            "input pdf",
            app_args.input_pdf,
            ValidatePathMethod.MUST_EXIST,
        )
        app_args.output_dir = utils.validate_path(
            "output directory",
            app_args.output_dir,
            ValidatePathMethod.NO_OPINION,
        )

        out_dir = app_args.output_dir
        if out_dir.is_dir():
            logger.info("Using output directory '%s'.", out_dir)
        else:
            logger.warning("Creating output directory '%s'.", out_dir)
            out_dir.mkdir(exist_ok=True, parents=True)

        program = xpdf.XpdfProgram(self._exe_dir)

        # The file info and embedded text steps always run.
        self.pdf_info(program, app_args)
        self.pdf_text(program, app_args)

        # Page images are needed for saving them and as the input to OCR.
        need_images = app_args.save_page_images or app_args.run_ocr
        image_result = self.pdf_images(program, app_args) if need_images else None

        if app_args.run_ocr and image_result:
            # pdf_ocr is a generator: it must be iterated for the recognition
            # calls to happen. The yielded results are not used here.
            for _ in self.pdf_ocr(image_result, app_args):
                pass

        elapsed = datetime.datetime.now(tz=datetime.timezone.utc) - started_at
        logger.info("Finished (duration %s)", elapsed)
        return True

    def pdf_info(
        self,
        prog: xpdf.XpdfProgram,
        app_args: AppArgs,
    ) -> pdf_model.XpdfInfoResult:
        """Get the pdf file information, including metadata.

        Args:
            prog: The program to run.
            app_args: The application arguments; supplies the input pdf, the
                output directory and the first / last page.

        Returns:
            pdf_model.XpdfInfoResult: The result from the program.
        """
        info_args = pdf_model.XpdfInfoArgs(
            include_metadata=True,
            first_page=app_args.first_page,
            last_page=app_args.last_page,
        )
        return prog.info(app_args.input_pdf, app_args.output_dir, info_args)

    def pdf_text(
        self,
        prog: xpdf.XpdfProgram,
        app_args: AppArgs,
    ) -> pdf_model.XpdfTextResult:
        """Get the text embedded in the pdf, keeping the original layout.

        Args:
            prog: The program to run.
            app_args: The application arguments; supplies the input pdf, the
                output directory and the first / last page.

        Returns:
            pdf_model.XpdfTextResult: The result from the program.
        """
        text_args = pdf_model.XpdfTextArgs(
            line_end_type=pdf_model.XpdfTextArgs.get_line_ending(),
            use_original_layout=True,
            first_page=app_args.first_page,
            last_page=app_args.last_page,
        )
        return prog.text(app_args.input_pdf, app_args.output_dir, text_args)

    def pdf_images(
        self,
        prog: xpdf.XpdfProgram,
        app_args: AppArgs,
    ) -> pdf_model.XpdfImageResult:
        """Get each page in the pdf as a separate grayscale image.

        Args:
            prog: The program to run.
            app_args: The application arguments; supplies the input pdf and
                the output directory.

        Returns:
            pdf_model.XpdfImageResult: The result from the program.
        """
        # NOTE(review): first_page / last_page are not forwarded here, unlike
        # pdf_info and pdf_text - confirm whether XpdfImageArgs accepts a page
        # range and whether that omission is intended.
        return prog.image(
            app_args.input_pdf,
            app_args.output_dir,
            pdf_model.XpdfImageArgs(use_grayscale=True),
        )

    def pdf_ocr(
        self,
        xpdf_image: pdf_model.XpdfImageResult,
        app_args: AppArgs,
    ) -> typing.Generator[ocr_model.KerasOcrResult, typing.Any, None]:
        """Recognise text on the pdf page images.

        This is a generator: nothing is recognised until it is iterated.

        Args:
            xpdf_image: The result from the pdf image program.
            app_args: The application arguments; supplies the output directory.

        Yields:
            ocr_model.KerasOcrResult: The text recognition result for each
                file in ``xpdf_image.output_files``, in order.
        """
        recogniser = keras_ocr.OpticalCharacterRecognition()
        yield from (
            recogniser.recognise_text(page_image, app_args.output_dir)
            for page_image in xpdf_image.output_files
        )
The main application.
App(exe_dir: pathlib.Path)
def __init__(self, exe_dir: pathlib.Path) -> None:
    """Remember the directory that holds the executable files.

    Args:
        exe_dir: The path to the directory containing the executable files.

    Raises:
        NotADirectoryError: If ``exe_dir`` is falsy or is not an existing
            directory.
    """
    # is_dir() is already False for a missing path, so it also covers the
    # existence check.
    if exe_dir and exe_dir.is_dir():
        self._exe_dir = exe_dir
        return

    msg = f"The path '{exe_dir or ''}' is not a directory."
    raise NotADirectoryError(msg)
Create a new instance of the application.
Arguments:
- exe_dir: The path to the directory containing the executable files.
def run(self, app_args: AppArgs) -> bool:
    """Run every processing step for one pdf file.

    The input and output paths are validated and written back onto
    ``app_args``, and the output directory is created when it is not already
    a directory. The pdf info and embedded text steps always run. Page images
    are produced when ``save_page_images`` or ``run_ocr`` is set, and OCR runs
    over those images when ``run_ocr`` is set.

    Args:
        app_args: The application arguments. Its ``input_pdf`` and
            ``output_dir`` attributes are replaced by the validated paths.

    Returns:
        bool: True once all steps have finished. This method returns no other
            value; exceptions raised by a step are not caught here.
    """
    started_at = datetime.datetime.now(tz=datetime.timezone.utc)
    logger.info("Starting leaf-focus")

    # Store the validated paths on app_args so the later steps use them.
    app_args.input_pdf = utils.validate_path(
        "input pdf",
        app_args.input_pdf,
        ValidatePathMethod.MUST_EXIST,
    )
    app_args.output_dir = utils.validate_path(
        "output directory",
        app_args.output_dir,
        ValidatePathMethod.NO_OPINION,
    )

    out_dir = app_args.output_dir
    if out_dir.is_dir():
        logger.info("Using output directory '%s'.", out_dir)
    else:
        logger.warning("Creating output directory '%s'.", out_dir)
        out_dir.mkdir(exist_ok=True, parents=True)

    program = xpdf.XpdfProgram(self._exe_dir)

    # The file info and embedded text steps always run.
    self.pdf_info(program, app_args)
    self.pdf_text(program, app_args)

    # Page images are needed for saving them and as the input to OCR.
    need_images = app_args.save_page_images or app_args.run_ocr
    image_result = self.pdf_images(program, app_args) if need_images else None

    if app_args.run_ocr and image_result:
        # pdf_ocr is a generator: it must be iterated for the recognition
        # calls to happen. The yielded results are not used here.
        for _ in self.pdf_ocr(image_result, app_args):
            pass

    elapsed = datetime.datetime.now(tz=datetime.timezone.utc) - started_at
    logger.info("Finished (duration %s)", elapsed)
    return True
Run the application.
Arguments:
- app_args: The application arguments.
Returns:
bool: True if the text extraction succeeded, otherwise false.
def
pdf_info( self, prog: leaf_focus.pdf.xpdf.XpdfProgram, app_args: AppArgs) -> leaf_focus.pdf.model.XpdfInfoResult:
def pdf_info(
    self,
    prog: xpdf.XpdfProgram,
    app_args: AppArgs,
) -> pdf_model.XpdfInfoResult:
    """Get the pdf file information, including metadata.

    Args:
        prog: The program to run.
        app_args: The application arguments; supplies the input pdf, the
            output directory and the first / last page.

    Returns:
        pdf_model.XpdfInfoResult: The result from the program.
    """
    info_args = pdf_model.XpdfInfoArgs(
        include_metadata=True,
        first_page=app_args.first_page,
        last_page=app_args.last_page,
    )
    return prog.info(app_args.input_pdf, app_args.output_dir, info_args)
Get the pdf file information.
Arguments:
- prog: The program to run.
- app_args: The application arguments.
Returns:
pdf_model.XpdfInfoResult: The result from the program.
def
pdf_text( self, prog: leaf_focus.pdf.xpdf.XpdfProgram, app_args: AppArgs) -> leaf_focus.pdf.model.XpdfTextResult:
def pdf_text(
    self,
    prog: xpdf.XpdfProgram,
    app_args: AppArgs,
) -> pdf_model.XpdfTextResult:
    """Get the text embedded in the pdf, keeping the original layout.

    Args:
        prog: The program to run.
        app_args: The application arguments; supplies the input pdf, the
            output directory and the first / last page.

    Returns:
        pdf_model.XpdfTextResult: The result from the program.
    """
    text_args = pdf_model.XpdfTextArgs(
        line_end_type=pdf_model.XpdfTextArgs.get_line_ending(),
        use_original_layout=True,
        first_page=app_args.first_page,
        last_page=app_args.last_page,
    )
    return prog.text(app_args.input_pdf, app_args.output_dir, text_args)
Get the text embedded in the pdf.
Arguments:
- prog: The program to run.
- app_args: The application arguments.
Returns:
pdf_model.XpdfTextResult: The result from the program.
def
pdf_images( self, prog: leaf_focus.pdf.xpdf.XpdfProgram, app_args: AppArgs) -> leaf_focus.pdf.model.XpdfImageResult:
def pdf_images(
    self,
    prog: xpdf.XpdfProgram,
    app_args: AppArgs,
) -> pdf_model.XpdfImageResult:
    """Get each page in the pdf as a separate grayscale image.

    Args:
        prog: The program to run.
        app_args: The application arguments; supplies the input pdf and the
            output directory.

    Returns:
        pdf_model.XpdfImageResult: The result from the program.
    """
    # NOTE(review): first_page / last_page are not forwarded here, unlike
    # pdf_info and pdf_text - confirm whether XpdfImageArgs accepts a page
    # range and whether that omission is intended.
    image_args = pdf_model.XpdfImageArgs(use_grayscale=True)
    return prog.image(app_args.input_pdf, app_args.output_dir, image_args)
Get each page in the pdf as a separate image.
Arguments:
- prog: The program to run.
- app_args: The application arguments.
Returns:
pdf_model.XpdfImageResult: The result from the program.
def
pdf_ocr( self, xpdf_image: leaf_focus.pdf.model.XpdfImageResult, app_args: AppArgs) -> Generator[leaf_focus.ocr.model.KerasOcrResult, typing.Any, None]:
def pdf_ocr(
    self,
    xpdf_image: pdf_model.XpdfImageResult,
    app_args: AppArgs,
) -> typing.Generator[ocr_model.KerasOcrResult, typing.Any, None]:
    """Recognise text on the pdf page images.

    This is a generator: nothing is recognised until it is iterated.

    Args:
        xpdf_image: The result from the pdf image program.
        app_args: The application arguments; supplies the output directory.

    Yields:
        ocr_model.KerasOcrResult: The text recognition result for each file
            in ``xpdf_image.output_files``, in order.
    """
    recogniser = keras_ocr.OpticalCharacterRecognition()
    yield from (
        recogniser.recognise_text(page_image, app_args.output_dir)
        for page_image in xpdf_image.output_files
    )
Recognise text on the pdf page images.
Arguments:
- xpdf_image: The result from the pdf image program.
- app_args: The application arguments.
Returns:
typing.Generator[ocr_model.KerasOcrResult, typing.Any, None]: Yield text recognition results for each pdf page image.