Edit on GitHub

leaf_focus.pdf.model

PDF processing models.

  1"""PDF processing models."""
  2
  3from __future__ import annotations
  4
  5import dataclasses
  6import logging
  7import pathlib
  8import platform
  9
 10from datetime import datetime
 11
 12from beartype import beartype, typing
 13
 14
 15logger = logging.getLogger(__name__)
 16
 17
 18@beartype
 19@dataclasses.dataclass
 20class XpdfArgs:
 21    """xpdf arguments common to all commands."""
 22
 23    owner_password: str | None = dataclasses.field(
 24        metadata={"leaf_focus": {"cmd": "-opw", "cmd_type": "single"}},
 25        default=None,
 26    )
 27    """
 28    Specify the owner password for the PDF file.
 29    Providing this will bypass all security restrictions.
 30
 31    -opw <string>          : owner password (for encrypted files)
 32    """
 33
 34    user_password: str | None = dataclasses.field(
 35        metadata={"leaf_focus": {"cmd": "-upw", "cmd_type": "single"}},
 36        default=None,
 37    )
 38    """
 39    Specify the user password for the PDF file.
 40
 41    -upw <string>          : user password (for encrypted files)
 42    """
 43
 44    first_page: int | None = dataclasses.field(
 45        metadata={"leaf_focus": {"cmd": "-f", "cmd_type": "single"}},
 46        default=None,
 47    )
 48    """
 49    Specifies the first page to convert.
 50
 51    -f <int>               : first page to convert
 52    """
 53
 54    last_page: int | None = dataclasses.field(
 55        metadata={"leaf_focus": {"cmd": "-l", "cmd_type": "single"}},
 56        default=None,
 57    )
 58    """
 59    Specifies the last page to convert.
 60
 61    -l <int>               : last page to convert
 62    """
 63
 64    use_verbose: bool | None = dataclasses.field(
 65        metadata={"leaf_focus": {"cmd": "-verbose", "cmd_type": "bool"}},
 66        default=False,
 67    )
 68    """
 69    Print a status message (to stdout) before processing each page.
 70
 71    -verbose               : print per-page status information
 72    """
 73
 74    config_file: pathlib.Path | None = dataclasses.field(
 75        metadata={"leaf_focus": {"cmd": "-cfg", "cmd_type": "single"}},
 76        default=None,
 77    )
 78    """
 79    Read config-file in place of ~/.xpdfrc or the system-wide config file.
 80
 81    -cfg <string>     : configuration file to use in place of .xpdfrc
 82    """
 83
 84    program_info: bool | None = dataclasses.field(
 85        metadata={"leaf_focus": {"cmd": "-v", "cmd_type": "bool"}},
 86        default=False,
 87    )
 88    """
 89    Print copyright and version information.
 90
 91    -v                : print copyright and version info
 92    """
 93
 94
 95@beartype
 96@dataclasses.dataclass
 97class XpdfInfoArgs(XpdfArgs):
 98    """Arguments for xpdf pdfinfo program."""
 99
100    include_page_bounding_boxes: bool | None = dataclasses.field(
101        metadata={"leaf_focus": {"cmd": "-box", "cmd_type": "bool"}},
102        default=False,
103    )
104    """
105    Prints the page box bounding boxes:
106    MediaBox, CropBox, BleedBox, TrimBox, and ArtBox.
107
108    -box              : print the page bounding boxes
109    """
110
111    include_metadata: bool | None = dataclasses.field(
112        metadata={"leaf_focus": {"cmd": "-meta", "cmd_type": "bool"}},
113        default=False,
114    )
115    """
116    Prints document-level metadata.
117    This is the "Metadata" stream from the PDF file`s Catalog object.
118
119    -meta             : print the document metadata (XML)
120    """
121
122    include_raw_dates: bool | None = dataclasses.field(
123        metadata={"leaf_focus": {"cmd": "-rawdates", "cmd_type": "bool"}},
124        default=False,
125    )
126    """
127    Prints the raw (undecoded) date strings, directly from the PDF file.
128
129    -rawdates         : print the undecoded date strings directly from the PDF file
130    """
131
132    encoding: str | None = dataclasses.field(
133        metadata={"leaf_focus": {"cmd": "-enc", "cmd_type": "single"}},
134        default="Latin1",
135    )
136    """
137    Sets the encoding to use for text output.
138    The encoding-name must be defined with the unicodeMap command.
139    This defaults to "Latin1" (which is a built-in encoding).
140
141    -enc <string>          : output text encoding name
142    """
143
144
@beartype
@dataclasses.dataclass
class XpdfInfoResult:
    """Result from xpdf pdfinfo program.

    None of the fields declare a default, so every field must be supplied
    when constructing an instance; each value may be ``None``.

    Each field's ``leaf_focus`` metadata holds a ``name`` entry.
    NOTE(review): presumably ``name`` is the label of the matching entry in
    the pdfinfo output - confirm against the code that builds this class.
    """

    # pdf info: descriptive properties of the document
    title: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Title"}},
    )
    subject: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Subject"}},
    )
    keywords: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Keywords"}},
    )
    author: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Author"}},
    )
    creator: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Creator"}},
    )
    producer: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Producer"}},
    )
    creation_date: datetime | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "CreationDate"}},
    )
    modification_date: datetime | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "ModDate"}},
    )

    # additional info: structural / technical properties of the file
    tagged: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Tagged"}},
    )
    form: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Form"}},
    )
    pages: int | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Pages"}},
    )
    encrypted: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Encrypted"}},
    )
    page_size: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Page size"}},
    )
    # the five page boxes are kept as plain strings
    media_box: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "MediaBox"}},
    )
    crop_box: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "CropBox"}},
    )
    bleed_box: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "BleedBox"}},
    )
    trim_box: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "TrimBox"}},
    )
    art_box: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "ArtBox"}},
    )
    file_size_bytes: int | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "File size"}},
    )
    optimized: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Optimized"}},
    )
    pdf_version: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "PDF version"}},
    )
    javascript: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "JavaScript"}},
    )

    # xml metadata, held as a dictionary with string keys
    metadata: dict[str, typing.Any] | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Metadata"}},
    )
224
225
@beartype
@dataclasses.dataclass
class XpdfTextArgs(XpdfArgs):
    """Arguments for xpdf pdftotext program."""

    use_original_layout: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-layout", "cmd_type": "bool"}},
        default=False,
    )
    """
    Maintain (as best as possible) the original physical layout of the text.

    -layout                : maintain original physical layout
    """

    use_simple_layout: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-simple", "cmd_type": "bool"}},
        default=False,
    )
    """
    Similar to -layout, but optimized for simple one-column pages.
    This mode will do a better job of maintaining horizontal spacing,
    but it will only work properly with a single column of text.

    -simple                : simple one-column page layout
    """

    use_simple2_layout: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-simple2", "cmd_type": "bool"}},
        default=False,
    )
    """
    Similar to -simple, but handles slightly rotated text
    (e.g., OCR output) better.
    Only works for pages with a single column of text.

    -simple2               : simple one-column page layout, version 2
    """

    use_table_layout: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-table", "cmd_type": "bool"}},
        default=False,
    )
    """
    Table mode is similar to physical layout mode, but optimized for tabular data,
    with the goal of keeping rows and columns aligned
    (at the expense of inserting extra whitespace).
    If the -fixed option is given, character spacing within
    each line will be determined by the specified character pitch.

    -table                 : similar to -layout, but optimized for tables
    """

    use_line_printer: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-lineprinter", "cmd_type": "bool"}},
        default=False,
    )
    """
    Line printer mode uses a strict fixed-character-pitch and -height layout.
    That is, the page is broken into a grid, and characters are placed into that grid.
    If the grid spacing is too small for the actual characters,
    the result is extra whitespace.
    If the grid spacing is too large, the result is missing whitespace.
    The grid spacing can be specified using the -fixed and -linespacing options.
    If one or both are not given on the command line,
    pdftotext will attempt to compute appropriate value(s).

    -lineprinter           : use strict fixed-pitch/height layout
    """

    use_raw_string_order: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-raw", "cmd_type": "bool"}},
        default=False,
    )
    """
    Keep the text in content stream order.
    Depending on how the PDF file was generated, this may or may not be useful.

    -raw                   : keep strings in content stream order
    """

    use_text_clip: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-clip", "cmd_type": "bool"}},
        default=False,
    )
    """
    Text which is hidden because of clipping is removed before doing layout,
    and then added back in. This can be helpful for tables where
    clipped (invisible) text would overlap the next column.

    -clip                  : separate clipped text
    """

    use_no_diag: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-nodiag", "cmd_type": "bool"}},
        default=False,
    )
    """
    Diagonal text, i.e., text that is not close to one of
    the 0, 90, 180, or 270 degree axes, is discarded.
    This is useful to skip watermarks drawn on top of body text, etc.

    -nodiag                : discard diagonal text
    """

    use_no_page_break: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-nopgbrk", "cmd_type": "bool"}},
        default=False,
    )
    """
    Don't insert a page break (form feed character) at the
    end of each page.

    -nopgbrk               : don't insert a page break at the end of each page
    """

    # The switch is "-bom", as documented below.
    # The previous value "-nom" did not match the documented option.
    use_bom: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-bom", "cmd_type": "bool"}},
        default=False,
    )
    """
    Insert a Unicode byte order marker (BOM) at the start of the text output.

    -bom                   : insert a Unicode BOM at the start of the text file
    """

    # NOTE(review): this repeats the 'use_verbose' field of XpdfArgs with an
    # identical definition; it is kept so the class attributes stay unchanged.
    use_verbose: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-verbose", "cmd_type": "bool"}},
        default=False,
    )
    """
    Print a status message (to stdout) before processing each page.

    -verbose               : print per-page status information
    """

    fixed_text_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-fixed", "cmd_type": "single"}},
        default=None,
    )
    """
    Specify the character pitch (character width), in points,
    for physical layout, table, or line printer mode.
    This is ignored in all other modes.

    -fixed <number>        : assume fixed-pitch (or tabular) text
    """

    line_space_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-linespacing", "cmd_type": "single"}},
        default=None,
    )
    """
    Specify the line spacing, in points, for line printer mode.
    This is ignored in all other modes.

    -linespacing <number>  : fixed line spacing for LinePrinter mode
    """

    line_end_type: str | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-eol", "cmd_type": "single"}},
        default=None,
    )
    """
    Sets the end-of-line convention to use for text output.

    -eol <string>          : output end-of-line convention (unix, dos, or mac)
    """

    margin_left_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-marginl", "cmd_type": "single"}},
        default=0,
    )
    """
    Specifies the left margin, in points.
    Text in the left margin
    (i.e., within that many points of the left edge of the page) is discarded.
    The default value is zero.

    -marginl <number>      : left page margin
    """

    margin_right_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-marginr", "cmd_type": "single"}},
        default=0,
    )
    """
    Specifies the right margin, in points.
    Text in the right margin (i.e., within that many points of the
    right edge of the page) is discarded.
    The default value is zero.

    -marginr <number>      : right page margin
    """

    # NOTE(review): the name lacks the underscore used by the other margin
    # fields ('margin_top_number'); it is left as-is because renaming it
    # would break callers that pass it by keyword.
    margin_topnumber: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-margint", "cmd_type": "single"}},
        default=0,
    )
    """
    Specifies the top margin, in points.
    Text in the top margin (i.e., within that many points of the top
    edge of the page) is discarded.
    The default value is zero.

    -margint <number>      : top page margin
    """

    margin_bottom_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-marginb", "cmd_type": "single"}},
        default=0,
    )
    """
    Specifies the bottom margin, in points.
    Text in the bottom margin (i.e., within that many points of the
    bottom edge of the page) is discarded.
    The default value is zero.

    -marginb <number>      : bottom page margin
    """

    @classmethod
    def get_line_ending(cls) -> str:
        """Get the line endings based on the current platform.

        The platform name comes from ``platform.system()``.
        Any platform other than Linux, Darwin, or Windows (including an
        empty name when the platform cannot be determined) falls back to
        ``"unix"`` rather than raising ``KeyError``.

        Returns:
            The line ending style: ``"unix"``, ``"mac"``, or ``"dos"``.
        """
        # NOTE(review): Darwin maps to "mac"; confirm this is the convention
        # wanted on current macOS before relying on it.
        opts = {
            "Linux": "unix",
            "Darwin": "mac",
            "Windows": "dos",
        }
        return opts.get(platform.system(), "unix")
461
462
@beartype
@dataclasses.dataclass
class XpdfTextResult:
    """Result for xpdf pdftotext program.

    Only ``output_path`` is required; ``stdout`` and ``stderr`` each default
    to a new empty list.
    """

    # NOTE(review): presumably the text file written by pdftotext - confirm.
    output_path: pathlib.Path
    # NOTE(review): presumably the captured output of the program - confirm.
    stdout: typing.Collection[str] = dataclasses.field(default_factory=list)
    stderr: typing.Collection[str] = dataclasses.field(default_factory=list)
471
472
@beartype
@dataclasses.dataclass
class XpdfImageArgs(XpdfArgs):
    """Options for the xpdf pdftopng program."""

    resolution: int | None = dataclasses.field(
        default=150,
        metadata={"leaf_focus": {"cmd": "-r", "cmd_type": "single"}},
    )
    """
    The resolution, in DPI. The default is 150 DPI.

    -r <number>       : resolution, in DPI (default is 150)
    """

    use_monochrome: bool | None = dataclasses.field(
        default=False,
        metadata={"leaf_focus": {"cmd": "-mono", "cmd_type": "bool"}},
    )
    """
    Produce a monochrome image (instead of a color image).

    -mono             : generate a monochrome PNG file
    """

    use_grayscale: bool | None = dataclasses.field(
        default=False,
        metadata={"leaf_focus": {"cmd": "-gray", "cmd_type": "bool"}},
    )
    """
    Produce a grayscale image (instead of a color image).

    -gray             : generate a grayscale PNG file
    """

    use_alpha_channel: bool | None = dataclasses.field(
        default=False,
        metadata={"leaf_focus": {"cmd": "-alpha", "cmd_type": "bool"}},
    )
    """
    Produce an alpha channel in the PNG file.
    This is only useful with PDF files that have been constructed
    with a transparent background.
    The -alpha flag cannot be used with -mono.

    -alpha            : include an alpha channel in the PNG file
    """

    rotation: int | None = dataclasses.field(
        default=None,
        metadata={"leaf_focus": {"cmd": "-rot", "cmd_type": "single"}},
    )
    """
    Rotate pages by 0 (the default), 90, 180, or 270 degrees.

    -rot <int>        : set page rotation: 0, 90, 180, or 270
    """

    free_type: str | None = dataclasses.field(
        default="yes",
        metadata={"leaf_focus": {"cmd": "-freetype", "cmd_type": "single"}},
    )
    """
    Turn FreeType (a TrueType / Type 1 font rasterizer) on or off.
    The default is "yes".

    -freetype <string>: enable FreeType font rasterizer: yes, no
    """

    anti_aliasing: str | None = dataclasses.field(
        default="yes",
        metadata={"leaf_focus": {"cmd": "-aa", "cmd_type": "single"}},
    )
    """
    Turn font anti-aliasing on or off.
    The default is "yes".

    -aa <string>      : enable font anti-aliasing: yes, no
    """

    vector_anti_aliasing: str | None = dataclasses.field(
        default="yes",
        metadata={"leaf_focus": {"cmd": "-aaVector", "cmd_type": "single"}},
    )
    """
    Turn vector anti-aliasing on or off.
    The default is "yes".

    -aaVector <string>: enable vector anti-aliasing: yes, no
    """
559
560
@beartype
@dataclasses.dataclass
class XpdfImageResult:
    """Result for xpdf pdftopng program.

    ``output_dir`` and ``output_files`` are required; ``stdout`` and
    ``stderr`` each default to a new empty list.
    """

    # NOTE(review): presumably the directory the PNG files were written to,
    # and the paths of those files - confirm against the caller.
    output_dir: pathlib.Path
    output_files: typing.Collection[pathlib.Path]
    # NOTE(review): presumably the captured output of the program - confirm.
    stdout: typing.Collection[str] = dataclasses.field(default_factory=list)
    stderr: typing.Collection[str] = dataclasses.field(default_factory=list)
logger = <Logger leaf_focus.pdf.model (WARNING)>
@beartype
@dataclasses.dataclass
class XpdfArgs:
19@beartype
20@dataclasses.dataclass
21class XpdfArgs:
22    """xpdf arguments common to all commands."""
23
24    owner_password: str | None = dataclasses.field(
25        metadata={"leaf_focus": {"cmd": "-opw", "cmd_type": "single"}},
26        default=None,
27    )
28    """
29    Specify the owner password for the PDF file.
30    Providing this will bypass all security restrictions.
31
32    -opw <string>          : owner password (for encrypted files)
33    """
34
35    user_password: str | None = dataclasses.field(
36        metadata={"leaf_focus": {"cmd": "-upw", "cmd_type": "single"}},
37        default=None,
38    )
39    """
40    Specify the user password for the PDF file.
41
42    -upw <string>          : user password (for encrypted files)
43    """
44
45    first_page: int | None = dataclasses.field(
46        metadata={"leaf_focus": {"cmd": "-f", "cmd_type": "single"}},
47        default=None,
48    )
49    """
50    Specifies the first page to convert.
51
52    -f <int>               : first page to convert
53    """
54
55    last_page: int | None = dataclasses.field(
56        metadata={"leaf_focus": {"cmd": "-l", "cmd_type": "single"}},
57        default=None,
58    )
59    """
60    Specifies the last page to convert.
61
62    -l <int>               : last page to convert
63    """
64
65    use_verbose: bool | None = dataclasses.field(
66        metadata={"leaf_focus": {"cmd": "-verbose", "cmd_type": "bool"}},
67        default=False,
68    )
69    """
70    Print a status message (to stdout) before processing each page.
71
72    -verbose               : print per-page status information
73    """
74
75    config_file: pathlib.Path | None = dataclasses.field(
76        metadata={"leaf_focus": {"cmd": "-cfg", "cmd_type": "single"}},
77        default=None,
78    )
79    """
80    Read config-file in place of ~/.xpdfrc or the system-wide config file.
81
82    -cfg <string>     : configuration file to use in place of .xpdfrc
83    """
84
85    program_info: bool | None = dataclasses.field(
86        metadata={"leaf_focus": {"cmd": "-v", "cmd_type": "bool"}},
87        default=False,
88    )
89    """
90    Print copyright and version information.
91
92    -v                : print copyright and version info
93    """

xpdf arguments common to all commands.

XpdfArgs( owner_password: str | None = None, user_password: str | None = None, first_page: int | None = None, last_page: int | None = None, use_verbose: bool | None = False, config_file: pathlib.Path | None = None, program_info: bool | None = False)
owner_password: str | None = None

Specify the owner password for the PDF file. Providing this will bypass all security restrictions.

-opw <string> : owner password (for encrypted files)

user_password: str | None = None

Specify the user password for the PDF file.

-upw <string> : user password (for encrypted files)

first_page: int | None = None

Specifies the first page to convert.

-f <int> : first page to convert

last_page: int | None = None

Specifies the last page to convert.

-l <int> : last page to convert

use_verbose: bool | None = False

Print a status message (to stdout) before processing each page.

-verbose : print per-page status information

config_file: pathlib.Path | None = None

Read config-file in place of ~/.xpdfrc or the system-wide config file.

-cfg <string> : configuration file to use in place of .xpdfrc

program_info: bool | None = False

Print copyright and version information.

-v : print copyright and version info

@beartype
@dataclasses.dataclass
class XpdfInfoArgs(XpdfArgs):
 96@beartype
 97@dataclasses.dataclass
 98class XpdfInfoArgs(XpdfArgs):
 99    """Arguments for xpdf pdfinfo program."""
100
101    include_page_bounding_boxes: bool | None = dataclasses.field(
102        metadata={"leaf_focus": {"cmd": "-box", "cmd_type": "bool"}},
103        default=False,
104    )
105    """
106    Prints the page box bounding boxes:
107    MediaBox, CropBox, BleedBox, TrimBox, and ArtBox.
108
109    -box              : print the page bounding boxes
110    """
111
112    include_metadata: bool | None = dataclasses.field(
113        metadata={"leaf_focus": {"cmd": "-meta", "cmd_type": "bool"}},
114        default=False,
115    )
116    """
117    Prints document-level metadata.
118    This is the "Metadata" stream from the PDF file`s Catalog object.
119
120    -meta             : print the document metadata (XML)
121    """
122
123    include_raw_dates: bool | None = dataclasses.field(
124        metadata={"leaf_focus": {"cmd": "-rawdates", "cmd_type": "bool"}},
125        default=False,
126    )
127    """
128    Prints the raw (undecoded) date strings, directly from the PDF file.
129
130    -rawdates         : print the undecoded date strings directly from the PDF file
131    """
132
133    encoding: str | None = dataclasses.field(
134        metadata={"leaf_focus": {"cmd": "-enc", "cmd_type": "single"}},
135        default="Latin1",
136    )
137    """
138    Sets the encoding to use for text output.
139    The encoding-name must be defined with the unicodeMap command.
140    This defaults to "Latin1" (which is a built-in encoding).
141
142    -enc <string>          : output text encoding name
143    """

Arguments for xpdf pdfinfo program.

XpdfInfoArgs( owner_password: str | None = None, user_password: str | None = None, first_page: int | None = None, last_page: int | None = None, use_verbose: bool | None = False, config_file: pathlib.Path | None = None, program_info: bool | None = False, include_page_bounding_boxes: bool | None = False, include_metadata: bool | None = False, include_raw_dates: bool | None = False, encoding: str | None = 'Latin1')
include_page_bounding_boxes: bool | None = False

Prints the page box bounding boxes: MediaBox, CropBox, BleedBox, TrimBox, and ArtBox.

-box : print the page bounding boxes

include_metadata: bool | None = False

Prints document-level metadata. This is the "Metadata" stream from the PDF file's Catalog object.

-meta : print the document metadata (XML)

include_raw_dates: bool | None = False

Prints the raw (undecoded) date strings, directly from the PDF file.

-rawdates : print the undecoded date strings directly from the PDF file

encoding: str | None = 'Latin1'

Sets the encoding to use for text output. The encoding-name must be defined with the unicodeMap command. This defaults to "Latin1" (which is a built-in encoding).

-enc <string> : output text encoding name

@beartype
@dataclasses.dataclass
class XpdfInfoResult:
146@beartype
147@dataclasses.dataclass
148class XpdfInfoResult:
149    """Result from xpdf pdfinfo program."""
150
151    # pdf info
152    title: str | None = dataclasses.field(
153        metadata={"leaf_focus": {"name": "Title"}},
154    )
155    subject: str | None = dataclasses.field(
156        metadata={"leaf_focus": {"name": "Subject"}},
157    )
158    keywords: str | None = dataclasses.field(
159        metadata={"leaf_focus": {"name": "Keywords"}},
160    )
161    author: str | None = dataclasses.field(
162        metadata={"leaf_focus": {"name": "Author"}},
163    )
164    creator: str | None = dataclasses.field(
165        metadata={"leaf_focus": {"name": "Creator"}},
166    )
167    producer: str | None = dataclasses.field(
168        metadata={"leaf_focus": {"name": "Producer"}},
169    )
170    creation_date: datetime | None = dataclasses.field(
171        metadata={"leaf_focus": {"name": "CreationDate"}},
172    )
173    modification_date: datetime | None = dataclasses.field(
174        metadata={"leaf_focus": {"name": "ModDate"}},
175    )
176
177    # additional info
178    tagged: bool | None = dataclasses.field(
179        metadata={"leaf_focus": {"name": "Tagged"}},
180    )
181    form: str | None = dataclasses.field(
182        metadata={"leaf_focus": {"name": "Form"}},
183    )
184    pages: int | None = dataclasses.field(
185        metadata={"leaf_focus": {"name": "Pages"}},
186    )
187    encrypted: bool | None = dataclasses.field(
188        metadata={"leaf_focus": {"name": "Encrypted"}},
189    )
190    page_size: str | None = dataclasses.field(
191        metadata={"leaf_focus": {"name": "Page size"}},
192    )
193    media_box: str | None = dataclasses.field(
194        metadata={"leaf_focus": {"name": "MediaBox"}},
195    )
196    crop_box: str | None = dataclasses.field(
197        metadata={"leaf_focus": {"name": "CropBox"}},
198    )
199    bleed_box: str | None = dataclasses.field(
200        metadata={"leaf_focus": {"name": "BleedBox"}},
201    )
202    trim_box: str | None = dataclasses.field(
203        metadata={"leaf_focus": {"name": "TrimBox"}},
204    )
205    art_box: str | None = dataclasses.field(
206        metadata={"leaf_focus": {"name": "ArtBox"}},
207    )
208    file_size_bytes: int | None = dataclasses.field(
209        metadata={"leaf_focus": {"name": "File size"}},
210    )
211    optimized: bool | None = dataclasses.field(
212        metadata={"leaf_focus": {"name": "Optimized"}},
213    )
214    pdf_version: str | None = dataclasses.field(
215        metadata={"leaf_focus": {"name": "PDF version"}},
216    )
217    javascript: str | None = dataclasses.field(
218        metadata={"leaf_focus": {"name": "JavaScript"}},
219    )
220
221    # xml metadata
222    metadata: dict[str, typing.Any] | None = dataclasses.field(
223        metadata={"leaf_focus": {"name": "Metadata"}},
224    )

Result from xpdf pdfinfo program.

XpdfInfoResult( title: str | None, subject: str | None, keywords: str | None, author: str | None, creator: str | None, producer: str | None, creation_date: datetime.datetime | None, modification_date: datetime.datetime | None, tagged: bool | None, form: str | None, pages: int | None, encrypted: bool | None, page_size: str | None, media_box: str | None, crop_box: str | None, bleed_box: str | None, trim_box: str | None, art_box: str | None, file_size_bytes: int | None, optimized: bool | None, pdf_version: str | None, javascript: str | None, metadata: dict[str, typing.Any] | None)
title: str | None
subject: str | None
keywords: str | None
author: str | None
creator: str | None
producer: str | None
creation_date: datetime.datetime | None
modification_date: datetime.datetime | None
tagged: bool | None
form: str | None
pages: int | None
encrypted: bool | None
page_size: str | None
media_box: str | None
crop_box: str | None
bleed_box: str | None
trim_box: str | None
art_box: str | None
file_size_bytes: int | None
optimized: bool | None
pdf_version: str | None
javascript: str | None
metadata: dict[str, typing.Any] | None
@beartype
@dataclasses.dataclass
class XpdfTextArgs(XpdfArgs):
@beartype
@dataclasses.dataclass
class XpdfTextArgs(XpdfArgs):
    """Arguments for xpdf pdftotext program.

    Each field carries ``leaf_focus`` metadata naming the command line
    option (``cmd``) and how it is passed (``cmd_type``): ``bool`` options
    are flags, ``single`` options take one value.
    """

    use_original_layout: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-layout", "cmd_type": "bool"}},
        default=False,
    )
    """
    Maintain (as best as possible) the original physical layout of the text.

    -layout                : maintain original physical layout
    """

    use_simple_layout: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-simple", "cmd_type": "bool"}},
        default=False,
    )
    """
    Similar to -layout, but optimized for simple one-column pages.
    This mode will do a better job of maintaining horizontal spacing,
    but it will only work properly with a single column of text.

    -simple                : simple one-column page layout
    """

    use_simple2_layout: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-simple2", "cmd_type": "bool"}},
        default=False,
    )
    """
    Similar to -simple, but handles slightly rotated text
    (e.g., OCR output) better.
    Only works for pages with a single column of text.

    -simple2               : simple one-column page layout, version 2
    """

    use_table_layout: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-table", "cmd_type": "bool"}},
        default=False,
    )
    """
    Table mode is similar to physical layout mode, but optimized for tabular data,
    with the goal of keeping rows and columns aligned
    (at the expense of inserting extra whitespace).
    If the -fixed option is given, character spacing within
    each line will be determined by the specified character pitch.

    -table                 : similar to -layout, but optimized for tables
    """

    use_line_printer: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-lineprinter", "cmd_type": "bool"}},
        default=False,
    )
    """
    Line printer mode uses a strict fixed-character-pitch and -height layout.
    That is, the page is broken into a grid, and characters are placed into that grid.
    If the grid spacing is too small for the actual characters,
    the result is extra whitespace.
    If the grid spacing is too large, the result is missing whitespace.
    The grid spacing can be specified using the -fixed and -linespacing options.
    If one or both are not given on the command line,
    pdftotext will attempt to compute appropriate value(s).

    -lineprinter           : use strict fixed-pitch/height layout
    """

    use_raw_string_order: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-raw", "cmd_type": "bool"}},
        default=False,
    )
    """
    Keep the text in content stream order.
    Depending on how the PDF file was generated, this may or may not be useful.

    -raw                   : keep strings in content stream order
    """

    use_text_clip: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-clip", "cmd_type": "bool"}},
        default=False,
    )
    """
    Text which is hidden because of clipping is removed before doing layout,
    and then added back in. This can be helpful for tables where
    clipped (invisible) text would overlap the next column.

    -clip                  : separate clipped text
    """

    use_no_diag: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-nodiag", "cmd_type": "bool"}},
        default=False,
    )
    """
    Diagonal text, i.e., text that is not close to one of
    the 0, 90, 180, or 270 degree axes, is discarded.
    This is useful to skip watermarks drawn on top of body text, etc.

    -nodiag                : discard diagonal text
    """

    use_no_page_break: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-nopgbrk", "cmd_type": "bool"}},
        default=False,
    )
    """
    Don't insert a page break (form feed character) at the
    end of each page.

    -nopgbrk               : don't insert a page break at the end of each page
    """

    use_bom: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-bom", "cmd_type": "bool"}},
        default=False,
    )
    """
    Insert a Unicode byte order marker (BOM) at the start of the text output.

    -bom                   : insert a Unicode BOM at the start of the text file
    """

    use_verbose: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-verbose", "cmd_type": "bool"}},
        default=False,
    )
    """
    Print a status message (to stdout) before processing each page.

    -verbose               : print per-page status information
    """

    fixed_text_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-fixed", "cmd_type": "single"}},
        default=None,
    )
    """
    Specify the character pitch (character width), in points,
    for physical layout, table, or line printer mode.
    This is ignored in all other modes.

    -fixed <number>        : assume fixed-pitch (or tabular) text
    """

    line_space_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-linespacing", "cmd_type": "single"}},
        default=None,
    )
    """
    Specify the line spacing, in points, for line printer mode.
    This is ignored in all other modes.

    -linespacing <number>  : fixed line spacing for LinePrinter mode
    """

    line_end_type: str | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-eol", "cmd_type": "single"}},
        default=None,
    )
    """
    Sets the end-of-line convention to use for text output.

    -eol <string>          : output end-of-line convention (unix, dos, or mac)
    """

    margin_left_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-marginl", "cmd_type": "single"}},
        default=0,
    )
    """
    Specifies the left margin, in points.
    Text in the left margin
    (i.e., within that many points of the left edge of the page) is discarded.
    The default value is zero.

    -marginl <number>      : left page margin
    """

    margin_right_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-marginr", "cmd_type": "single"}},
        default=0,
    )
    """
    Specifies the right margin, in points.
    Text in the right margin (i.e., within that many points of the
    right edge of the page) is discarded.
    The default value is zero.

    -marginr <number>      : right page margin
    """

    # The field name lacks the underscore used by the sibling margin fields;
    # it is kept as-is because renaming would break existing callers.
    margin_topnumber: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-margint", "cmd_type": "single"}},
        default=0,
    )
    """
    Specifies the top margin, in points.
    Text in the top margin (i.e., within that many points of the top
    edge of the page) is discarded.
    The default value is zero.

    -margint <number>      : top page margin
    """

    margin_bottom_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-marginb", "cmd_type": "single"}},
        default=0,
    )
    """
    Specifies the bottom margin, in points.
    Text in the bottom margin (i.e., within that many points of the
    bottom edge of the page) is discarded.
    The default value is zero.

    -marginb <number>      : bottom page margin
    """

    @classmethod
    def get_line_ending(cls) -> str:
        """Get the end-of-line convention name for the current platform.

        The returned name is one of the values accepted by the ``-eol``
        option (see ``line_end_type``).

        Returns:
            ``"unix"`` on Linux, ``"mac"`` on Darwin, ``"dos"`` on Windows,
            and ``"unix"`` for any other platform name.
        """
        # NOTE(review): xpdf's "mac" convention is presumably the classic
        # Mac OS bare-CR ending; confirm it is really wanted on Darwin.
        opts = {
            "Linux": "unix",
            "Darwin": "mac",
            "Windows": "dos",
        }
        # platform.system() may report names missing from the table
        # (for example "FreeBSD", "Java", or an empty string); fall back
        # to "unix" instead of failing with a bare KeyError.
        return opts.get(platform.system(), "unix")

Arguments for xpdf pdftotext program.

XpdfTextArgs( owner_password: str | None = None, user_password: str | None = None, first_page: int | None = None, last_page: int | None = None, use_verbose: bool | None = False, config_file: pathlib.Path | None = None, program_info: bool | None = False, use_original_layout: bool | None = False, use_simple_layout: bool | None = False, use_simple2_layout: bool | None = False, use_table_layout: bool | None = False, use_line_printer: bool | None = False, use_raw_string_order: bool | None = False, use_text_clip: bool | None = False, use_no_diag: bool | None = False, use_no_page_break: bool | None = False, use_bom: bool | None = False, fixed_text_number: int | None = None, line_space_number: int | None = None, line_end_type: str | None = None, margin_left_number: int | None = 0, margin_right_number: int | None = 0, margin_topnumber: int | None = 0, margin_bottom_number: int | None = 0)
use_original_layout: bool | None = False

Maintain (as best as possible) the original physical layout of the text.

-layout : maintain original physical layout

use_simple_layout: bool | None = False

Similar to -layout, but optimized for simple one-column pages. This mode will do a better job of maintaining horizontal spacing, but it will only work properly with a single column of text.

-simple : simple one-column page layout

use_simple2_layout: bool | None = False

Similar to -simple, but handles slightly rotated text (e.g., OCR output) better. Only works for pages with a single column of text.

-simple2 : simple one-column page layout, version 2

use_table_layout: bool | None = False

Table mode is similar to physical layout mode, but optimized for tabular data, with the goal of keeping rows and columns aligned (at the expense of inserting extra whitespace). If the -fixed option is given, character spacing within each line will be determined by the specified character pitch.

-table : similar to -layout, but optimized for tables

use_line_printer: bool | None = False

Line printer mode uses a strict fixed-character-pitch and -height layout. That is, the page is broken into a grid, and characters are placed into that grid. If the grid spacing is too small for the actual characters, the result is extra whitespace. If the grid spacing is too large, the result is missing whitespace. The grid spacing can be specified using the -fixed and -linespacing options. If one or both are not given on the command line, pdftotext will attempt to compute appropriate value(s).

-lineprinter : use strict fixed-pitch/height layout

use_raw_string_order: bool | None = False

Keep the text in content stream order. Depending on how the PDF file was generated, this may or may not be useful.

-raw : keep strings in content stream order

use_text_clip: bool | None = False

Text which is hidden because of clipping is removed before doing layout, and then added back in. This can be helpful for tables where clipped (invisible) text would overlap the next column.

-clip : separate clipped text

use_no_diag: bool | None = False

Diagonal text, i.e., text that is not close to one of the 0, 90, 180, or 270 degree axes, is discarded. This is useful to skip watermarks drawn on top of body text, etc.

-nodiag : discard diagonal text

use_no_page_break: bool | None = False

Don't insert a page break (form feed character) at the end of each page.

-nopgbrk : don't insert a page break at the end of each page

use_bom: bool | None = False

Insert a Unicode byte order marker (BOM) at the start of the text output.

-bom : insert a Unicode BOM at the start of the text file

use_verbose: bool | None = False

Print a status message (to stdout) before processing each page.

-verbose : print per-page status information

fixed_text_number: int | None = None

Specify the character pitch (character width), in points, for physical layout, table, or line printer mode. This is ignored in all other modes.

-fixed <number> : assume fixed-pitch (or tabular) text

line_space_number: int | None = None

Specify the line spacing, in points, for line printer mode. This is ignored in all other modes.

-linespacing <number> : fixed line spacing for LinePrinter mode

line_end_type: str | None = None

Sets the end-of-line convention to use for text output.

-eol <string> : output end-of-line convention (unix, dos, or mac)

margin_left_number: int | None = 0

Specifies the left margin, in points. Text in the left margin (i.e., within that many points of the left edge of the page) is discarded. The default value is zero.

-marginl <number> : left page margin

margin_right_number: int | None = 0

Specifies the right margin, in points. Text in the right margin (i.e., within that many points of the right edge of the page) is discarded. The default value is zero.

-marginr <number> : right page margin

margin_topnumber: int | None = 0

Specifies the top margin, in points. Text in the top margin (i.e., within that many points of the top edge of the page) is discarded. The default value is zero.

-margint <number> : top page margin

margin_bottom_number: int | None = 0

Specifies the bottom margin, in points. Text in the bottom margin (i.e., within that many points of the bottom edge of the page) is discarded. The default value is zero.

-marginb <number> : bottom page margin

@classmethod
def get_line_ending(cls) -> str:
447    @classmethod
448    def get_line_ending(cls) -> str:
449        """Get the line endings based on the current platform.
450
451        Returns:
452            The line ending style.
453        """
454        opts = {
455            "Linux": "unix",
456            "Darwin": "mac",
457            "Windows": "dos",
458        }
459        plat = platform.system()
460
461        return opts[plat]

Get the line endings based on the current platform.

Returns:

The line ending style.

@beartype
@dataclasses.dataclass
class XpdfTextResult:
@beartype
@dataclasses.dataclass
class XpdfTextResult:
    """Outcome of running the xpdf pdftotext program."""

    # Path recorded for the program output
    # (presumably the text file that was written; confirm against the caller).
    output_path: pathlib.Path

    # Standard output strings; a fresh empty list when not supplied.
    stdout: typing.Collection[str] = dataclasses.field(default_factory=list)

    # Standard error strings; a fresh empty list when not supplied.
    stderr: typing.Collection[str] = dataclasses.field(default_factory=list)

Result for xpdf pdftotext program.

XpdfTextResult( output_path: pathlib.Path, stdout: Collection[str] = <factory>, stderr: Collection[str] = <factory>)
output_path: pathlib.Path
stdout: Collection[str]
stderr: Collection[str]
@beartype
@dataclasses.dataclass
class XpdfImageArgs(XpdfArgs):
@beartype
@dataclasses.dataclass
class XpdfImageArgs(XpdfArgs):
    """Options accepted by the xpdf pdftopng program."""

    resolution: int | None = dataclasses.field(
        default=150,
        metadata={"leaf_focus": {"cmd": "-r", "cmd_type": "single"}},
    )
    """
    Output resolution in DPI; 150 DPI unless overridden.

    -r <number>       : resolution, in DPI (default is 150)
    """

    use_monochrome: bool | None = dataclasses.field(
        default=False,
        metadata={"leaf_focus": {"cmd": "-mono", "cmd_type": "bool"}},
    )
    """
    Produce a monochrome image rather than a color one.

    -mono             : generate a monochrome PNG file
    """

    use_grayscale: bool | None = dataclasses.field(
        default=False,
        metadata={"leaf_focus": {"cmd": "-gray", "cmd_type": "bool"}},
    )
    """
    Produce a grayscale image rather than a color one.

    -gray             : generate a grayscale PNG file
    """

    use_alpha_channel: bool | None = dataclasses.field(
        default=False,
        metadata={"leaf_focus": {"cmd": "-alpha", "cmd_type": "bool"}},
    )
    """
    Add an alpha channel to the PNG file.
    Only useful for PDF files that were built with a transparent background.
    Cannot be combined with -mono.

    -alpha            : include an alpha channel in the PNG file
    """

    rotation: int | None = dataclasses.field(
        default=None,
        metadata={"leaf_focus": {"cmd": "-rot", "cmd_type": "single"}},
    )
    """
    Page rotation in degrees: 0 (the default), 90, 180, or 270.

    -rot <int>        : set page rotation: 0, 90, 180, or 270
    """

    free_type: str | None = dataclasses.field(
        default="yes",
        metadata={"leaf_focus": {"cmd": "-freetype", "cmd_type": "single"}},
    )
    """
    Turn FreeType (a TrueType / Type 1 font rasterizer) on or off.
    The default is "yes".

    -freetype <string>: enable FreeType font rasterizer: yes, no
    """

    anti_aliasing: str | None = dataclasses.field(
        default="yes",
        metadata={"leaf_focus": {"cmd": "-aa", "cmd_type": "single"}},
    )
    """
    Turn font anti-aliasing on or off.
    The default is "yes".

    -aa <string>      : enable font anti-aliasing: yes, no
    """

    vector_anti_aliasing: str | None = dataclasses.field(
        default="yes",
        metadata={"leaf_focus": {"cmd": "-aaVector", "cmd_type": "single"}},
    )
    """
    Turn vector anti-aliasing on or off.
    The default is "yes".

    -aaVector <string>: enable vector anti-aliasing: yes, no
    """

Arguments for xpdf pdftopng program.

XpdfImageArgs( owner_password: str | None = None, user_password: str | None = None, first_page: int | None = None, last_page: int | None = None, use_verbose: bool | None = False, config_file: pathlib.Path | None = None, program_info: bool | None = False, resolution: int | None = 150, use_monochrome: bool | None = False, use_grayscale: bool | None = False, use_alpha_channel: bool | None = False, rotation: int | None = None, free_type: str | None = 'yes', anti_aliasing: str | None = 'yes', vector_anti_aliasing: str | None = 'yes')
resolution: int | None = 150

Specifies the resolution, in DPI. The default is 150 DPI.

-r <number> : resolution, in DPI (default is 150)

use_monochrome: bool | None = False

Generate a monochrome image (instead of a color image).

-mono : generate a monochrome PNG file

use_grayscale: bool | None = False

Generate a grayscale image (instead of a color image).

-gray : generate a grayscale PNG file

use_alpha_channel: bool | None = False

Generate an alpha channel in the PNG file. This is only useful with PDF files that have been constructed with a transparent background. The -alpha flag cannot be used with -mono.

-alpha : include an alpha channel in the PNG file

rotation: int | None = None

Rotate pages by 0 (the default), 90, 180, or 270 degrees.

-rot <int> : set page rotation: 0, 90, 180, or 270

free_type: str | None = 'yes'

Enable or disable FreeType (a TrueType / Type 1 font rasterizer). This defaults to "yes".

-freetype <string> : enable FreeType font rasterizer: yes, no

anti_aliasing: str | None = 'yes'

Enable or disable font anti-aliasing. This defaults to "yes".

-aa <string> : enable font anti-aliasing: yes, no

vector_anti_aliasing: str | None = 'yes'

Enable or disable vector anti-aliasing. This defaults to "yes".

-aaVector <string> : enable vector anti-aliasing: yes, no

@beartype
@dataclasses.dataclass
class XpdfImageResult:
@beartype
@dataclasses.dataclass
class XpdfImageResult:
    """Outcome of running the xpdf pdftopng program."""

    # Directory recorded for the program output
    # (presumably where the PNG files were written; confirm against the caller).
    output_dir: pathlib.Path

    # Paths of the output files associated with this result.
    output_files: typing.Collection[pathlib.Path]

    # Standard output strings; a fresh empty list when not supplied.
    stdout: typing.Collection[str] = dataclasses.field(default_factory=list)

    # Standard error strings; a fresh empty list when not supplied.
    stderr: typing.Collection[str] = dataclasses.field(default_factory=list)

Result for xpdf pdftopng program.

XpdfImageResult( output_dir: pathlib.Path, output_files: Collection[pathlib.Path], stdout: Collection[str] = <factory>, stderr: Collection[str] = <factory>)
output_dir: pathlib.Path
output_files: Collection[pathlib.Path]
stdout: Collection[str]
stderr: Collection[str]