leaf_focus.pdf.model
PDF processing models.
"""PDF processing models."""

from __future__ import annotations

import dataclasses
import logging
import pathlib
import platform

from datetime import datetime

from beartype import beartype, typing


logger = logging.getLogger(__name__)


@beartype
@dataclasses.dataclass
class XpdfArgs:
    """xpdf arguments common to all commands."""

    owner_password: str | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-opw", "cmd_type": "single"}},
        default=None,
    )
    """
    Specify the owner password for the PDF file.
    Providing this will bypass all security restrictions.

    -opw <string>      : owner password (for encrypted files)
    """

    user_password: str | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-upw", "cmd_type": "single"}},
        default=None,
    )
    """
    Specify the user password for the PDF file.

    -upw <string>      : user password (for encrypted files)
    """

    first_page: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-f", "cmd_type": "single"}},
        default=None,
    )
    """
    Specifies the first page to convert.

    -f <int>           : first page to convert
    """

    last_page: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-l", "cmd_type": "single"}},
        default=None,
    )
    """
    Specifies the last page to convert.

    -l <int>           : last page to convert
    """

    use_verbose: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-verbose", "cmd_type": "bool"}},
        default=False,
    )
    """
    Print a status message (to stdout) before processing each page.

    -verbose           : print per-page status information
    """

    config_file: pathlib.Path | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-cfg", "cmd_type": "single"}},
        default=None,
    )
    """
    Read config-file in place of ~/.xpdfrc or the system-wide config file.

    -cfg <string>      : configuration file to use in place of .xpdfrc
    """

    program_info: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-v", "cmd_type": "bool"}},
        default=False,
    )
    """
    Print copyright and version information.

    -v                 : print copyright and version info
    """


@beartype
@dataclasses.dataclass
class XpdfInfoArgs(XpdfArgs):
    """Arguments for xpdf pdfinfo program."""

    include_page_bounding_boxes: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-box", "cmd_type": "bool"}},
        default=False,
    )
    """
    Prints the page box bounding boxes:
    MediaBox, CropBox, BleedBox, TrimBox, and ArtBox.

    -box               : print the page bounding boxes
    """

    include_metadata: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-meta", "cmd_type": "bool"}},
        default=False,
    )
    """
    Prints document-level metadata.
    This is the "Metadata" stream from the PDF file's Catalog object.

    -meta              : print the document metadata (XML)
    """

    include_raw_dates: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-rawdates", "cmd_type": "bool"}},
        default=False,
    )
    """
    Prints the raw (undecoded) date strings, directly from the PDF file.

    -rawdates          : print the undecoded date strings directly from the PDF file
    """

    encoding: str | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-enc", "cmd_type": "single"}},
        default="Latin1",
    )
    """
    Sets the encoding to use for text output.
    The encoding-name must be defined with the unicodeMap command.
    This defaults to "Latin1" (which is a built-in encoding).

    -enc <string>      : output text encoding name
    """


@beartype
@dataclasses.dataclass
class XpdfInfoResult:
    """Result from xpdf pdfinfo program."""

    # pdf info
    title: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Title"}},
    )
    subject: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Subject"}},
    )
    keywords: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Keywords"}},
    )
    author: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Author"}},
    )
    creator: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Creator"}},
    )
    producer: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Producer"}},
    )
    creation_date: datetime | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "CreationDate"}},
    )
    modification_date: datetime | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "ModDate"}},
    )

    # additional info
    tagged: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Tagged"}},
    )
    form: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Form"}},
    )
    pages: int | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Pages"}},
    )
    encrypted: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Encrypted"}},
    )
    page_size: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Page size"}},
    )
    media_box: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "MediaBox"}},
    )
    crop_box: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "CropBox"}},
    )
    bleed_box: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "BleedBox"}},
    )
    trim_box: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "TrimBox"}},
    )
    art_box: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "ArtBox"}},
    )
    file_size_bytes: int | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "File size"}},
    )
    optimized: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Optimized"}},
    )
    pdf_version: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "PDF version"}},
    )
    javascript: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "JavaScript"}},
    )

    # xml metadata
    metadata: dict[str, typing.Any] | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Metadata"}},
    )


@beartype
@dataclasses.dataclass
class XpdfTextArgs(XpdfArgs):
    """Arguments for xpdf pdftotext program."""

    use_original_layout: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-layout", "cmd_type": "bool"}},
        default=False,
    )
    """
    Maintain (as best as possible) the original physical layout of the text.

    -layout            : maintain original physical layout
    """

    use_simple_layout: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-simple", "cmd_type": "bool"}},
        default=False,
    )
    """
    Similar to -layout, but optimized for simple one-column pages.
    This mode will do a better job of maintaining horizontal spacing,
    but it will only work properly with a single column of text.

    -simple            : simple one-column page layout
    """

    use_simple2_layout: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-simple2", "cmd_type": "bool"}},
        default=False,
    )
    """
    Similar to -simple, but handles slightly rotated text
    (e.g., OCR output) better.
    Only works for pages with a single column of text.

    -simple2           : simple one-column page layout, version 2
    """

    use_table_layout: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-table", "cmd_type": "bool"}},
        default=False,
    )
    """
    Table mode is similar to physical layout mode, but optimized for tabular data,
    with the goal of keeping rows and columns aligned
    (at the expense of inserting extra whitespace).
    If the -fixed option is given, character spacing within
    each line will be determined by the specified character pitch.

    -table             : similar to -layout, but optimized for tables
    """

    use_line_printer: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-lineprinter", "cmd_type": "bool"}},
        default=False,
    )
    """
    Line printer mode uses a strict fixed-character-pitch and -height layout.
    That is, the page is broken into a grid, and characters are placed into that grid.
    If the grid spacing is too small for the actual characters,
    the result is extra whitespace.
    If the grid spacing is too large, the result is missing whitespace.
    The grid spacing can be specified using the -fixed and -linespacing options.
    If one or both are not given on the command line,
    pdftotext will attempt to compute appropriate value(s).

    -lineprinter       : use strict fixed-pitch/height layout
    """

    use_raw_string_order: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-raw", "cmd_type": "bool"}},
        default=False,
    )
    """
    Keep the text in content stream order.
    Depending on how the PDF file was generated, this may or may not be useful.

    -raw               : keep strings in content stream order
    """

    use_text_clip: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-clip", "cmd_type": "bool"}},
        default=False,
    )
    """
    Text which is hidden because of clipping is removed before doing layout,
    and then added back in. This can be helpful for tables where
    clipped (invisible) text would overlap the next column.

    -clip              : separate clipped text
    """

    use_no_diag: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-nodiag", "cmd_type": "bool"}},
        default=False,
    )
    """
    Diagonal text, i.e., text that is not close to one of
    the 0, 90, 180, or 270 degree axes, is discarded.
    This is useful to skip watermarks drawn on top of body text, etc.

    -nodiag            : discard diagonal text
    """

    use_no_page_break: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-nopgbrk", "cmd_type": "bool"}},
        default=False,
    )
    """
    Don't insert a page break (form feed character) at the
    end of each page.

    -nopgbrk           : don't insert a page break at the end of each page
    """

    # The flag was previously spelled "-nom", which does not match the
    # documented "-bom" option below.
    use_bom: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-bom", "cmd_type": "bool"}},
        default=False,
    )
    """
    Insert a Unicode byte order marker (BOM) at the start of the text output.

    -bom               : insert a Unicode BOM at the start of the text file
    """

    use_verbose: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-verbose", "cmd_type": "bool"}},
        default=False,
    )
    """
    Print a status message (to stdout) before processing each page.

    -verbose           : print per-page status information
    """

    fixed_text_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-fixed", "cmd_type": "single"}},
        default=None,
    )
    """
    Specify the character pitch (character width), in points,
    for physical layout, table, or line printer mode.
    This is ignored in all other modes.

    -fixed <number>    : assume fixed-pitch (or tabular) text
    """

    line_space_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-linespacing", "cmd_type": "single"}},
        default=None,
    )
    """
    Specify the line spacing, in points, for line printer mode.
    This is ignored in all other modes.

    -linespacing <number> : fixed line spacing for LinePrinter mode
    """

    line_end_type: str | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-eol", "cmd_type": "single"}},
        default=None,
    )
    """
    Sets the end-of-line convention to use for text output.

    -eol <string>      : output end-of-line convention (unix, dos, or mac)
    """

    margin_left_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-marginl", "cmd_type": "single"}},
        default=0,
    )
    """
    Specifies the left margin, in points.
    Text in the left margin
    (i.e., within that many points of the left edge of the page) is discarded.
    The default value is zero.

    -marginl <number>  : left page margin
    """

    margin_right_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-marginr", "cmd_type": "single"}},
        default=0,
    )
    """
    Specifies the right margin, in points.
    Text in the right margin (i.e., within that many points of the
    right edge of the page) is discarded.
    The default value is zero.

    -marginr <number>  : right page margin
    """

    # NOTE(review): the name lacks the underscore used by the sibling
    # margin_*_number fields; kept as-is so existing callers keep working.
    margin_topnumber: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-margint", "cmd_type": "single"}},
        default=0,
    )
    """
    Specifies the top margin, in points.
    Text in the top margin (i.e., within that many points of the top
    edge of the page) is discarded.
    The default value is zero.

    -margint <number>  : top page margin
    """

    margin_bottom_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-marginb", "cmd_type": "single"}},
        default=0,
    )
    """
    Specifies the bottom margin, in points.
    Text in the bottom margin (i.e., within that many points of the
    bottom edge of the page) is discarded.
    The default value is zero.

    -marginb <number>  : bottom page margin
    """

    @classmethod
    def get_line_ending(cls) -> str:
        """Get the line endings based on the current platform.

        Returns:
            The line ending style: "unix" on Linux, "mac" on Darwin,
            "dos" on Windows, and "unix" for any other value reported by
            ``platform.system()``.
        """
        opts = {
            "Linux": "unix",
            "Darwin": "mac",
            "Windows": "dos",
        }
        # platform.system() can report other names (or an empty string when
        # the system cannot be determined); fall back to "unix" instead of
        # failing with KeyError.
        return opts.get(platform.system(), "unix")


@beartype
@dataclasses.dataclass
class XpdfTextResult:
    """Result for xpdf pdftotext program."""

    output_path: pathlib.Path
    stdout: typing.Collection[str] = dataclasses.field(default_factory=list)
    stderr: typing.Collection[str] = dataclasses.field(default_factory=list)


@beartype
@dataclasses.dataclass
class XpdfImageArgs(XpdfArgs):
    """Arguments for xpdf pdftopng program."""

    resolution: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-r", "cmd_type": "single"}},
        default=150,
    )
    """
    Specifies the resolution, in DPI. The default is 150 DPI.

    -r <number>        : resolution, in DPI (default is 150)
    """

    use_monochrome: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-mono", "cmd_type": "bool"}},
        default=False,
    )
    """
    Generate a monochrome image (instead of a color image).

    -mono              : generate a monochrome PNG file
    """

    use_grayscale: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-gray", "cmd_type": "bool"}},
        default=False,
    )
    """
    Generate a grayscale image (instead of a color image).

    -gray              : generate a grayscale PNG file
    """

    use_alpha_channel: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-alpha", "cmd_type": "bool"}},
        default=False,
    )
    """
    Generate an alpha channel in the PNG file.
    This is only useful with PDF files that have been constructed
    with a transparent background.
    The -alpha flag cannot be used with -mono.

    -alpha             : include an alpha channel in the PNG file
    """

    rotation: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-rot", "cmd_type": "single"}},
        default=None,
    )
    """
    Rotate pages by 0 (the default), 90, 180, or 270 degrees.

    -rot <int>         : set page rotation: 0, 90, 180, or 270
    """

    free_type: str | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-freetype", "cmd_type": "single"}},
        default="yes",
    )
    """
    Enable or disable FreeType (a TrueType / Type 1 font rasterizer).
    This defaults to "yes".

    -freetype <string> : enable FreeType font rasterizer: yes, no
    """

    anti_aliasing: str | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-aa", "cmd_type": "single"}},
        default="yes",
    )
    """
    Enable or disable font anti-aliasing.
    This defaults to "yes".

    -aa <string>       : enable font anti-aliasing: yes, no
    """

    vector_anti_aliasing: str | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-aaVector", "cmd_type": "single"}},
        default="yes",
    )
    """
    Enable or disable vector anti-aliasing.
    This defaults to "yes".

    -aaVector <string> : enable vector anti-aliasing: yes, no
    """


@beartype
@dataclasses.dataclass
class XpdfImageResult:
    """Result for xpdf pdftopng program."""

    output_dir: pathlib.Path
    output_files: typing.Collection[pathlib.Path]
    stdout: typing.Collection[str] = dataclasses.field(default_factory=list)
    stderr: typing.Collection[str] = dataclasses.field(default_factory=list)
@beartype
@dataclasses.dataclass
class XpdfArgs:
    """Options shared by every xpdf command-line program."""

    owner_password: str | None = dataclasses.field(
        default=None,
        metadata={"leaf_focus": {"cmd": "-opw", "cmd_type": "single"}},
    )
    """
    Owner password of the PDF file.
    Supplying it bypasses all security restrictions.

    -opw <string>      : owner password (for encrypted files)
    """

    user_password: str | None = dataclasses.field(
        default=None,
        metadata={"leaf_focus": {"cmd": "-upw", "cmd_type": "single"}},
    )
    """
    User password of the PDF file.

    -upw <string>      : user password (for encrypted files)
    """

    first_page: int | None = dataclasses.field(
        default=None,
        metadata={"leaf_focus": {"cmd": "-f", "cmd_type": "single"}},
    )
    """
    First page to convert.

    -f <int>           : first page to convert
    """

    last_page: int | None = dataclasses.field(
        default=None,
        metadata={"leaf_focus": {"cmd": "-l", "cmd_type": "single"}},
    )
    """
    Last page to convert.

    -l <int>           : last page to convert
    """

    use_verbose: bool | None = dataclasses.field(
        default=False,
        metadata={"leaf_focus": {"cmd": "-verbose", "cmd_type": "bool"}},
    )
    """
    Write a status message to stdout before each page is processed.

    -verbose           : print per-page status information
    """

    config_file: pathlib.Path | None = dataclasses.field(
        default=None,
        metadata={"leaf_focus": {"cmd": "-cfg", "cmd_type": "single"}},
    )
    """
    Configuration file to read instead of ~/.xpdfrc
    or the system-wide config file.

    -cfg <string>      : configuration file to use in place of .xpdfrc
    """

    program_info: bool | None = dataclasses.field(
        default=False,
        metadata={"leaf_focus": {"cmd": "-v", "cmd_type": "bool"}},
    )
    """
    Show copyright and version information.

    -v                 : print copyright and version info
    """
xpdf arguments common to all commands.
Specify the owner password for the PDF file. Providing this will bypass all security restrictions.
-opw <string> : owner password (for encrypted files)
Specify the user password for the PDF file.
-upw <string> : user password (for encrypted files)
Print a status message (to stdout) before processing each page.
-verbose : print per-page status information
@beartype
@dataclasses.dataclass
class XpdfInfoArgs(XpdfArgs):
    """Options for the xpdf pdfinfo program."""

    include_page_bounding_boxes: bool | None = dataclasses.field(
        default=False,
        metadata={"leaf_focus": {"cmd": "-box", "cmd_type": "bool"}},
    )
    """
    Print the page bounding boxes:
    MediaBox, CropBox, BleedBox, TrimBox, and ArtBox.

    -box               : print the page bounding boxes
    """

    include_metadata: bool | None = dataclasses.field(
        default=False,
        metadata={"leaf_focus": {"cmd": "-meta", "cmd_type": "bool"}},
    )
    """
    Print document-level metadata,
    i.e. the "Metadata" stream from the PDF file's Catalog object.

    -meta              : print the document metadata (XML)
    """

    include_raw_dates: bool | None = dataclasses.field(
        default=False,
        metadata={"leaf_focus": {"cmd": "-rawdates", "cmd_type": "bool"}},
    )
    """
    Print the date strings exactly as stored in the PDF file (undecoded).

    -rawdates          : print the undecoded date strings directly from the PDF file
    """

    encoding: str | None = dataclasses.field(
        default="Latin1",
        metadata={"leaf_focus": {"cmd": "-enc", "cmd_type": "single"}},
    )
    """
    Encoding used for text output.
    The encoding name must be defined with the unicodeMap command.
    The default is "Latin1", which is a built-in encoding.

    -enc <string>      : output text encoding name
    """
Arguments for xpdf pdfinfo program.
Prints the page box bounding boxes: MediaBox, CropBox, BleedBox, TrimBox, and ArtBox.
-box : print the page bounding boxes
Prints document-level metadata. This is the "Metadata" stream from the PDF file's Catalog object.
-meta : print the document metadata (XML)
Prints the raw (undecoded) date strings, directly from the PDF file.
-rawdates : print the undecoded date strings directly from the PDF file
Sets the encoding to use for text output. The encoding-name must be defined with the unicodeMap command. This defaults to "Latin1" (which is a built-in encoding).
-enc <string> : output text encoding name
Inherited Members
@beartype
@dataclasses.dataclass
class XpdfInfoResult:
    """Values reported by the xpdf pdfinfo program.

    No field declares a default, so every one of them must be supplied
    (possibly as ``None``) when an instance is built.
    """

    # --- document information ---
    title: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Title"}}
    )
    subject: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Subject"}}
    )
    keywords: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Keywords"}}
    )
    author: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Author"}}
    )
    creator: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Creator"}}
    )
    producer: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Producer"}}
    )
    creation_date: datetime | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "CreationDate"}}
    )
    modification_date: datetime | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "ModDate"}}
    )

    # --- file and page details ---
    tagged: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Tagged"}}
    )
    form: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Form"}}
    )
    pages: int | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Pages"}}
    )
    encrypted: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Encrypted"}}
    )
    page_size: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Page size"}}
    )
    media_box: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "MediaBox"}}
    )
    crop_box: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "CropBox"}}
    )
    bleed_box: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "BleedBox"}}
    )
    trim_box: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "TrimBox"}}
    )
    art_box: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "ArtBox"}}
    )
    file_size_bytes: int | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "File size"}}
    )
    optimized: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Optimized"}}
    )
    pdf_version: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "PDF version"}}
    )
    javascript: str | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "JavaScript"}}
    )

    # --- XML metadata ---
    metadata: dict[str, typing.Any] | None = dataclasses.field(
        metadata={"leaf_focus": {"name": "Metadata"}}
    )
Result from xpdf pdfinfo program.
@beartype
@dataclasses.dataclass
class XpdfTextArgs(XpdfArgs):
    """Arguments for xpdf pdftotext program."""

    use_original_layout: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-layout", "cmd_type": "bool"}},
        default=False,
    )
    """
    Maintain (as best as possible) the original physical layout of the text.

    -layout            : maintain original physical layout
    """

    use_simple_layout: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-simple", "cmd_type": "bool"}},
        default=False,
    )
    """
    Similar to -layout, but optimized for simple one-column pages.
    This mode will do a better job of maintaining horizontal spacing,
    but it will only work properly with a single column of text.

    -simple            : simple one-column page layout
    """

    use_simple2_layout: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-simple2", "cmd_type": "bool"}},
        default=False,
    )
    """
    Similar to -simple, but handles slightly rotated text
    (e.g., OCR output) better.
    Only works for pages with a single column of text.

    -simple2           : simple one-column page layout, version 2
    """

    use_table_layout: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-table", "cmd_type": "bool"}},
        default=False,
    )
    """
    Table mode is similar to physical layout mode, but optimized for tabular data,
    with the goal of keeping rows and columns aligned
    (at the expense of inserting extra whitespace).
    If the -fixed option is given, character spacing within
    each line will be determined by the specified character pitch.

    -table             : similar to -layout, but optimized for tables
    """

    use_line_printer: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-lineprinter", "cmd_type": "bool"}},
        default=False,
    )
    """
    Line printer mode uses a strict fixed-character-pitch and -height layout.
    That is, the page is broken into a grid, and characters are placed into that grid.
    If the grid spacing is too small for the actual characters,
    the result is extra whitespace.
    If the grid spacing is too large, the result is missing whitespace.
    The grid spacing can be specified using the -fixed and -linespacing options.
    If one or both are not given on the command line,
    pdftotext will attempt to compute appropriate value(s).

    -lineprinter       : use strict fixed-pitch/height layout
    """

    use_raw_string_order: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-raw", "cmd_type": "bool"}},
        default=False,
    )
    """
    Keep the text in content stream order.
    Depending on how the PDF file was generated, this may or may not be useful.

    -raw               : keep strings in content stream order
    """

    use_text_clip: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-clip", "cmd_type": "bool"}},
        default=False,
    )
    """
    Text which is hidden because of clipping is removed before doing layout,
    and then added back in. This can be helpful for tables where
    clipped (invisible) text would overlap the next column.

    -clip              : separate clipped text
    """

    use_no_diag: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-nodiag", "cmd_type": "bool"}},
        default=False,
    )
    """
    Diagonal text, i.e., text that is not close to one of
    the 0, 90, 180, or 270 degree axes, is discarded.
    This is useful to skip watermarks drawn on top of body text, etc.

    -nodiag            : discard diagonal text
    """

    use_no_page_break: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-nopgbrk", "cmd_type": "bool"}},
        default=False,
    )
    """
    Don't insert a page break (form feed character) at the
    end of each page.

    -nopgbrk           : don't insert a page break at the end of each page
    """

    # The flag was previously spelled "-nom", which does not match the
    # documented "-bom" option below.
    use_bom: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-bom", "cmd_type": "bool"}},
        default=False,
    )
    """
    Insert a Unicode byte order marker (BOM) at the start of the text output.

    -bom               : insert a Unicode BOM at the start of the text file
    """

    use_verbose: bool | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-verbose", "cmd_type": "bool"}},
        default=False,
    )
    """
    Print a status message (to stdout) before processing each page.

    -verbose           : print per-page status information
    """

    fixed_text_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-fixed", "cmd_type": "single"}},
        default=None,
    )
    """
    Specify the character pitch (character width), in points,
    for physical layout, table, or line printer mode.
    This is ignored in all other modes.

    -fixed <number>    : assume fixed-pitch (or tabular) text
    """

    line_space_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-linespacing", "cmd_type": "single"}},
        default=None,
    )
    """
    Specify the line spacing, in points, for line printer mode.
    This is ignored in all other modes.

    -linespacing <number> : fixed line spacing for LinePrinter mode
    """

    line_end_type: str | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-eol", "cmd_type": "single"}},
        default=None,
    )
    """
    Sets the end-of-line convention to use for text output.

    -eol <string>      : output end-of-line convention (unix, dos, or mac)
    """

    margin_left_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-marginl", "cmd_type": "single"}},
        default=0,
    )
    """
    Specifies the left margin, in points.
    Text in the left margin
    (i.e., within that many points of the left edge of the page) is discarded.
    The default value is zero.

    -marginl <number>  : left page margin
    """

    margin_right_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-marginr", "cmd_type": "single"}},
        default=0,
    )
    """
    Specifies the right margin, in points.
    Text in the right margin (i.e., within that many points of the
    right edge of the page) is discarded.
    The default value is zero.

    -marginr <number>  : right page margin
    """

    # NOTE(review): the name lacks the underscore used by the sibling
    # margin_*_number fields; kept as-is so existing callers keep working.
    margin_topnumber: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-margint", "cmd_type": "single"}},
        default=0,
    )
    """
    Specifies the top margin, in points.
    Text in the top margin (i.e., within that many points of the top
    edge of the page) is discarded.
    The default value is zero.

    -margint <number>  : top page margin
    """

    margin_bottom_number: int | None = dataclasses.field(
        metadata={"leaf_focus": {"cmd": "-marginb", "cmd_type": "single"}},
        default=0,
    )
    """
    Specifies the bottom margin, in points.
    Text in the bottom margin (i.e., within that many points of the
    bottom edge of the page) is discarded.
    The default value is zero.

    -marginb <number>  : bottom page margin
    """

    @classmethod
    def get_line_ending(cls) -> str:
        """Get the line endings based on the current platform.

        Returns:
            The line ending style: "unix" on Linux, "mac" on Darwin,
            "dos" on Windows, and "unix" for any other value reported by
            ``platform.system()``.
        """
        opts = {
            "Linux": "unix",
            "Darwin": "mac",
            "Windows": "dos",
        }
        # platform.system() can report other names (or an empty string when
        # the system cannot be determined); fall back to "unix" instead of
        # failing with KeyError.
        return opts.get(platform.system(), "unix")
Arguments for xpdf pdftotext program.
Maintain (as best as possible) the original physical layout of the text.
-layout : maintain original physical layout
Similar to -layout, but optimized for simple one-column pages. This mode will do a better job of maintaining horizontal spacing, but it will only work properly with a single column of text.
-simple : simple one-column page layout
Similar to -simple, but handles slightly rotated text (e.g., OCR output) better. Only works for pages with a single column of text.
-simple2 : simple one-column page layout, version 2
Table mode is similar to physical layout mode, but optimized for tabular data, with the goal of keeping rows and columns aligned (at the expense of inserting extra whitespace). If the -fixed option is given, character spacing within each line will be determined by the specified character pitch.
-table : similar to -layout, but optimized for tables
Line printer mode uses a strict fixed-character-pitch and -height layout. That is, the page is broken into a grid, and characters are placed into that grid. If the grid spacing is too small for the actual characters, the result is extra whitespace. If the grid spacing is too large, the result is missing whitespace. The grid spacing can be specified using the -fixed and -linespacing options. If one or both are not given on the command line, pdftotext will attempt to compute appropriate value(s).
-lineprinter : use strict fixed-pitch/height layout
Keep the text in content stream order. Depending on how the PDF file was generated, this may or may not be useful.
-raw : keep strings in content stream order
Text which is hidden because of clipping is removed before doing layout, and then added back in. This can be helpful for tables where clipped (invisible) text would overlap the next column.
-clip : separate clipped text
Diagonal text, i.e., text that is not close to one of the 0, 90, 180, or 270 degree axes, is discarded. This is useful to skip watermarks drawn on top of body text, etc.
-nodiag : discard diagonal text
Don't insert a page break (form feed character) at the end of each page.
-nopgbrk : don't insert a page break at the end of each page
Insert a Unicode byte order marker (BOM) at the start of the text output.
-bom : insert a Unicode BOM at the start of the text file
Print a status message (to stdout) before processing each page.
-verbose : print per-page status information
Specify the character pitch (character width), in points, for physical layout, table, or line printer mode. This is ignored in all other modes.
-fixed <number> : assume fixed-pitch (or tabular) text
Specify the line spacing, in points, for line printer mode. This is ignored in all other modes.
-linespacing <number> : fixed line spacing for LinePrinter mode
Sets the end-of-line convention to use for text output.
-eol <string> : output end-of-line convention (unix, dos, or mac)
Specifies the left margin, in points. Text in the left margin (i.e., within that many points of the left edge of the page) is discarded. The default value is zero.
-marginl <number> : left page margin
Specifies the right margin, in points. Text in the right margin (i.e., within that many points of the right edge of the page) is discarded. The default value is zero.
-marginr <number> : right page margin
Specifies the top margin, in points. Text in the top margin (i.e., within that many points of the top edge of the page) is discarded. The default value is zero.
-margint <number> : top page margin
Specifies the bottom margin, in points. Text in the bottom margin (i.e., within that many points of the bottom edge of the page) is discarded. The default value is zero.
-marginb <number> : bottom page margin
@classmethod
def get_line_ending(cls) -> str:
    """Get the line endings based on the current platform.

    The platform name reported by ``platform.system()`` is mapped to one of
    the end-of-line convention names ("unix", "mac" or "dos").

    Platforms without an explicit entry fall back to "unix" instead of
    raising ``KeyError``; ``platform.system()`` can report names other than
    the three listed here (for example "FreeBSD" or "Java"), or an empty
    string when the system cannot be determined.

    Returns:
        The line ending style: "unix", "mac", or "dos".
    """
    opts = {
        "Linux": "unix",
        "Darwin": "mac",
        "Windows": "dos",
    }
    plat = platform.system()

    # Treat any unlisted platform as unix-like rather than failing the lookup.
    return opts.get(plat, "unix")
Get the line endings based on the current platform.
Returns:
The line ending style.
Inherited Members
@beartype
@dataclasses.dataclass
class XpdfTextResult:
    """Outcome of a run of the xpdf pdftotext program."""

    output_path: pathlib.Path
    """
    Path recorded for the output of the run (required).
    Presumably the text file written by pdftotext; confirm against the caller.
    """

    stdout: typing.Collection[str] = dataclasses.field(default_factory=list)
    """
    Collection of strings for standard output.
    Each instance gets its own new empty list when no value is given.
    """

    stderr: typing.Collection[str] = dataclasses.field(default_factory=list)
    """
    Collection of strings for standard error.
    Each instance gets its own new empty list when no value is given.
    """
Result for xpdf pdftotext program.
@beartype
@dataclasses.dataclass
class XpdfImageArgs(XpdfArgs):
    """Options for the xpdf pdftopng program, in addition to the common ones."""

    resolution: int | None = dataclasses.field(
        default=150,
        metadata={"leaf_focus": {"cmd": "-r", "cmd_type": "single"}},
    )
    """
    Image resolution in DPI. Set to 150 DPI unless overridden.

    -r <number> : resolution, in DPI (default is 150)
    """

    use_monochrome: bool | None = dataclasses.field(
        default=False,
        metadata={"leaf_focus": {"cmd": "-mono", "cmd_type": "bool"}},
    )
    """
    Produce a monochrome image rather than a color one.

    -mono : generate a monochrome PNG file
    """

    use_grayscale: bool | None = dataclasses.field(
        default=False,
        metadata={"leaf_focus": {"cmd": "-gray", "cmd_type": "bool"}},
    )
    """
    Produce a grayscale image rather than a color one.

    -gray : generate a grayscale PNG file
    """

    use_alpha_channel: bool | None = dataclasses.field(
        default=False,
        metadata={"leaf_focus": {"cmd": "-alpha", "cmd_type": "bool"}},
    )
    """
    Add an alpha channel to the PNG file.
    Only useful for PDF files that were built with a transparent background.
    The -alpha flag cannot be combined with -mono.

    -alpha : include an alpha channel in the PNG file
    """

    rotation: int | None = dataclasses.field(
        default=None,
        metadata={"leaf_focus": {"cmd": "-rot", "cmd_type": "single"}},
    )
    """
    Page rotation in degrees: 0 (the default), 90, 180, or 270.

    -rot <int> : set page rotation: 0, 90, 180, or 270
    """

    free_type: str | None = dataclasses.field(
        default="yes",
        metadata={"leaf_focus": {"cmd": "-freetype", "cmd_type": "single"}},
    )
    """
    Turn FreeType (a TrueType / Type 1 font rasterizer) on or off.
    Set to "yes" unless overridden.

    -freetype <string>: enable FreeType font rasterizer: yes, no
    """

    anti_aliasing: str | None = dataclasses.field(
        default="yes",
        metadata={"leaf_focus": {"cmd": "-aa", "cmd_type": "single"}},
    )
    """
    Turn font anti-aliasing on or off.
    Set to "yes" unless overridden.

    -aa <string> : enable font anti-aliasing: yes, no
    """

    vector_anti_aliasing: str | None = dataclasses.field(
        default="yes",
        metadata={"leaf_focus": {"cmd": "-aaVector", "cmd_type": "single"}},
    )
    """
    Turn vector anti-aliasing on or off.
    Set to "yes" unless overridden.

    -aaVector <string>: enable vector anti-aliasing: yes, no
    """
Arguments for xpdf pdftopng program.
Specifies the resolution, in DPI. The default is 150 DPI.
-r <number> : resolution, in DPI (default is 150)
Generate a monochrome image (instead of a color image).
-mono : generate a monochrome PNG file
Generate a grayscale image (instead of a color image).
-gray : generate a grayscale PNG file
Generate an alpha channel in the PNG file. This is only useful with PDF files that have been constructed with a transparent background. The -alpha flag cannot be used with -mono.
-alpha : include an alpha channel in the PNG file
Rotate pages by 0 (the default), 90, 180, or 270 degrees.
-rot <int> : set page rotation: 0, 90, 180, or 270
Enable or disable FreeType (a TrueType / Type 1 font rasterizer). This defaults to "yes".
-freetype <string>: enable FreeType font rasterizer: yes, no
Enable or disable font anti-aliasing. This defaults to "yes".
-aa <string> : enable font anti-aliasing: yes, no
Enable or disable vector anti-aliasing. This defaults to "yes".
-aaVector <string>: enable vector anti-aliasing: yes, no
Inherited Members
@beartype
@dataclasses.dataclass
class XpdfImageResult:
    """Outcome of a run of the xpdf pdftopng program."""

    output_dir: pathlib.Path
    """
    Directory path recorded for the run (required).
    Presumably where the PNG files were written; confirm against the caller.
    """

    output_files: typing.Collection[pathlib.Path]
    """
    Collection of file paths recorded for the run (required).
    Presumably the generated PNG files; confirm against the caller.
    """

    stdout: typing.Collection[str] = dataclasses.field(default_factory=list)
    """
    Collection of strings for standard output.
    Each instance gets its own new empty list when no value is given.
    """

    stderr: typing.Collection[str] = dataclasses.field(default_factory=list)
    """
    Collection of strings for standard error.
    Each instance gets its own new empty list when no value is given.
    """
Result for xpdf pdftopng program.