leaf_focus.ocr.model
Models for OCR processing.
1"""Models for OCR processing.""" 2 3from __future__ import annotations 4 5import csv 6import dataclasses 7import logging 8import math 9import pathlib 10 11import numpy as np 12 13from beartype import beartype, typing 14 15 16logger = logging.getLogger(__name__) 17 18 19@beartype 20@dataclasses.dataclass 21class TextItem: 22 """One found text item (could be a word or phrase) in an image.""" 23 24 text: str 25 """The recognised text.""" 26 27 top_left_x: float | np.float32 28 top_left_y: float | np.float32 29 30 top_right_x: float | np.float32 31 top_right_y: float | np.float32 32 33 bottom_right_x: float | np.float32 34 bottom_right_y: float | np.float32 35 36 bottom_left_x: float | np.float32 37 bottom_left_y: float | np.float32 38 39 line_number: int | None = None 40 line_order: int | None = None 41 42 @property 43 def top_left(self) -> tuple[float | np.float32, float | np.float32]: 44 """Get the top left point. 45 46 Returns: 47 The x and y coordinates. 48 """ 49 return self.top_left_x, self.top_left_y 50 51 @property 52 def top_right(self) -> tuple[float | np.float32, float | np.float32]: 53 """Get the top right point. 54 55 Returns: 56 The x and y coordinates. 57 """ 58 return self.top_right_x, self.top_right_y 59 60 @property 61 def bottom_right(self) -> tuple[float | np.float32, float | np.float32]: 62 """Get the bottom right point. 63 64 Returns: 65 The x and y coordinates. 66 """ 67 return self.bottom_right_x, self.bottom_right_y 68 69 @property 70 def bottom_left(self) -> tuple[float | np.float32, float | np.float32]: 71 """Get the bottom left point. 72 73 Returns: 74 The x and y coordinates. 75 """ 76 return self.bottom_left_x, self.bottom_left_y 77 78 @property 79 def top_length(self) -> float | np.float32: 80 """Get the length of the top side. 81 82 Returns: 83 float: The length. 84 """ 85 # Get the length of the hypotenuse side. 
86 side1 = abs(float(self.top_right_x) - float(self.top_left_x)) 87 side2 = abs(float(self.top_right_y) - float(self.top_left_y)) 88 if side2 == 0: 89 return side1 90 return math.sqrt(pow(side1, 2) + pow(side2, 2)) 91 92 @property 93 def left_length(self) -> float | np.float32: 94 """Get the length of the left side. 95 96 Returns: 97 float: The length. 98 """ 99 # Get the length of the hypotenuse side. 100 side1 = abs(float(self.top_left_y) - float(self.bottom_left_y)) 101 side2 = abs(float(self.top_left_x) - float(self.bottom_left_x)) 102 if side2 == 0: 103 return side1 104 return math.sqrt(pow(side1, 2) + pow(side2, 2)) 105 106 @property 107 def line_bounds(self) -> tuple[float | np.float32, float | np.float32]: 108 """Line bounds from top of text to bottom of text.""" 109 top_bound = min( 110 [ 111 float(self.top_left_y), 112 float(self.top_right_y), 113 float(self.bottom_left_y), 114 float(self.bottom_right_y), 115 ], 116 ) 117 bottom_bound = max( 118 [ 119 float(self.top_left_y), 120 float(self.top_right_y), 121 float(self.bottom_left_y), 122 float(self.bottom_right_y), 123 ], 124 ) 125 return top_bound, bottom_bound 126 127 def is_same_line(self, other: TextItem) -> bool: 128 """Check if the vertical midpoints of this item and another item overlap. 129 130 Calculated as the midpoint +- 1/3 of the height of the text. 131 132 Args: 133 other (TextItem): The text item to compare. 134 135 Returns: 136 bool: True if this item and the other item overlap, otherwise false. 
137 """ 138 if not other: 139 return False 140 self_bounds = self.line_bounds 141 self_top = self_bounds[0] 142 self_bottom = self_bounds[1] 143 self_third = (self_bottom - self_top) / 3 144 self_top += self_third 145 self_bottom -= self_third 146 147 other_bounds = other.line_bounds 148 other_top = other_bounds[0] 149 other_bottom = other_bounds[1] 150 other_third = (other_bottom - other_top) / 3 151 other_top += other_third 152 other_bottom -= other_third 153 154 return bool(self_top <= other_bottom and other_top <= self_bottom) 155 156 @property 157 def slope_top_left_right(self) -> float | np.float32: 158 """Get the top slope from the left to the right. 159 160 Returns: 161 float: The slope. 162 """ 163 return self._slope( 164 self.top_left_x, 165 self.top_left_y, 166 self.top_right_x, 167 self.top_right_y, 168 ) 169 170 @property 171 def slope_top_right_left(self) -> float | np.float32: 172 """Get the top slope from the right to the left. 173 174 Returns: 175 float: The slope. 176 """ 177 return self._slope( 178 self.top_right_x, 179 self.top_right_y, 180 self.top_left_x, 181 self.top_left_y, 182 ) 183 184 @property 185 def slope_left_top_bottom(self) -> float | np.float32: 186 """Get the left slope from the top to the bottom. 187 188 Returns: 189 float: The slope. 190 """ 191 return self._slope( 192 self.top_left_x, 193 self.top_left_y, 194 self.bottom_left_x, 195 self.bottom_left_y, 196 ) 197 198 @property 199 def slope_left_bottom_top(self) -> float | np.float32: 200 """Get the left slope from the bottom to the top. 201 202 Returns: 203 float: The slope. 204 """ 205 return self._slope( 206 self.bottom_left_x, 207 self.bottom_left_y, 208 self.top_left_x, 209 self.top_left_y, 210 ) 211 212 @property 213 def slope_bottom_left_right(self) -> float | np.float32: 214 """Get the bottom slope from the left to the right. 215 216 Returns: 217 float: The slope. 
218 """ 219 return self._slope( 220 self.bottom_left_x, 221 self.bottom_left_y, 222 self.bottom_right_x, 223 self.bottom_right_y, 224 ) 225 226 @property 227 def slope_bottom_right_left(self) -> float | np.float32: 228 """Get the bottom slope from the right to the left. 229 230 Returns: 231 float: The slope. 232 """ 233 return self._slope( 234 self.bottom_right_x, 235 self.bottom_right_y, 236 self.bottom_left_x, 237 self.bottom_left_y, 238 ) 239 240 @property 241 def slope_right_top_bottom(self) -> float | np.float32: 242 """Get the right slope from the top to the bottom. 243 244 Returns: 245 float: The slope. 246 """ 247 return self._slope( 248 self.top_right_x, 249 self.top_right_y, 250 self.bottom_right_x, 251 self.bottom_right_y, 252 ) 253 254 @property 255 def slope_right_bottom_top(self) -> float | np.float32: 256 """Get the right slope from the bottom to the top. 257 258 Returns: 259 float: The slope. 260 """ 261 return self._slope( 262 self.bottom_right_x, 263 self.bottom_right_y, 264 self.top_right_x, 265 self.top_right_y, 266 ) 267 268 @property 269 def is_horizontal_level(self) -> bool: 270 """Check whether the left-to-right slope is approximately horizontal. 271 272 Returns: 273 bool: True if the item is approximately horizontal. 274 """ 275 # -0.1 -> 0.1 is strictly horizontal 276 # give a bit of buffer 277 buffer = 0.09 278 return bool(-buffer <= self.slope_top_left_right <= buffer) 279 280 @property 281 def is_vertical_level(self) -> bool: 282 """Check whether the top-to-bottom slope is approximately vertical. 283 284 Returns: 285 bool: True if the item is approximately vertical. 286 """ 287 # -0.1 -> 0.1 is strictly vertical 288 # give a bit of buffer 289 return bool(self.slope_left_top_bottom == math.inf) 290 291 @classmethod 292 def save(cls, path: pathlib.Path, items: list[TextItem]) -> None: 293 """Save found text items to a file. 294 295 Args: 296 path: Write the items to this file. 297 items: The items to save. 
298 299 Returns: 300 None 301 """ 302 logger.debug("Saving %s OCR output items.", len(items)) 303 304 fields = [ 305 "text", 306 "line_number", 307 "line_order", 308 "top_left_x", 309 "top_left_y", 310 "top_right_x", 311 "top_right_y", 312 "bottom_right_x", 313 "bottom_right_y", 314 "bottom_left_x", 315 "bottom_left_y", 316 ] 317 with path.open("w", newline="", encoding="utf8") as file_path: 318 writer = csv.DictWriter(file_path, fields) 319 writer.writeheader() 320 sorted_items = sorted( 321 items, 322 key=lambda i: (i.line_number or 0, i.line_order or 0), 323 ) 324 writer.writerows([dataclasses.asdict(i) for i in sorted_items]) 325 326 logger.debug("Saved OCR items to '%s'.", path) 327 328 @classmethod 329 def load(cls, path: pathlib.Path) -> typing.Generator[TextItem, typing.Any, None]: 330 """Load found text items from a file. 331 332 Args: 333 path: The path to the file containing items. 334 335 Returns: 336 typing.Generator["TextItem", typing.Any, None]: Items from the file. 337 """ 338 logger.debug("Loading OCR output items.") 339 count = 0 340 341 with path.open(encoding="utf8") as file_path: 342 reader = csv.DictReader(file_path) 343 for row in reader: 344 line_number = row.get("line_number", "").strip() 345 line_number = int(line_number) if line_number else None 346 347 line_order = row.get("line_order", "").strip() 348 line_order = int(line_order) if line_order else None 349 350 count += 1 351 352 yield TextItem( 353 text=row["text"], 354 line_number=line_number, 355 line_order=line_order, 356 top_left_x=float(row["top_left_x"]), 357 top_left_y=float(row["top_left_y"]), 358 top_right_x=float(row["top_right_x"]), 359 top_right_y=float(row["top_right_y"]), 360 bottom_right_x=float(row["bottom_right_x"]), 361 bottom_right_y=float(row["bottom_right_y"]), 362 bottom_left_x=float(row["bottom_left_x"]), 363 bottom_left_y=float(row["bottom_left_y"]), 364 ) 365 366 logger.debug("Loaded %s OCR items from '%s'.", count, path) 367 368 @classmethod 369 def 
from_prediction( 370 cls, 371 prediction: tuple[typing.Any, typing.Any], 372 ) -> TextItem: 373 """Convert from (text, box) to item. 374 375 Box is (top left, top right, bottom right, bottom left). 376 Its structure is [[startX,startY], [endX,startY], [endX,endY], [startX, endY]]. 377 378 Args: 379 prediction: The text recognised in an image. 380 381 Returns: 382 TextItem: A text item representing the recognised text. 383 """ 384 ( 385 text, 386 ( 387 (top_left_x, top_left_y), 388 (top_right_x, top_right_y), 389 (bottom_right_x, bottom_right_y), 390 (bottom_left_x, bottom_left_y), 391 ), 392 ) = prediction 393 return TextItem( 394 text=text, 395 top_left_x=top_left_x, 396 top_left_y=top_left_y, 397 top_right_x=top_right_x, 398 top_right_y=top_right_y, 399 bottom_right_x=bottom_right_x, 400 bottom_right_y=bottom_right_y, 401 bottom_left_x=bottom_left_x, 402 bottom_left_y=bottom_left_y, 403 ) 404 405 @classmethod 406 def order_text_lines( 407 cls, 408 items: list[TextItem], 409 ) -> list[list[TextItem]]: 410 """Put items into lines of text (top -> bottom, left -> right).""" 411 if not items: 412 items = [] 413 414 logger.debug("Arranging text into lines.") 415 416 lines = [] 417 current_line: list[TextItem] = [] 418 for item in items: 419 if not item.is_horizontal_level: 420 # exclude items that are too sloped 421 continue 422 423 if len(current_line) < 1 or any(item.is_same_line(i) for i in current_line): 424 current_line.append(item) 425 426 elif len(current_line) > 0: 427 # store current line 428 current_line = sorted(current_line, key=lambda x: x.top_left) 429 lines.append(current_line) 430 431 # create new line 432 current_line = [item] 433 434 # include last items 435 if len(current_line) > 0: 436 lines.append(current_line) 437 438 # update items to set line number and line order 439 for line_index, line in enumerate(lines): 440 for item_index, item in enumerate(line): 441 item.line_number = line_index + 1 442 item.line_order = item_index + 1 443 444 return 
lines 445 446 @property 447 def to_prediction( 448 self, 449 ) -> tuple[ 450 str, 451 tuple[ 452 tuple[float | np.float32, float | np.float32], 453 tuple[float | np.float32, float | np.float32], 454 tuple[float | np.float32, float | np.float32], 455 tuple[float | np.float32, float | np.float32], 456 ], 457 ]: 458 """Convert to prediction format.""" 459 return ( 460 self.text, 461 ( 462 (self.top_left_x, self.top_left_y), 463 (self.top_right_x, self.top_right_y), 464 (self.bottom_right_x, self.bottom_right_y), 465 (self.bottom_left_x, self.bottom_left_y), 466 ), 467 ) 468 469 def _slope( 470 self, 471 pt_x1: float | np.float32, 472 pt_y1: float | np.float32, 473 pt_x2: float | np.float32, 474 pt_y2: float | np.float32, 475 ) -> float | np.float32: 476 """Get the slope of a line.""" 477 y_diff = pt_y2 - pt_y1 478 x_diff = pt_x2 - pt_x1 479 try: 480 return y_diff / x_diff 481 except ZeroDivisionError: 482 return math.inf if y_diff >= 0 else -math.inf 483 484 def __str__(self) -> str: 485 """Convert to a string.""" 486 line_info = f"({self.line_number or 0}:{self.line_order})" 487 pos_info = f"[top left:{self.top_left}, top slope: {self.slope_top_left_right}]" 488 return f"{self.text} {line_info} {pos_info}" 489 490 491@beartype 492@dataclasses.dataclass 493class KerasOcrResult: 494 """Result from running keras-ocr.""" 495 496 output_dir: pathlib.Path 497 annotations_file: pathlib.Path 498 predictions_file: pathlib.Path 499 items: list[list[TextItem]]
@beartype
@dataclasses.dataclass
class TextItem:
    """One found text item (could be a word or phrase) in an image.

    An item holds the recognised text and the four corner points of its
    bounding box. ``line_number`` and ``line_order`` default to ``None`` and
    are filled in by :meth:`order_text_lines` or when read by :meth:`load`.
    """

    text: str
    """The recognised text."""

    top_left_x: float | np.float32
    top_left_y: float | np.float32

    top_right_x: float | np.float32
    top_right_y: float | np.float32

    bottom_right_x: float | np.float32
    bottom_right_y: float | np.float32

    bottom_left_x: float | np.float32
    bottom_left_y: float | np.float32

    line_number: int | None = None
    line_order: int | None = None

    @property
    def top_left(self) -> tuple[float | np.float32, float | np.float32]:
        """Get the top left point.

        Returns:
            The x and y coordinates.
        """
        return self.top_left_x, self.top_left_y

    @property
    def top_right(self) -> tuple[float | np.float32, float | np.float32]:
        """Get the top right point.

        Returns:
            The x and y coordinates.
        """
        return self.top_right_x, self.top_right_y

    @property
    def bottom_right(self) -> tuple[float | np.float32, float | np.float32]:
        """Get the bottom right point.

        Returns:
            The x and y coordinates.
        """
        return self.bottom_right_x, self.bottom_right_y

    @property
    def bottom_left(self) -> tuple[float | np.float32, float | np.float32]:
        """Get the bottom left point.

        Returns:
            The x and y coordinates.
        """
        return self.bottom_left_x, self.bottom_left_y

    @property
    def top_length(self) -> float | np.float32:
        """Get the length of the top side.

        Returns:
            float: The straight-line distance from the top left point to the
            top right point.
        """
        return math.hypot(
            float(self.top_right_x) - float(self.top_left_x),
            float(self.top_right_y) - float(self.top_left_y),
        )

    @property
    def left_length(self) -> float | np.float32:
        """Get the length of the left side.

        Returns:
            float: The straight-line distance from the top left point to the
            bottom left point.
        """
        return math.hypot(
            float(self.top_left_y) - float(self.bottom_left_y),
            float(self.top_left_x) - float(self.bottom_left_x),
        )

    @property
    def line_bounds(self) -> tuple[float | np.float32, float | np.float32]:
        """Line bounds from top of text to bottom of text.

        Returns:
            The smallest and the largest y coordinate of the four corners.
        """
        corner_ys = [
            float(self.top_left_y),
            float(self.top_right_y),
            float(self.bottom_left_y),
            float(self.bottom_right_y),
        ]
        return min(corner_ys), max(corner_ys)

    def is_same_line(self, other: TextItem) -> bool:
        """Check if the vertical middle bands of this item and another overlap.

        Each item's band is its ``line_bounds`` with one third of the height
        trimmed from the top and from the bottom, i.e. the midpoint +- 1/6 of
        the height of the text.

        Args:
            other (TextItem): The text item to compare.

        Returns:
            bool: True if the two bands overlap (touching counts), otherwise false.
        """
        if not other:
            return False
        self_top, self_bottom = self.line_bounds
        self_third = (self_bottom - self_top) / 3
        self_top += self_third
        self_bottom -= self_third

        other_top, other_bottom = other.line_bounds
        other_third = (other_bottom - other_top) / 3
        other_top += other_third
        other_bottom -= other_third

        return bool(self_top <= other_bottom and other_top <= self_bottom)

    @property
    def slope_top_left_right(self) -> float | np.float32:
        """Get the top slope from the left to the right.

        Returns:
            float: The slope.
        """
        return self._slope(
            self.top_left_x,
            self.top_left_y,
            self.top_right_x,
            self.top_right_y,
        )

    @property
    def slope_top_right_left(self) -> float | np.float32:
        """Get the top slope from the right to the left.

        Returns:
            float: The slope.
        """
        return self._slope(
            self.top_right_x,
            self.top_right_y,
            self.top_left_x,
            self.top_left_y,
        )

    @property
    def slope_left_top_bottom(self) -> float | np.float32:
        """Get the left slope from the top to the bottom.

        Returns:
            float: The slope.
        """
        return self._slope(
            self.top_left_x,
            self.top_left_y,
            self.bottom_left_x,
            self.bottom_left_y,
        )

    @property
    def slope_left_bottom_top(self) -> float | np.float32:
        """Get the left slope from the bottom to the top.

        Returns:
            float: The slope.
        """
        return self._slope(
            self.bottom_left_x,
            self.bottom_left_y,
            self.top_left_x,
            self.top_left_y,
        )

    @property
    def slope_bottom_left_right(self) -> float | np.float32:
        """Get the bottom slope from the left to the right.

        Returns:
            float: The slope.
        """
        return self._slope(
            self.bottom_left_x,
            self.bottom_left_y,
            self.bottom_right_x,
            self.bottom_right_y,
        )

    @property
    def slope_bottom_right_left(self) -> float | np.float32:
        """Get the bottom slope from the right to the left.

        Returns:
            float: The slope.
        """
        return self._slope(
            self.bottom_right_x,
            self.bottom_right_y,
            self.bottom_left_x,
            self.bottom_left_y,
        )

    @property
    def slope_right_top_bottom(self) -> float | np.float32:
        """Get the right slope from the top to the bottom.

        Returns:
            float: The slope.
        """
        return self._slope(
            self.top_right_x,
            self.top_right_y,
            self.bottom_right_x,
            self.bottom_right_y,
        )

    @property
    def slope_right_bottom_top(self) -> float | np.float32:
        """Get the right slope from the bottom to the top.

        Returns:
            float: The slope.
        """
        return self._slope(
            self.bottom_right_x,
            self.bottom_right_y,
            self.top_right_x,
            self.top_right_y,
        )

    @property
    def is_horizontal_level(self) -> bool:
        """Check whether the left-to-right slope is approximately horizontal.

        Returns:
            bool: True if the top slope is within +-0.09 of zero.
        """
        # a slope between -0.1 and 0.1 is treated as horizontal;
        # the threshold used sits just inside that range
        buffer = 0.09
        return bool(-buffer <= self.slope_top_left_right <= buffer)

    @property
    def is_vertical_level(self) -> bool:
        """Check whether the top-to-bottom slope of the left side is vertical.

        Returns:
            bool: True only when ``slope_left_top_bottom`` equals positive
            infinity; no tolerance is applied.
        """
        return bool(self.slope_left_top_bottom == math.inf)

    @classmethod
    def save(cls, path: pathlib.Path, items: list[TextItem]) -> None:
        """Save found text items to a CSV file.

        Rows are written sorted by line number then line order, with unset
        values sorting as 0.

        Args:
            path: Write the items to this file.
            items: The items to save.

        Returns:
            None
        """
        logger.debug("Saving %s OCR output items.", len(items))

        fields = [
            "text",
            "line_number",
            "line_order",
            "top_left_x",
            "top_left_y",
            "top_right_x",
            "top_right_y",
            "bottom_right_x",
            "bottom_right_y",
            "bottom_left_x",
            "bottom_left_y",
        ]
        with path.open("w", newline="", encoding="utf8") as csv_file:
            writer = csv.DictWriter(csv_file, fields)
            writer.writeheader()
            sorted_items = sorted(
                items,
                key=lambda i: (i.line_number or 0, i.line_order or 0),
            )
            writer.writerows([dataclasses.asdict(i) for i in sorted_items])

        logger.debug("Saved OCR items to '%s'.", path)

    @classmethod
    def load(cls, path: pathlib.Path) -> typing.Generator[TextItem, typing.Any, None]:
        """Load found text items from a CSV file.

        This is a generator: the file is read lazily, one row per item, and
        the final log message is emitted only once the generator is exhausted.

        Args:
            path: The path to the file containing items.

        Returns:
            typing.Generator["TextItem", typing.Any, None]: Items from the file.

        Raises:
            KeyError: If a row lacks the text column or a coordinate column.
            ValueError: If a coordinate or line value cannot be converted.
        """
        logger.debug("Loading OCR output items.")
        count = 0

        # newline="" lets the csv module handle line endings itself,
        # matching how save() writes the file.
        with path.open(newline="", encoding="utf8") as csv_file:
            for row in csv.DictReader(csv_file):
                # DictReader fills cells missing from a short row with None,
                # so fall back to "" before stripping.
                line_number = (row.get("line_number") or "").strip()
                line_order = (row.get("line_order") or "").strip()

                count += 1

                yield cls(
                    text=row["text"],
                    line_number=int(line_number) if line_number else None,
                    line_order=int(line_order) if line_order else None,
                    top_left_x=float(row["top_left_x"]),
                    top_left_y=float(row["top_left_y"]),
                    top_right_x=float(row["top_right_x"]),
                    top_right_y=float(row["top_right_y"]),
                    bottom_right_x=float(row["bottom_right_x"]),
                    bottom_right_y=float(row["bottom_right_y"]),
                    bottom_left_x=float(row["bottom_left_x"]),
                    bottom_left_y=float(row["bottom_left_y"]),
                )

        logger.debug("Loaded %s OCR items from '%s'.", count, path)

    @classmethod
    def from_prediction(
        cls,
        prediction: tuple[typing.Any, typing.Any],
    ) -> TextItem:
        """Convert from (text, box) to item.

        Box is (top left, top right, bottom right, bottom left).
        Its structure is [[startX,startY], [endX,startY], [endX,endY], [startX, endY]].

        Args:
            prediction: The text recognised in an image.

        Returns:
            TextItem: A text item representing the recognised text.
        """
        (
            text,
            (
                (top_left_x, top_left_y),
                (top_right_x, top_right_y),
                (bottom_right_x, bottom_right_y),
                (bottom_left_x, bottom_left_y),
            ),
        ) = prediction
        return cls(
            text=text,
            top_left_x=top_left_x,
            top_left_y=top_left_y,
            top_right_x=top_right_x,
            top_right_y=top_right_y,
            bottom_right_x=bottom_right_x,
            bottom_right_y=bottom_right_y,
            bottom_left_x=bottom_left_x,
            bottom_left_y=bottom_left_y,
        )

    @classmethod
    def order_text_lines(
        cls,
        items: list[TextItem],
    ) -> list[list[TextItem]]:
        """Put items into lines of text (top -> bottom, left -> right).

        Items are visited in the given order (the input is not re-sorted
        vertically). An item joins the current line when it is on the same
        line as any item already in it; otherwise the current line is closed
        and a new one is started. Items that are not approximately horizontal
        are dropped. Every line is sorted by its items' top left points.

        Each kept item is updated in place with a 1-based ``line_number`` and
        ``line_order``.

        Args:
            items: The items to arrange.

        Returns:
            The lines, each a list of items ordered left to right.
        """
        logger.debug("Arranging text into lines.")

        lines: list[list[TextItem]] = []
        current_line: list[TextItem] = []
        for item in items or []:
            if not item.is_horizontal_level:
                # exclude items that are too sloped
                continue

            if not current_line or any(item.is_same_line(i) for i in current_line):
                current_line.append(item)
                continue

            # store current line and start a new one with this item
            lines.append(sorted(current_line, key=lambda x: x.top_left))
            current_line = [item]

        # include last items, sorted like every other line
        if current_line:
            lines.append(sorted(current_line, key=lambda x: x.top_left))

        # update items to set line number and line order
        for line_index, line in enumerate(lines, start=1):
            for item_index, item in enumerate(line, start=1):
                item.line_number = line_index
                item.line_order = item_index

        return lines

    @property
    def to_prediction(
        self,
    ) -> tuple[
        str,
        tuple[
            tuple[float | np.float32, float | np.float32],
            tuple[float | np.float32, float | np.float32],
            tuple[float | np.float32, float | np.float32],
            tuple[float | np.float32, float | np.float32],
        ],
    ]:
        """Convert to prediction format.

        Returns:
            The text and the box as (top left, top right, bottom right,
            bottom left) points.
        """
        return (
            self.text,
            (
                (self.top_left_x, self.top_left_y),
                (self.top_right_x, self.top_right_y),
                (self.bottom_right_x, self.bottom_right_y),
                (self.bottom_left_x, self.bottom_left_y),
            ),
        )

    def _slope(
        self,
        pt_x1: float | np.float32,
        pt_y1: float | np.float32,
        pt_x2: float | np.float32,
        pt_y2: float | np.float32,
    ) -> float | np.float32:
        """Get the slope of the line from point 1 to point 2.

        Returns:
            The change in y divided by the change in x. When the change in x
            is zero the result is positive infinity if the change in y is
            zero or positive, otherwise negative infinity.
        """
        y_diff = pt_y2 - pt_y1
        x_diff = pt_x2 - pt_x1
        # Test for zero explicitly: NumPy scalars do not raise
        # ZeroDivisionError, they return inf/nan and emit a RuntimeWarning.
        if x_diff == 0:
            return math.inf if y_diff >= 0 else -math.inf
        return y_diff / x_diff

    def __str__(self) -> str:
        """Convert to a string."""
        line_info = f"({self.line_number or 0}:{self.line_order})"
        pos_info = f"[top left:{self.top_left}, top slope: {self.slope_top_left_right}]"
        return f"{self.text} {line_info} {pos_info}"
One found text item (could be a word or phrase) in an image.
@property
def top_left(self) -> tuple[float | np.float32, float | np.float32]:
    """Return the top left corner of the box.

    Returns:
        The corner as an ``(x, y)`` pair.
    """
    x_pos = self.top_left_x
    y_pos = self.top_left_y
    return (x_pos, y_pos)
Get the top left point.
Returns:
The x and y coordinates.
@property
def top_right(self) -> tuple[float | np.float32, float | np.float32]:
    """Return the top right corner of the box.

    Returns:
        The corner as an ``(x, y)`` pair.
    """
    x_pos = self.top_right_x
    y_pos = self.top_right_y
    return (x_pos, y_pos)
Get the top right point.
Returns:
The x and y coordinates.
@property
def bottom_right(self) -> tuple[float | np.float32, float | np.float32]:
    """Return the bottom right corner of the box.

    Returns:
        The corner as an ``(x, y)`` pair.
    """
    x_pos = self.bottom_right_x
    y_pos = self.bottom_right_y
    return (x_pos, y_pos)
Get the bottom right point.
Returns:
The x and y coordinates.
@property
def bottom_left(self) -> tuple[float | np.float32, float | np.float32]:
    """Return the bottom left corner of the box.

    Returns:
        The corner as an ``(x, y)`` pair.
    """
    x_pos = self.bottom_left_x
    y_pos = self.bottom_left_y
    return (x_pos, y_pos)
Get the bottom left point.
Returns:
The x and y coordinates.
@property
def top_length(self) -> float | np.float32:
    """Get the length of the top side.

    Returns:
        float: The straight-line distance from the top left point to the
        top right point.
    """
    # math.hypot replaces the hand-rolled sqrt of squares; for a level top
    # edge it returns the absolute x difference.
    return math.hypot(
        float(self.top_right_x) - float(self.top_left_x),
        float(self.top_right_y) - float(self.top_left_y),
    )
Get the length of the top side.
Returns:
float: The length.
@property
def left_length(self) -> float | np.float32:
    """Get the length of the left side.

    Returns:
        float: The straight-line distance from the top left point to the
        bottom left point.
    """
    # math.hypot replaces the hand-rolled sqrt of squares; for an upright
    # left edge it returns the absolute y difference.
    return math.hypot(
        float(self.top_left_y) - float(self.bottom_left_y),
        float(self.top_left_x) - float(self.bottom_left_x),
    )
Get the length of the left side.
Returns:
float: The length.
@property
def line_bounds(self) -> tuple[float | np.float32, float | np.float32]:
    """Return the vertical extent of the box.

    Returns:
        The smallest and the largest y coordinate of the four corners.
    """
    corner_ys = [
        float(self.top_left_y),
        float(self.top_right_y),
        float(self.bottom_left_y),
        float(self.bottom_right_y),
    ]
    highest = min(corner_ys)
    lowest = max(corner_ys)
    return highest, lowest
Line bounds from top of text to bottom of text.
def is_same_line(self, other: TextItem) -> bool:
    """Check whether this item and another sit on the same line of text.

    Each item is reduced to a band: its ``line_bounds`` with one third of
    the height trimmed from the top and from the bottom (the midpoint +- 1/6
    of the height). The items are on the same line when the bands overlap.

    Args:
        other (TextItem): The text item to compare.

    Returns:
        bool: True if the two bands overlap (touching counts), otherwise false.
    """
    if not other:
        return False

    own_top, own_bottom = self.line_bounds
    own_inset = (own_bottom - own_top) / 3
    own_band = (own_top + own_inset, own_bottom - own_inset)

    their_top, their_bottom = other.line_bounds
    their_inset = (their_bottom - their_top) / 3
    their_band = (their_top + their_inset, their_bottom - their_inset)

    overlaps = own_band[0] <= their_band[1] and their_band[0] <= own_band[1]
    return bool(overlaps)
Check if the vertical midpoints of this item and another item overlap.
Calculated as the midpoint +- 1/6 of the height of the text (the middle third of each item's vertical extent).
Arguments:
- other (TextItem): The text item to compare.
Returns:
bool: True if this item and the other item overlap, otherwise false.
@property
def slope_top_left_right(self) -> float | np.float32:
    """Return the slope of the top edge, measured from left to right.

    Returns:
        float: The slope.
    """
    start_x, start_y = self.top_left_x, self.top_left_y
    end_x, end_y = self.top_right_x, self.top_right_y
    return self._slope(start_x, start_y, end_x, end_y)
Get the top slope from the left to the right.
Returns:
float: The slope.
@property
def slope_top_right_left(self) -> float | np.float32:
    """Return the slope of the top edge, measured from right to left.

    Returns:
        float: The slope.
    """
    start_x, start_y = self.top_right_x, self.top_right_y
    end_x, end_y = self.top_left_x, self.top_left_y
    return self._slope(start_x, start_y, end_x, end_y)
Get the top slope from the right to the left.
Returns:
float: The slope.
@property
def slope_left_top_bottom(self) -> float | np.float32:
    """Return the slope of the left edge, measured from top to bottom.

    Returns:
        float: The slope.
    """
    start_x, start_y = self.top_left_x, self.top_left_y
    end_x, end_y = self.bottom_left_x, self.bottom_left_y
    return self._slope(start_x, start_y, end_x, end_y)
Get the left slope from the top to the bottom.
Returns:
float: The slope.
@property
def slope_left_bottom_top(self) -> float | np.float32:
    """Return the slope of the left edge, measured from bottom to top.

    Returns:
        float: The slope.
    """
    start_x, start_y = self.bottom_left_x, self.bottom_left_y
    end_x, end_y = self.top_left_x, self.top_left_y
    return self._slope(start_x, start_y, end_x, end_y)
Get the left slope from the bottom to the top.
Returns:
float: The slope.
@property
def slope_bottom_left_right(self) -> float | np.float32:
    """Return the slope of the bottom edge, measured from left to right.

    Returns:
        float: The slope.
    """
    start_x, start_y = self.bottom_left_x, self.bottom_left_y
    end_x, end_y = self.bottom_right_x, self.bottom_right_y
    return self._slope(start_x, start_y, end_x, end_y)
Get the bottom slope from the left to the right.
Returns:
float: The slope.
@property
def slope_bottom_right_left(self) -> float | np.float32:
    """Return the slope of the bottom edge, measured from right to left.

    Returns:
        float: The slope.
    """
    start_x, start_y = self.bottom_right_x, self.bottom_right_y
    end_x, end_y = self.bottom_left_x, self.bottom_left_y
    return self._slope(start_x, start_y, end_x, end_y)
Get the bottom slope from the right to the left.
Returns:
float: The slope.
@property
def slope_right_top_bottom(self) -> float | np.float32:
    """Return the slope of the right edge, measured from top to bottom.

    Returns:
        float: The slope.
    """
    start_x, start_y = self.top_right_x, self.top_right_y
    end_x, end_y = self.bottom_right_x, self.bottom_right_y
    return self._slope(start_x, start_y, end_x, end_y)
Get the right slope from the top to the bottom.
Returns:
float: The slope.
@property
def slope_right_bottom_top(self) -> float | np.float32:
    """Return the slope of the right edge, measured from bottom to top.

    Returns:
        float: The slope.
    """
    start_x, start_y = self.bottom_right_x, self.bottom_right_y
    end_x, end_y = self.top_right_x, self.top_right_y
    return self._slope(start_x, start_y, end_x, end_y)
Get the right slope from the bottom to the top.
Returns:
float: The slope.
@property
def is_horizontal_level(self) -> bool:
    """Check whether the top edge is close enough to level.

    Returns:
        bool: True if the left-to-right top slope is within +-0.09 of zero.
    """
    # slopes between -0.1 and 0.1 count as horizontal; the tolerance used
    # sits just inside that range
    tolerance = 0.09
    top_slope = self.slope_top_left_right
    return bool(abs(top_slope) <= tolerance)
Check whether the left-to-right slope is approximately horizontal.
Returns:
bool: True if the item is approximately horizontal.
@property
def is_vertical_level(self) -> bool:
    """Check whether the left side of the item is vertical.

    Returns:
        bool: True only if ``slope_left_top_bottom`` equals positive infinity.
    """
    # NOTE(review): this is an exact comparison with no tolerance; it
    # presumably relies on ``_slope`` reporting ``math.inf`` for a vertical
    # side - confirm against ``_slope``.
    left_slope = self.slope_left_top_bottom
    return bool(left_slope == math.inf)
Check whether the top-to-bottom slope is approximately vertical.
Returns:
bool: True if the item is approximately vertical.
@classmethod
def save(cls, path: pathlib.Path, items: list[TextItem]) -> None:
    """Write text items to a CSV file.

    Rows are written ordered by line number, then by line order.

    Args:
        path: The file to write; it is opened in write mode.
        items: The items to save.

    Returns:
        None
    """
    logger.debug("Saving %s OCR output items.", len(items))

    def sort_key(entry: TextItem) -> tuple[int, int]:
        # Items without a line number / line order are ordered as if they were 0.
        return entry.line_number or 0, entry.line_order or 0

    column_names = ["text", "line_number", "line_order"]
    column_names += ["top_left_x", "top_left_y"]
    column_names += ["top_right_x", "top_right_y"]
    column_names += ["bottom_right_x", "bottom_right_y"]
    column_names += ["bottom_left_x", "bottom_left_y"]

    with path.open("w", newline="", encoding="utf8") as csv_file:
        writer = csv.DictWriter(csv_file, column_names)
        writer.writeheader()
        rows = [dataclasses.asdict(entry) for entry in sorted(items, key=sort_key)]
        writer.writerows(rows)

    logger.debug("Saved OCR items to '%s'.", path)
Save found text items to a file.
Arguments:
- path: Write the items to this file.
- items: The items to save.
Returns:
None
@classmethod
def load(cls, path: pathlib.Path) -> typing.Generator[TextItem, typing.Any, None]:
    """Load found text items from a CSV file.

    This is a generator: the file is opened and read as the result is
    iterated, and the final debug message is logged only after every row
    has been consumed.

    Args:
        path: The path to the file containing items.

    Yields:
        TextItem: One item per data row. ``line_number`` and ``line_order``
            are None when the cell is empty or missing.

    Raises:
        KeyError: If the text column or a coordinate column is not in the header.
        ValueError: If a coordinate or line cell cannot be converted to a number.
    """
    logger.debug("Loading OCR output items.")
    count = 0

    def optional_int(row: dict, column: str) -> int | None:
        # The column may be absent from the header, and DictReader fills the
        # missing cells of a short row with None; treat both as an empty cell.
        raw = (row.get(column) or "").strip()
        return int(raw) if raw else None

    # newline="" lets the csv module handle line endings itself, so newlines
    # embedded in quoted text survive (and it mirrors how save opens the file).
    with path.open(newline="", encoding="utf8") as csv_file:
        for row in csv.DictReader(csv_file):
            line_number = optional_int(row, "line_number")
            line_order = optional_int(row, "line_order")

            count += 1

            yield TextItem(
                text=row["text"],
                line_number=line_number,
                line_order=line_order,
                top_left_x=float(row["top_left_x"]),
                top_left_y=float(row["top_left_y"]),
                top_right_x=float(row["top_right_x"]),
                top_right_y=float(row["top_right_y"]),
                bottom_right_x=float(row["bottom_right_x"]),
                bottom_right_y=float(row["bottom_right_y"]),
                bottom_left_x=float(row["bottom_left_x"]),
                bottom_left_y=float(row["bottom_left_y"]),
            )

    logger.debug("Loaded %s OCR items from '%s'.", count, path)
Load found text items from a file.
Arguments:
- path: The path to the file containing items.
Returns:
typing.Generator["TextItem", typing.Any, None]: Items from the file.
@classmethod
def from_prediction(
    cls,
    prediction: tuple[typing.Any, typing.Any],
) -> TextItem:
    """Build a text item from a (text, box) prediction.

    The box holds four (x, y) points in the order
    top left, top right, bottom right, bottom left, i.e.
    [[startX,startY], [endX,startY], [endX,endY], [startX, endY]].

    Args:
        prediction: The text recognised in an image, paired with its box.

    Returns:
        TextItem: A text item representing the recognised text.
    """
    # Peel the structure apart one level at a time: pair -> corners -> coordinates.
    text, box = prediction
    top_left, top_right, bottom_right, bottom_left = box
    top_left_x, top_left_y = top_left
    top_right_x, top_right_y = top_right
    bottom_right_x, bottom_right_y = bottom_right
    bottom_left_x, bottom_left_y = bottom_left
    return TextItem(
        text=text,
        top_left_x=top_left_x,
        top_left_y=top_left_y,
        top_right_x=top_right_x,
        top_right_y=top_right_y,
        bottom_right_x=bottom_right_x,
        bottom_right_y=bottom_right_y,
        bottom_left_x=bottom_left_x,
        bottom_left_y=bottom_left_y,
    )
Convert from (text, box) to item.
Box is (top left, top right, bottom right, bottom left). Its structure is [[startX,startY], [endX,startY], [endX,endY], [startX, endY]].
Arguments:
- prediction: The text recognised in an image.
Returns:
TextItem: A text item representing the recognised text.
@classmethod
def order_text_lines(
    cls,
    items: list[TextItem],
) -> list[list[TextItem]]:
    """Put items into lines of text, each line ordered left -> right.

    Items are consumed in the given order. An item joins the current line
    when ``is_same_line`` is true for at least one item already on that
    line; otherwise the current line is closed and a new one is started.
    Lines are returned in the order they were started, so the input is
    presumably expected to already run top -> bottom (verify against callers).
    Items that are not ``is_horizontal_level`` are left out.

    As a side effect, ``line_number`` and ``line_order`` (both starting at 1)
    are set on every item that is placed on a line.

    Args:
        items: The text items to arrange.

    Returns:
        list[list[TextItem]]: The lines; every line is sorted by the
            top left point of its items.
    """
    logger.debug("Arranging text into lines.")

    lines: list[list[TextItem]] = []
    current_line: list[TextItem] = []
    for item in items or []:
        if not item.is_horizontal_level:
            # exclude items that are too sloped
            continue

        if not current_line or any(item.is_same_line(i) for i in current_line):
            current_line.append(item)
            continue

        # The item belongs to a new line: store the finished one and start over.
        lines.append(sorted(current_line, key=lambda x: x.top_left))
        current_line = [item]

    # Include the last items. They must be sorted like every other line,
    # otherwise the final line keeps its input order instead of left -> right.
    if current_line:
        lines.append(sorted(current_line, key=lambda x: x.top_left))

    # update items to set line number and line order
    for line_index, line in enumerate(lines, start=1):
        for item_index, item in enumerate(line, start=1):
            item.line_number = line_index
            item.line_order = item_index

    return lines
Put items into lines of text (top -> bottom, left -> right).
@property
def to_prediction(
    self,
) -> tuple[
    str,
    tuple[
        tuple[float | np.float32, float | np.float32],
        tuple[float | np.float32, float | np.float32],
        tuple[float | np.float32, float | np.float32],
        tuple[float | np.float32, float | np.float32],
    ],
]:
    """Convert to prediction format.

    Returns:
        A (text, box) pair, where box holds the (x, y) points in the order
        top left, top right, bottom right, bottom left.
    """
    text = self.text
    top_left = (self.top_left_x, self.top_left_y)
    top_right = (self.top_right_x, self.top_right_y)
    bottom_right = (self.bottom_right_x, self.bottom_right_y)
    bottom_left = (self.bottom_left_x, self.bottom_left_y)
    return text, (top_left, top_right, bottom_right, bottom_left)
Convert to prediction format.
@beartype
@dataclasses.dataclass
class KerasOcrResult:
    """Result from running keras-ocr.

    A data container with three filesystem paths and the recognised text items.
    """

    # NOTE(review): presumably the directory the OCR output was written to - confirm.
    output_dir: pathlib.Path
    # NOTE(review): presumably the annotations output of the run - confirm the file type with the producer.
    annotations_file: pathlib.Path
    # NOTE(review): presumably the saved predictions of the run - confirm the file format with the producer.
    predictions_file: pathlib.Path
    # Text items as a list of lists.
    # NOTE(review): presumably one inner list per line of text - confirm.
    items: list[list[TextItem]]
Result from running keras-ocr.