Edit on GitHub

leaf_focus.ocr.model

Models for OCR processing.

  1"""Models for OCR processing."""
  2
  3from __future__ import annotations
  4
  5import csv
  6import dataclasses
  7import logging
  8import math
  9import pathlib
 10
 11import numpy as np
 12
 13from beartype import beartype, typing
 14
 15
 16logger = logging.getLogger(__name__)
 17
 18
@beartype
@dataclasses.dataclass
class TextItem:
    """One found text item (could be a word or phrase) in an image.

    An item holds the recognised text, the x/y coordinates of the four
    corners of its bounding box, and (optionally) its position within the
    ordered lines of text.
    """

    text: str
    """The recognised text."""

    # Top-left corner of the bounding box.
    top_left_x: float | np.float32
    top_left_y: float | np.float32

    # Top-right corner of the bounding box.
    top_right_x: float | np.float32
    top_right_y: float | np.float32

    # Bottom-right corner of the bounding box.
    bottom_right_x: float | np.float32
    bottom_right_y: float | np.float32

    # Bottom-left corner of the bounding box.
    bottom_left_x: float | np.float32
    bottom_left_y: float | np.float32

    # 1-based line index and 1-based position within that line.
    # Both stay None until `order_text_lines` assigns them.
    line_number: int | None = None
    line_order: int | None = None
 41
 42    @property
 43    def top_left(self) -> tuple[float | np.float32, float | np.float32]:
 44        """Get the top left point.
 45
 46        Returns:
 47            The x and y coordinates.
 48        """
 49        return self.top_left_x, self.top_left_y
 50
 51    @property
 52    def top_right(self) -> tuple[float | np.float32, float | np.float32]:
 53        """Get the top right point.
 54
 55        Returns:
 56            The x and y coordinates.
 57        """
 58        return self.top_right_x, self.top_right_y
 59
 60    @property
 61    def bottom_right(self) -> tuple[float | np.float32, float | np.float32]:
 62        """Get the bottom right point.
 63
 64        Returns:
 65             The x and y coordinates.
 66        """
 67        return self.bottom_right_x, self.bottom_right_y
 68
 69    @property
 70    def bottom_left(self) -> tuple[float | np.float32, float | np.float32]:
 71        """Get the bottom left point.
 72
 73        Returns:
 74            The x and y coordinates.
 75        """
 76        return self.bottom_left_x, self.bottom_left_y
 77
 78    @property
 79    def top_length(self) -> float | np.float32:
 80        """Get the length of the top side.
 81
 82        Returns:
 83            float: The length.
 84        """
 85        # Get the length of the hypotenuse side.
 86        side1 = abs(float(self.top_right_x) - float(self.top_left_x))
 87        side2 = abs(float(self.top_right_y) - float(self.top_left_y))
 88        if side2 == 0:
 89            return side1
 90        return math.sqrt(pow(side1, 2) + pow(side2, 2))
 91
 92    @property
 93    def left_length(self) -> float | np.float32:
 94        """Get the length of the left side.
 95
 96        Returns:
 97            float: The length.
 98        """
 99        # Get the length of the hypotenuse side.
100        side1 = abs(float(self.top_left_y) - float(self.bottom_left_y))
101        side2 = abs(float(self.top_left_x) - float(self.bottom_left_x))
102        if side2 == 0:
103            return side1
104        return math.sqrt(pow(side1, 2) + pow(side2, 2))
105
106    @property
107    def line_bounds(self) -> tuple[float | np.float32, float | np.float32]:
108        """Line bounds from top of text to bottom of text."""
109        top_bound = min(
110            [
111                float(self.top_left_y),
112                float(self.top_right_y),
113                float(self.bottom_left_y),
114                float(self.bottom_right_y),
115            ],
116        )
117        bottom_bound = max(
118            [
119                float(self.top_left_y),
120                float(self.top_right_y),
121                float(self.bottom_left_y),
122                float(self.bottom_right_y),
123            ],
124        )
125        return top_bound, bottom_bound
126
127    def is_same_line(self, other: TextItem) -> bool:
128        """Check if the vertical midpoints of this item and another item overlap.
129
130        Calculated as the midpoint +- 1/3 of the height of the text.
131
132        Args:
133            other (TextItem): The text item to compare.
134
135        Returns:
136            bool: True if this item and the other item overlap, otherwise false.
137        """
138        if not other:
139            return False
140        self_bounds = self.line_bounds
141        self_top = self_bounds[0]
142        self_bottom = self_bounds[1]
143        self_third = (self_bottom - self_top) / 3
144        self_top += self_third
145        self_bottom -= self_third
146
147        other_bounds = other.line_bounds
148        other_top = other_bounds[0]
149        other_bottom = other_bounds[1]
150        other_third = (other_bottom - other_top) / 3
151        other_top += other_third
152        other_bottom -= other_third
153
154        return bool(self_top <= other_bottom and other_top <= self_bottom)
155
156    @property
157    def slope_top_left_right(self) -> float | np.float32:
158        """Get the top slope from the left to the right.
159
160        Returns:
161            float: The slope.
162        """
163        return self._slope(
164            self.top_left_x,
165            self.top_left_y,
166            self.top_right_x,
167            self.top_right_y,
168        )
169
170    @property
171    def slope_top_right_left(self) -> float | np.float32:
172        """Get the top slope from the right to the left.
173
174        Returns:
175            float: The slope.
176        """
177        return self._slope(
178            self.top_right_x,
179            self.top_right_y,
180            self.top_left_x,
181            self.top_left_y,
182        )
183
184    @property
185    def slope_left_top_bottom(self) -> float | np.float32:
186        """Get the left slope from the top to the bottom.
187
188        Returns:
189            float: The slope.
190        """
191        return self._slope(
192            self.top_left_x,
193            self.top_left_y,
194            self.bottom_left_x,
195            self.bottom_left_y,
196        )
197
198    @property
199    def slope_left_bottom_top(self) -> float | np.float32:
200        """Get the left slope from the bottom to the top.
201
202        Returns:
203            float: The slope.
204        """
205        return self._slope(
206            self.bottom_left_x,
207            self.bottom_left_y,
208            self.top_left_x,
209            self.top_left_y,
210        )
211
212    @property
213    def slope_bottom_left_right(self) -> float | np.float32:
214        """Get the bottom slope from the left to the right.
215
216        Returns:
217            float: The slope.
218        """
219        return self._slope(
220            self.bottom_left_x,
221            self.bottom_left_y,
222            self.bottom_right_x,
223            self.bottom_right_y,
224        )
225
226    @property
227    def slope_bottom_right_left(self) -> float | np.float32:
228        """Get the bottom slope from the right to the left.
229
230        Returns:
231            float: The slope.
232        """
233        return self._slope(
234            self.bottom_right_x,
235            self.bottom_right_y,
236            self.bottom_left_x,
237            self.bottom_left_y,
238        )
239
240    @property
241    def slope_right_top_bottom(self) -> float | np.float32:
242        """Get the right slope from the top to the bottom.
243
244        Returns:
245            float: The slope.
246        """
247        return self._slope(
248            self.top_right_x,
249            self.top_right_y,
250            self.bottom_right_x,
251            self.bottom_right_y,
252        )
253
254    @property
255    def slope_right_bottom_top(self) -> float | np.float32:
256        """Get the right slope from the bottom to the top.
257
258        Returns:
259            float: The slope.
260        """
261        return self._slope(
262            self.bottom_right_x,
263            self.bottom_right_y,
264            self.top_right_x,
265            self.top_right_y,
266        )
267
268    @property
269    def is_horizontal_level(self) -> bool:
270        """Check whether the left-to-right slope is approximately horizontal.
271
272        Returns:
273            bool: True if the item is approximately horizontal.
274        """
275        # -0.1 -> 0.1 is strictly horizontal
276        # give a bit of buffer
277        buffer = 0.09
278        return bool(-buffer <= self.slope_top_left_right <= buffer)
279
280    @property
281    def is_vertical_level(self) -> bool:
282        """Check whether the top-to-bottom slope is approximately vertical.
283
284        Returns:
285            bool: True if the item is approximately vertical.
286        """
287        # -0.1 -> 0.1 is strictly vertical
288        # give a bit of buffer
289        return bool(self.slope_left_top_bottom == math.inf)
290
291    @classmethod
292    def save(cls, path: pathlib.Path, items: list[TextItem]) -> None:
293        """Save found text items to a file.
294
295        Args:
296            path: Write the items to this file.
297            items: The items to save.
298
299        Returns:
300            None
301        """
302        logger.debug("Saving %s OCR output items.", len(items))
303
304        fields = [
305            "text",
306            "line_number",
307            "line_order",
308            "top_left_x",
309            "top_left_y",
310            "top_right_x",
311            "top_right_y",
312            "bottom_right_x",
313            "bottom_right_y",
314            "bottom_left_x",
315            "bottom_left_y",
316        ]
317        with path.open("w", newline="", encoding="utf8") as file_path:
318            writer = csv.DictWriter(file_path, fields)
319            writer.writeheader()
320            sorted_items = sorted(
321                items,
322                key=lambda i: (i.line_number or 0, i.line_order or 0),
323            )
324            writer.writerows([dataclasses.asdict(i) for i in sorted_items])
325
326        logger.debug("Saved OCR items to '%s'.", path)
327
328    @classmethod
329    def load(cls, path: pathlib.Path) -> typing.Generator[TextItem, typing.Any, None]:
330        """Load found text items from a file.
331
332        Args:
333            path: The path to the file containing items.
334
335        Returns:
336            typing.Generator["TextItem", typing.Any, None]: Items from the file.
337        """
338        logger.debug("Loading OCR output items.")
339        count = 0
340
341        with path.open(encoding="utf8") as file_path:
342            reader = csv.DictReader(file_path)
343            for row in reader:
344                line_number = row.get("line_number", "").strip()
345                line_number = int(line_number) if line_number else None
346
347                line_order = row.get("line_order", "").strip()
348                line_order = int(line_order) if line_order else None
349
350                count += 1
351
352                yield TextItem(
353                    text=row["text"],
354                    line_number=line_number,
355                    line_order=line_order,
356                    top_left_x=float(row["top_left_x"]),
357                    top_left_y=float(row["top_left_y"]),
358                    top_right_x=float(row["top_right_x"]),
359                    top_right_y=float(row["top_right_y"]),
360                    bottom_right_x=float(row["bottom_right_x"]),
361                    bottom_right_y=float(row["bottom_right_y"]),
362                    bottom_left_x=float(row["bottom_left_x"]),
363                    bottom_left_y=float(row["bottom_left_y"]),
364                )
365
366        logger.debug("Loaded %s OCR items from '%s'.", count, path)
367
368    @classmethod
369    def from_prediction(
370        cls,
371        prediction: tuple[typing.Any, typing.Any],
372    ) -> TextItem:
373        """Convert from (text, box) to item.
374
375        Box is (top left, top right, bottom right, bottom left).
376        Its structure is [[startX,startY], [endX,startY], [endX,endY], [startX, endY]].
377
378        Args:
379            prediction: The text recognised in an image.
380
381        Returns:
382            TextItem: A text item representing the recognised text.
383        """
384        (
385            text,
386            (
387                (top_left_x, top_left_y),
388                (top_right_x, top_right_y),
389                (bottom_right_x, bottom_right_y),
390                (bottom_left_x, bottom_left_y),
391            ),
392        ) = prediction
393        return TextItem(
394            text=text,
395            top_left_x=top_left_x,
396            top_left_y=top_left_y,
397            top_right_x=top_right_x,
398            top_right_y=top_right_y,
399            bottom_right_x=bottom_right_x,
400            bottom_right_y=bottom_right_y,
401            bottom_left_x=bottom_left_x,
402            bottom_left_y=bottom_left_y,
403        )
404
405    @classmethod
406    def order_text_lines(
407        cls,
408        items: list[TextItem],
409    ) -> list[list[TextItem]]:
410        """Put items into lines of text (top -> bottom, left -> right)."""
411        if not items:
412            items = []
413
414        logger.debug("Arranging text into lines.")
415
416        lines = []
417        current_line: list[TextItem] = []
418        for item in items:
419            if not item.is_horizontal_level:
420                # exclude items that are too sloped
421                continue
422
423            if len(current_line) < 1 or any(item.is_same_line(i) for i in current_line):
424                current_line.append(item)
425
426            elif len(current_line) > 0:
427                # store current line
428                current_line = sorted(current_line, key=lambda x: x.top_left)
429                lines.append(current_line)
430
431                # create new line
432                current_line = [item]
433
434        # include last items
435        if len(current_line) > 0:
436            lines.append(current_line)
437
438        # update items to set line number and line order
439        for line_index, line in enumerate(lines):
440            for item_index, item in enumerate(line):
441                item.line_number = line_index + 1
442                item.line_order = item_index + 1
443
444        return lines
445
446    @property
447    def to_prediction(
448        self,
449    ) -> tuple[
450        str,
451        tuple[
452            tuple[float | np.float32, float | np.float32],
453            tuple[float | np.float32, float | np.float32],
454            tuple[float | np.float32, float | np.float32],
455            tuple[float | np.float32, float | np.float32],
456        ],
457    ]:
458        """Convert to prediction format."""
459        return (
460            self.text,
461            (
462                (self.top_left_x, self.top_left_y),
463                (self.top_right_x, self.top_right_y),
464                (self.bottom_right_x, self.bottom_right_y),
465                (self.bottom_left_x, self.bottom_left_y),
466            ),
467        )
468
469    def _slope(
470        self,
471        pt_x1: float | np.float32,
472        pt_y1: float | np.float32,
473        pt_x2: float | np.float32,
474        pt_y2: float | np.float32,
475    ) -> float | np.float32:
476        """Get the slope of a line."""
477        y_diff = pt_y2 - pt_y1
478        x_diff = pt_x2 - pt_x1
479        try:
480            return y_diff / x_diff
481        except ZeroDivisionError:
482            return math.inf if y_diff >= 0 else -math.inf
483
484    def __str__(self) -> str:
485        """Convert to a string."""
486        line_info = f"({self.line_number or 0}:{self.line_order})"
487        pos_info = f"[top left:{self.top_left}, top slope: {self.slope_top_left_right}]"
488        return f"{self.text} {line_info} {pos_info}"
489
490
@beartype
@dataclasses.dataclass
class KerasOcrResult:
    """Result from running keras-ocr.

    Bundles three filesystem paths together with the recognised text items.
    """

    # NOTE(review): presumably the directory the OCR outputs were written
    # to - confirm against the code that builds this result.
    output_dir: pathlib.Path
    # NOTE(review): presumably the annotated-image output - confirm.
    annotations_file: pathlib.Path
    # NOTE(review): presumably the saved predictions (see TextItem.save) - confirm.
    predictions_file: pathlib.Path
    # Nested list of text items; the shape matches what
    # TextItem.order_text_lines returns (one inner list per line of text).
    items: list[list[TextItem]]
logger = <Logger leaf_focus.ocr.model (WARNING)>
@beartype
@dataclasses.dataclass
class TextItem:
 20@beartype
 21@dataclasses.dataclass
 22class TextItem:
 23    """One found text item (could be a word or phrase) in an image."""
 24
 25    text: str
 26    """The recognised text."""
 27
 28    top_left_x: float | np.float32
 29    top_left_y: float | np.float32
 30
 31    top_right_x: float | np.float32
 32    top_right_y: float | np.float32
 33
 34    bottom_right_x: float | np.float32
 35    bottom_right_y: float | np.float32
 36
 37    bottom_left_x: float | np.float32
 38    bottom_left_y: float | np.float32
 39
 40    line_number: int | None = None
 41    line_order: int | None = None
 42
 43    @property
 44    def top_left(self) -> tuple[float | np.float32, float | np.float32]:
 45        """Get the top left point.
 46
 47        Returns:
 48            The x and y coordinates.
 49        """
 50        return self.top_left_x, self.top_left_y
 51
 52    @property
 53    def top_right(self) -> tuple[float | np.float32, float | np.float32]:
 54        """Get the top right point.
 55
 56        Returns:
 57            The x and y coordinates.
 58        """
 59        return self.top_right_x, self.top_right_y
 60
 61    @property
 62    def bottom_right(self) -> tuple[float | np.float32, float | np.float32]:
 63        """Get the bottom right point.
 64
 65        Returns:
 66             The x and y coordinates.
 67        """
 68        return self.bottom_right_x, self.bottom_right_y
 69
 70    @property
 71    def bottom_left(self) -> tuple[float | np.float32, float | np.float32]:
 72        """Get the bottom left point.
 73
 74        Returns:
 75            The x and y coordinates.
 76        """
 77        return self.bottom_left_x, self.bottom_left_y
 78
 79    @property
 80    def top_length(self) -> float | np.float32:
 81        """Get the length of the top side.
 82
 83        Returns:
 84            float: The length.
 85        """
 86        # Get the length of the hypotenuse side.
 87        side1 = abs(float(self.top_right_x) - float(self.top_left_x))
 88        side2 = abs(float(self.top_right_y) - float(self.top_left_y))
 89        if side2 == 0:
 90            return side1
 91        return math.sqrt(pow(side1, 2) + pow(side2, 2))
 92
 93    @property
 94    def left_length(self) -> float | np.float32:
 95        """Get the length of the left side.
 96
 97        Returns:
 98            float: The length.
 99        """
100        # Get the length of the hypotenuse side.
101        side1 = abs(float(self.top_left_y) - float(self.bottom_left_y))
102        side2 = abs(float(self.top_left_x) - float(self.bottom_left_x))
103        if side2 == 0:
104            return side1
105        return math.sqrt(pow(side1, 2) + pow(side2, 2))
106
107    @property
108    def line_bounds(self) -> tuple[float | np.float32, float | np.float32]:
109        """Line bounds from top of text to bottom of text."""
110        top_bound = min(
111            [
112                float(self.top_left_y),
113                float(self.top_right_y),
114                float(self.bottom_left_y),
115                float(self.bottom_right_y),
116            ],
117        )
118        bottom_bound = max(
119            [
120                float(self.top_left_y),
121                float(self.top_right_y),
122                float(self.bottom_left_y),
123                float(self.bottom_right_y),
124            ],
125        )
126        return top_bound, bottom_bound
127
128    def is_same_line(self, other: TextItem) -> bool:
129        """Check if the vertical midpoints of this item and another item overlap.
130
131        Calculated as the midpoint +- 1/3 of the height of the text.
132
133        Args:
134            other (TextItem): The text item to compare.
135
136        Returns:
137            bool: True if this item and the other item overlap, otherwise false.
138        """
139        if not other:
140            return False
141        self_bounds = self.line_bounds
142        self_top = self_bounds[0]
143        self_bottom = self_bounds[1]
144        self_third = (self_bottom - self_top) / 3
145        self_top += self_third
146        self_bottom -= self_third
147
148        other_bounds = other.line_bounds
149        other_top = other_bounds[0]
150        other_bottom = other_bounds[1]
151        other_third = (other_bottom - other_top) / 3
152        other_top += other_third
153        other_bottom -= other_third
154
155        return bool(self_top <= other_bottom and other_top <= self_bottom)
156
157    @property
158    def slope_top_left_right(self) -> float | np.float32:
159        """Get the top slope from the left to the right.
160
161        Returns:
162            float: The slope.
163        """
164        return self._slope(
165            self.top_left_x,
166            self.top_left_y,
167            self.top_right_x,
168            self.top_right_y,
169        )
170
171    @property
172    def slope_top_right_left(self) -> float | np.float32:
173        """Get the top slope from the right to the left.
174
175        Returns:
176            float: The slope.
177        """
178        return self._slope(
179            self.top_right_x,
180            self.top_right_y,
181            self.top_left_x,
182            self.top_left_y,
183        )
184
185    @property
186    def slope_left_top_bottom(self) -> float | np.float32:
187        """Get the left slope from the top to the bottom.
188
189        Returns:
190            float: The slope.
191        """
192        return self._slope(
193            self.top_left_x,
194            self.top_left_y,
195            self.bottom_left_x,
196            self.bottom_left_y,
197        )
198
199    @property
200    def slope_left_bottom_top(self) -> float | np.float32:
201        """Get the left slope from the bottom to the top.
202
203        Returns:
204            float: The slope.
205        """
206        return self._slope(
207            self.bottom_left_x,
208            self.bottom_left_y,
209            self.top_left_x,
210            self.top_left_y,
211        )
212
213    @property
214    def slope_bottom_left_right(self) -> float | np.float32:
215        """Get the bottom slope from the left to the right.
216
217        Returns:
218            float: The slope.
219        """
220        return self._slope(
221            self.bottom_left_x,
222            self.bottom_left_y,
223            self.bottom_right_x,
224            self.bottom_right_y,
225        )
226
227    @property
228    def slope_bottom_right_left(self) -> float | np.float32:
229        """Get the bottom slope from the right to the left.
230
231        Returns:
232            float: The slope.
233        """
234        return self._slope(
235            self.bottom_right_x,
236            self.bottom_right_y,
237            self.bottom_left_x,
238            self.bottom_left_y,
239        )
240
241    @property
242    def slope_right_top_bottom(self) -> float | np.float32:
243        """Get the right slope from the top to the bottom.
244
245        Returns:
246            float: The slope.
247        """
248        return self._slope(
249            self.top_right_x,
250            self.top_right_y,
251            self.bottom_right_x,
252            self.bottom_right_y,
253        )
254
255    @property
256    def slope_right_bottom_top(self) -> float | np.float32:
257        """Get the right slope from the bottom to the top.
258
259        Returns:
260            float: The slope.
261        """
262        return self._slope(
263            self.bottom_right_x,
264            self.bottom_right_y,
265            self.top_right_x,
266            self.top_right_y,
267        )
268
269    @property
270    def is_horizontal_level(self) -> bool:
271        """Check whether the left-to-right slope is approximately horizontal.
272
273        Returns:
274            bool: True if the item is approximately horizontal.
275        """
276        # -0.1 -> 0.1 is strictly horizontal
277        # give a bit of buffer
278        buffer = 0.09
279        return bool(-buffer <= self.slope_top_left_right <= buffer)
280
281    @property
282    def is_vertical_level(self) -> bool:
283        """Check whether the top-to-bottom slope is approximately vertical.
284
285        Returns:
286            bool: True if the item is approximately vertical.
287        """
288        # -0.1 -> 0.1 is strictly vertical
289        # give a bit of buffer
290        return bool(self.slope_left_top_bottom == math.inf)
291
292    @classmethod
293    def save(cls, path: pathlib.Path, items: list[TextItem]) -> None:
294        """Save found text items to a file.
295
296        Args:
297            path: Write the items to this file.
298            items: The items to save.
299
300        Returns:
301            None
302        """
303        logger.debug("Saving %s OCR output items.", len(items))
304
305        fields = [
306            "text",
307            "line_number",
308            "line_order",
309            "top_left_x",
310            "top_left_y",
311            "top_right_x",
312            "top_right_y",
313            "bottom_right_x",
314            "bottom_right_y",
315            "bottom_left_x",
316            "bottom_left_y",
317        ]
318        with path.open("w", newline="", encoding="utf8") as file_path:
319            writer = csv.DictWriter(file_path, fields)
320            writer.writeheader()
321            sorted_items = sorted(
322                items,
323                key=lambda i: (i.line_number or 0, i.line_order or 0),
324            )
325            writer.writerows([dataclasses.asdict(i) for i in sorted_items])
326
327        logger.debug("Saved OCR items to '%s'.", path)
328
329    @classmethod
330    def load(cls, path: pathlib.Path) -> typing.Generator[TextItem, typing.Any, None]:
331        """Load found text items from a file.
332
333        Args:
334            path: The path to the file containing items.
335
336        Returns:
337            typing.Generator["TextItem", typing.Any, None]: Items from the file.
338        """
339        logger.debug("Loading OCR output items.")
340        count = 0
341
342        with path.open(encoding="utf8") as file_path:
343            reader = csv.DictReader(file_path)
344            for row in reader:
345                line_number = row.get("line_number", "").strip()
346                line_number = int(line_number) if line_number else None
347
348                line_order = row.get("line_order", "").strip()
349                line_order = int(line_order) if line_order else None
350
351                count += 1
352
353                yield TextItem(
354                    text=row["text"],
355                    line_number=line_number,
356                    line_order=line_order,
357                    top_left_x=float(row["top_left_x"]),
358                    top_left_y=float(row["top_left_y"]),
359                    top_right_x=float(row["top_right_x"]),
360                    top_right_y=float(row["top_right_y"]),
361                    bottom_right_x=float(row["bottom_right_x"]),
362                    bottom_right_y=float(row["bottom_right_y"]),
363                    bottom_left_x=float(row["bottom_left_x"]),
364                    bottom_left_y=float(row["bottom_left_y"]),
365                )
366
367        logger.debug("Loaded %s OCR items from '%s'.", count, path)
368
369    @classmethod
370    def from_prediction(
371        cls,
372        prediction: tuple[typing.Any, typing.Any],
373    ) -> TextItem:
374        """Convert from (text, box) to item.
375
376        Box is (top left, top right, bottom right, bottom left).
377        Its structure is [[startX,startY], [endX,startY], [endX,endY], [startX, endY]].
378
379        Args:
380            prediction: The text recognised in an image.
381
382        Returns:
383            TextItem: A text item representing the recognised text.
384        """
385        (
386            text,
387            (
388                (top_left_x, top_left_y),
389                (top_right_x, top_right_y),
390                (bottom_right_x, bottom_right_y),
391                (bottom_left_x, bottom_left_y),
392            ),
393        ) = prediction
394        return TextItem(
395            text=text,
396            top_left_x=top_left_x,
397            top_left_y=top_left_y,
398            top_right_x=top_right_x,
399            top_right_y=top_right_y,
400            bottom_right_x=bottom_right_x,
401            bottom_right_y=bottom_right_y,
402            bottom_left_x=bottom_left_x,
403            bottom_left_y=bottom_left_y,
404        )
405
406    @classmethod
407    def order_text_lines(
408        cls,
409        items: list[TextItem],
410    ) -> list[list[TextItem]]:
411        """Put items into lines of text (top -> bottom, left -> right)."""
412        if not items:
413            items = []
414
415        logger.debug("Arranging text into lines.")
416
417        lines = []
418        current_line: list[TextItem] = []
419        for item in items:
420            if not item.is_horizontal_level:
421                # exclude items that are too sloped
422                continue
423
424            if len(current_line) < 1 or any(item.is_same_line(i) for i in current_line):
425                current_line.append(item)
426
427            elif len(current_line) > 0:
428                # store current line
429                current_line = sorted(current_line, key=lambda x: x.top_left)
430                lines.append(current_line)
431
432                # create new line
433                current_line = [item]
434
435        # include last items
436        if len(current_line) > 0:
437            lines.append(current_line)
438
439        # update items to set line number and line order
440        for line_index, line in enumerate(lines):
441            for item_index, item in enumerate(line):
442                item.line_number = line_index + 1
443                item.line_order = item_index + 1
444
445        return lines
446
447    @property
448    def to_prediction(
449        self,
450    ) -> tuple[
451        str,
452        tuple[
453            tuple[float | np.float32, float | np.float32],
454            tuple[float | np.float32, float | np.float32],
455            tuple[float | np.float32, float | np.float32],
456            tuple[float | np.float32, float | np.float32],
457        ],
458    ]:
459        """Convert to prediction format."""
460        return (
461            self.text,
462            (
463                (self.top_left_x, self.top_left_y),
464                (self.top_right_x, self.top_right_y),
465                (self.bottom_right_x, self.bottom_right_y),
466                (self.bottom_left_x, self.bottom_left_y),
467            ),
468        )
469
470    def _slope(
471        self,
472        pt_x1: float | np.float32,
473        pt_y1: float | np.float32,
474        pt_x2: float | np.float32,
475        pt_y2: float | np.float32,
476    ) -> float | np.float32:
477        """Get the slope of a line."""
478        y_diff = pt_y2 - pt_y1
479        x_diff = pt_x2 - pt_x1
480        try:
481            return y_diff / x_diff
482        except ZeroDivisionError:
483            return math.inf if y_diff >= 0 else -math.inf
484
485    def __str__(self) -> str:
486        """Convert to a string."""
487        line_info = f"({self.line_number or 0}:{self.line_order})"
488        pos_info = f"[top left:{self.top_left}, top slope: {self.slope_top_left_right}]"
489        return f"{self.text} {line_info} {pos_info}"

One found text item (could be a word or phrase) in an image.

TextItem( text: str, top_left_x: float | numpy.float32, top_left_y: float | numpy.float32, top_right_x: float | numpy.float32, top_right_y: float | numpy.float32, bottom_right_x: float | numpy.float32, bottom_right_y: float | numpy.float32, bottom_left_x: float | numpy.float32, bottom_left_y: float | numpy.float32, line_number: int | None = None, line_order: int | None = None)
text: str

The recognised text.

top_left_x: float | numpy.float32
top_left_y: float | numpy.float32
top_right_x: float | numpy.float32
top_right_y: float | numpy.float32
bottom_right_x: float | numpy.float32
bottom_right_y: float | numpy.float32
bottom_left_x: float | numpy.float32
bottom_left_y: float | numpy.float32
line_number: int | None = None
line_order: int | None = None
top_left: tuple[float | numpy.float32, float | numpy.float32]
43    @property
44    def top_left(self) -> tuple[float | np.float32, float | np.float32]:
45        """Get the top left point.
46
47        Returns:
48            The x and y coordinates.
49        """
50        return self.top_left_x, self.top_left_y

Get the top left point.

Returns:

The x and y coordinates.

top_right: tuple[float | numpy.float32, float | numpy.float32]
52    @property
53    def top_right(self) -> tuple[float | np.float32, float | np.float32]:
54        """Get the top right point.
55
56        Returns:
57            The x and y coordinates.
58        """
59        return self.top_right_x, self.top_right_y

Get the top right point.

Returns:

The x and y coordinates.

bottom_right: tuple[float | numpy.float32, float | numpy.float32]
61    @property
62    def bottom_right(self) -> tuple[float | np.float32, float | np.float32]:
63        """Get the bottom right point.
64
65        Returns:
66             The x and y coordinates.
67        """
68        return self.bottom_right_x, self.bottom_right_y

Get the bottom right point.

Returns:

The x and y coordinates.

bottom_left: tuple[float | numpy.float32, float | numpy.float32]
70    @property
71    def bottom_left(self) -> tuple[float | np.float32, float | np.float32]:
72        """Get the bottom left point.
73
74        Returns:
75            The x and y coordinates.
76        """
77        return self.bottom_left_x, self.bottom_left_y

Get the bottom left point.

Returns:

The x and y coordinates.

top_length: float | numpy.float32
79    @property
80    def top_length(self) -> float | np.float32:
81        """Get the length of the top side.
82
83        Returns:
84            float: The length.
85        """
86        # Get the length of the hypotenuse side.
87        side1 = abs(float(self.top_right_x) - float(self.top_left_x))
88        side2 = abs(float(self.top_right_y) - float(self.top_left_y))
89        if side2 == 0:
90            return side1
91        return math.sqrt(pow(side1, 2) + pow(side2, 2))

Get the length of the top side.

Returns:

float: The length.

left_length: float | numpy.float32
 93    @property
 94    def left_length(self) -> float | np.float32:
 95        """Get the length of the left side.
 96
 97        Returns:
 98            float: The length.
 99        """
100        # Get the length of the hypotenuse side.
101        side1 = abs(float(self.top_left_y) - float(self.bottom_left_y))
102        side2 = abs(float(self.top_left_x) - float(self.bottom_left_x))
103        if side2 == 0:
104            return side1
105        return math.sqrt(pow(side1, 2) + pow(side2, 2))

Get the length of the left side.

Returns:

float: The length.

line_bounds: tuple[float | numpy.float32, float | numpy.float32]
107    @property
108    def line_bounds(self) -> tuple[float | np.float32, float | np.float32]:
109        """Line bounds from top of text to bottom of text."""
110        top_bound = min(
111            [
112                float(self.top_left_y),
113                float(self.top_right_y),
114                float(self.bottom_left_y),
115                float(self.bottom_right_y),
116            ],
117        )
118        bottom_bound = max(
119            [
120                float(self.top_left_y),
121                float(self.top_right_y),
122                float(self.bottom_left_y),
123                float(self.bottom_right_y),
124            ],
125        )
126        return top_bound, bottom_bound

Line bounds from top of text to bottom of text.

def is_same_line(self, other: TextItem) -> bool:
128    def is_same_line(self, other: TextItem) -> bool:
129        """Check if the vertical midpoints of this item and another item overlap.
130
131        Calculated as the midpoint +- 1/3 of the height of the text.
132
133        Args:
134            other (TextItem): The text item to compare.
135
136        Returns:
137            bool: True if this item and the other item overlap, otherwise false.
138        """
139        if not other:
140            return False
141        self_bounds = self.line_bounds
142        self_top = self_bounds[0]
143        self_bottom = self_bounds[1]
144        self_third = (self_bottom - self_top) / 3
145        self_top += self_third
146        self_bottom -= self_third
147
148        other_bounds = other.line_bounds
149        other_top = other_bounds[0]
150        other_bottom = other_bounds[1]
151        other_third = (other_bottom - other_top) / 3
152        other_top += other_third
153        other_bottom -= other_third
154
155        return bool(self_top <= other_bottom and other_top <= self_bottom)

Check if the vertical midpoints of this item and another item overlap.

Calculated using the middle third of each item's height (the midpoint +- 1/6 of the height of the text).

Arguments:
  • other (TextItem): The text item to compare.
Returns:

bool: True if this item and the other item overlap, otherwise false.

slope_top_left_right: float | numpy.float32
157    @property
158    def slope_top_left_right(self) -> float | np.float32:
159        """Get the top slope from the left to the right.
160
161        Returns:
162            float: The slope.
163        """
164        return self._slope(
165            self.top_left_x,
166            self.top_left_y,
167            self.top_right_x,
168            self.top_right_y,
169        )

Get the top slope from the left to the right.

Returns:

float: The slope.

slope_top_right_left: float | numpy.float32
171    @property
172    def slope_top_right_left(self) -> float | np.float32:
173        """Get the top slope from the right to the left.
174
175        Returns:
176            float: The slope.
177        """
178        return self._slope(
179            self.top_right_x,
180            self.top_right_y,
181            self.top_left_x,
182            self.top_left_y,
183        )

Get the top slope from the right to the left.

Returns:

float: The slope.

slope_left_top_bottom: float | numpy.float32
185    @property
186    def slope_left_top_bottom(self) -> float | np.float32:
187        """Get the left slope from the top to the bottom.
188
189        Returns:
190            float: The slope.
191        """
192        return self._slope(
193            self.top_left_x,
194            self.top_left_y,
195            self.bottom_left_x,
196            self.bottom_left_y,
197        )

Get the left slope from the top to the bottom.

Returns:

float: The slope.

slope_left_bottom_top: float | numpy.float32
199    @property
200    def slope_left_bottom_top(self) -> float | np.float32:
201        """Get the left slope from the bottom to the top.
202
203        Returns:
204            float: The slope.
205        """
206        return self._slope(
207            self.bottom_left_x,
208            self.bottom_left_y,
209            self.top_left_x,
210            self.top_left_y,
211        )

Get the left slope from the bottom to the top.

Returns:

float: The slope.

slope_bottom_left_right: float | numpy.float32
213    @property
214    def slope_bottom_left_right(self) -> float | np.float32:
215        """Get the bottom slope from the left to the right.
216
217        Returns:
218            float: The slope.
219        """
220        return self._slope(
221            self.bottom_left_x,
222            self.bottom_left_y,
223            self.bottom_right_x,
224            self.bottom_right_y,
225        )

Get the bottom slope from the left to the right.

Returns:

float: The slope.

slope_bottom_right_left: float | numpy.float32
227    @property
228    def slope_bottom_right_left(self) -> float | np.float32:
229        """Get the bottom slope from the right to the left.
230
231        Returns:
232            float: The slope.
233        """
234        return self._slope(
235            self.bottom_right_x,
236            self.bottom_right_y,
237            self.bottom_left_x,
238            self.bottom_left_y,
239        )

Get the bottom slope from the right to the left.

Returns:

float: The slope.

slope_right_top_bottom: float | numpy.float32
241    @property
242    def slope_right_top_bottom(self) -> float | np.float32:
243        """Get the right slope from the top to the bottom.
244
245        Returns:
246            float: The slope.
247        """
248        return self._slope(
249            self.top_right_x,
250            self.top_right_y,
251            self.bottom_right_x,
252            self.bottom_right_y,
253        )

Get the right slope from the top to the bottom.

Returns:

float: The slope.

slope_right_bottom_top: float | numpy.float32
255    @property
256    def slope_right_bottom_top(self) -> float | np.float32:
257        """Get the right slope from the bottom to the top.
258
259        Returns:
260            float: The slope.
261        """
262        return self._slope(
263            self.bottom_right_x,
264            self.bottom_right_y,
265            self.top_right_x,
266            self.top_right_y,
267        )

Get the right slope from the bottom to the top.

Returns:

float: The slope.

is_horizontal_level: bool
269    @property
270    def is_horizontal_level(self) -> bool:
271        """Check whether the left-to-right slope is approximately horizontal.
272
273        Returns:
274            bool: True if the item is approximately horizontal.
275        """
276        # -0.1 -> 0.1 is strictly horizontal
277        # give a bit of buffer
278        buffer = 0.09
279        return bool(-buffer <= self.slope_top_left_right <= buffer)

Check whether the left-to-right slope is approximately horizontal.

Returns:

bool: True if the item is approximately horizontal.

is_vertical_level: bool
281    @property
282    def is_vertical_level(self) -> bool:
283        """Check whether the top-to-bottom slope is approximately vertical.
284
285        Returns:
286            bool: True if the item is approximately vertical.
287        """
288        # -0.1 -> 0.1 is strictly vertical
289        # give a bit of buffer
290        return bool(self.slope_left_top_bottom == math.inf)

Check whether the top-to-bottom slope of the left side is exactly vertical.

Returns:

bool: True if the slope from the top left point to the bottom left point is positive infinity; no tolerance is applied.

@classmethod
def save( cls, path: pathlib.Path, items: list[TextItem]) -> None:
292    @classmethod
293    def save(cls, path: pathlib.Path, items: list[TextItem]) -> None:
294        """Save found text items to a file.
295
296        Args:
297            path: Write the items to this file.
298            items: The items to save.
299
300        Returns:
301            None
302        """
303        logger.debug("Saving %s OCR output items.", len(items))
304
305        fields = [
306            "text",
307            "line_number",
308            "line_order",
309            "top_left_x",
310            "top_left_y",
311            "top_right_x",
312            "top_right_y",
313            "bottom_right_x",
314            "bottom_right_y",
315            "bottom_left_x",
316            "bottom_left_y",
317        ]
318        with path.open("w", newline="", encoding="utf8") as file_path:
319            writer = csv.DictWriter(file_path, fields)
320            writer.writeheader()
321            sorted_items = sorted(
322                items,
323                key=lambda i: (i.line_number or 0, i.line_order or 0),
324            )
325            writer.writerows([dataclasses.asdict(i) for i in sorted_items])
326
327        logger.debug("Saved OCR items to '%s'.", path)

Save found text items to a file.

Arguments:
  • path: Write the items to this file.
  • items: The items to save.
Returns:

None

@classmethod
def load( cls, path: pathlib.Path) -> Generator[TextItem, typing.Any, None]:
329    @classmethod
330    def load(cls, path: pathlib.Path) -> typing.Generator[TextItem, typing.Any, None]:
331        """Load found text items from a file.
332
333        Args:
334            path: The path to the file containing items.
335
336        Returns:
337            typing.Generator["TextItem", typing.Any, None]: Items from the file.
338        """
339        logger.debug("Loading OCR output items.")
340        count = 0
341
342        with path.open(encoding="utf8") as file_path:
343            reader = csv.DictReader(file_path)
344            for row in reader:
345                line_number = row.get("line_number", "").strip()
346                line_number = int(line_number) if line_number else None
347
348                line_order = row.get("line_order", "").strip()
349                line_order = int(line_order) if line_order else None
350
351                count += 1
352
353                yield TextItem(
354                    text=row["text"],
355                    line_number=line_number,
356                    line_order=line_order,
357                    top_left_x=float(row["top_left_x"]),
358                    top_left_y=float(row["top_left_y"]),
359                    top_right_x=float(row["top_right_x"]),
360                    top_right_y=float(row["top_right_y"]),
361                    bottom_right_x=float(row["bottom_right_x"]),
362                    bottom_right_y=float(row["bottom_right_y"]),
363                    bottom_left_x=float(row["bottom_left_x"]),
364                    bottom_left_y=float(row["bottom_left_y"]),
365                )
366
367        logger.debug("Loaded %s OCR items from '%s'.", count, path)

Load found text items from a file.

Arguments:
  • path: The path to the file containing items.
Returns:

typing.Generator["TextItem", typing.Any, None]: Items from the file.

@classmethod
def from_prediction( cls, prediction: tuple[typing.Any, typing.Any]) -> TextItem:
369    @classmethod
370    def from_prediction(
371        cls,
372        prediction: tuple[typing.Any, typing.Any],
373    ) -> TextItem:
374        """Convert from (text, box) to item.
375
376        Box is (top left, top right, bottom right, bottom left).
377        Its structure is [[startX,startY], [endX,startY], [endX,endY], [startX, endY]].
378
379        Args:
380            prediction: The text recognised in an image.
381
382        Returns:
383            TextItem: A text item representing the recognised text.
384        """
385        (
386            text,
387            (
388                (top_left_x, top_left_y),
389                (top_right_x, top_right_y),
390                (bottom_right_x, bottom_right_y),
391                (bottom_left_x, bottom_left_y),
392            ),
393        ) = prediction
394        return TextItem(
395            text=text,
396            top_left_x=top_left_x,
397            top_left_y=top_left_y,
398            top_right_x=top_right_x,
399            top_right_y=top_right_y,
400            bottom_right_x=bottom_right_x,
401            bottom_right_y=bottom_right_y,
402            bottom_left_x=bottom_left_x,
403            bottom_left_y=bottom_left_y,
404        )

Convert from (text, box) to item.

Box is (top left, top right, bottom right, bottom left). Its structure is [[startX,startY], [endX,startY], [endX,endY], [startX, endY]].

Arguments:
  • prediction: The text recognised in an image.
Returns:

TextItem: A text item representing the recognised text.

@classmethod
def order_text_lines( cls, items: list[TextItem]) -> list[list[TextItem]]:
406    @classmethod
407    def order_text_lines(
408        cls,
409        items: list[TextItem],
410    ) -> list[list[TextItem]]:
411        """Put items into lines of text (top -> bottom, left -> right)."""
412        if not items:
413            items = []
414
415        logger.debug("Arranging text into lines.")
416
417        lines = []
418        current_line: list[TextItem] = []
419        for item in items:
420            if not item.is_horizontal_level:
421                # exclude items that are too sloped
422                continue
423
424            if len(current_line) < 1 or any(item.is_same_line(i) for i in current_line):
425                current_line.append(item)
426
427            elif len(current_line) > 0:
428                # store current line
429                current_line = sorted(current_line, key=lambda x: x.top_left)
430                lines.append(current_line)
431
432                # create new line
433                current_line = [item]
434
435        # include last items
436        if len(current_line) > 0:
437            lines.append(current_line)
438
439        # update items to set line number and line order
440        for line_index, line in enumerate(lines):
441            for item_index, item in enumerate(line):
442                item.line_number = line_index + 1
443                item.line_order = item_index + 1
444
445        return lines

Put items into lines of text (top -> bottom, left -> right).

to_prediction: tuple[str, tuple[tuple[float | numpy.float32, float | numpy.float32], tuple[float | numpy.float32, float | numpy.float32], tuple[float | numpy.float32, float | numpy.float32], tuple[float | numpy.float32, float | numpy.float32]]]
447    @property
448    def to_prediction(
449        self,
450    ) -> tuple[
451        str,
452        tuple[
453            tuple[float | np.float32, float | np.float32],
454            tuple[float | np.float32, float | np.float32],
455            tuple[float | np.float32, float | np.float32],
456            tuple[float | np.float32, float | np.float32],
457        ],
458    ]:
459        """Convert to prediction format."""
460        return (
461            self.text,
462            (
463                (self.top_left_x, self.top_left_y),
464                (self.top_right_x, self.top_right_y),
465                (self.bottom_right_x, self.bottom_right_y),
466                (self.bottom_left_x, self.bottom_left_y),
467            ),
468        )

Convert to prediction format.

@beartype
@dataclasses.dataclass
class KerasOcrResult:
@beartype
@dataclasses.dataclass
class KerasOcrResult:
    """Result from running keras-ocr.

    Holds three file system paths and the recognised text items. Field types
    are checked at runtime by the ``beartype`` decorator.
    """

    # NOTE(review): presumably the directory the OCR output was written to;
    # confirm against the code that builds this result.
    output_dir: pathlib.Path
    # NOTE(review): presumably the annotated image file; confirm.
    annotations_file: pathlib.Path
    # NOTE(review): presumably the saved text items file; confirm.
    predictions_file: pathlib.Path
    # A list of lists of text items; the same shape that
    # TextItem.order_text_lines returns (one inner list per line of text).
    items: list[list[TextItem]]

Result from running keras-ocr.

KerasOcrResult( output_dir: pathlib.Path, annotations_file: pathlib.Path, predictions_file: pathlib.Path, items: list[list[TextItem]])
output_dir: pathlib.Path
annotations_file: pathlib.Path
predictions_file: pathlib.Path
items: list[list[TextItem]]