Commit acc3a66b authored by narugo1992's avatar narugo1992
Browse files

dev(narugo): add paddle ocr

parent 8c204738
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
from .entry import detect_text_with_ocr, ocr

imgutils/ocr/detect.py

0 → 100644
+159 −0
Original line number Diff line number Diff line
from functools import lru_cache

import cv2
import numpy as np
import pyclipper
from huggingface_hub import hf_hub_download
from shapely import Polygon

from ..data import ImageTyping
from ..utils import open_onnx_model

# Minimum short-side length of a candidate's minimum-area rectangle; candidates
# below it (or below ``_MIN_SIZE + 2`` after unclipping) are skipped in
# ``_boxes_from_bitmap``.
_MIN_SIZE = 3


@lru_cache()
def _open_ocr_detection_model(model):
    """
    Open the ONNX text-detection model belonging to ``model``.

    The file ``{model}/detection.onnx`` is obtained from the
    ``deepghs/paddleocr`` repository via ``hf_hub_download`` and then handed
    to ``open_onnx_model``. The result is memoized per ``model`` value by
    ``lru_cache``.

    :param model: Name of the model directory inside the repository.
    :return: The object returned by ``open_onnx_model`` for the downloaded file.
    """
    onnx_path = hf_hub_download('deepghs/paddleocr', f'{model}/detection.onnx')
    return open_onnx_model(onnx_path)


def _box_score_fast(bitmap, _box):
    """
    Compute the mean value of ``bitmap`` inside the polygon ``_box``.

    The polygon's axis-aligned bounding rectangle is clipped to the bitmap,
    the polygon is rasterised into a mask of that rectangle's size, and the
    masked mean of the corresponding bitmap region is returned.

    :param bitmap: 2-D (or higher) array; only its first two dimensions are
        used as height and width.
    :param _box: Array of ``(x, y)`` points, indexed as ``_box[:, 0]`` /
        ``_box[:, 1]``. It is copied, so the caller's array is not modified.
    :return: First channel of ``cv2.mean`` over the masked region.
    """
    rows, cols = bitmap.shape[:2]
    polygon = _box.copy()

    # Integer bounding rectangle of the polygon, kept inside the bitmap.
    left = np.clip(np.floor(polygon[:, 0].min()).astype("int32"), 0, cols - 1)
    right = np.clip(np.ceil(polygon[:, 0].max()).astype("int32"), 0, cols - 1)
    top = np.clip(np.floor(polygon[:, 1].min()).astype("int32"), 0, rows - 1)
    bottom = np.clip(np.ceil(polygon[:, 1].max()).astype("int32"), 0, rows - 1)

    # Move the polygon into the local coordinates of that rectangle.
    polygon[:, 0] = polygon[:, 0] - left
    polygon[:, 1] = polygon[:, 1] - top

    region_mask = np.zeros((bottom - top + 1, right - left + 1), dtype=np.uint8)
    # noinspection PyTypeChecker
    cv2.fillPoly(region_mask, polygon.reshape(1, -1, 2).astype("int32"), 1)

    region = bitmap[top:bottom + 1, left:right + 1]
    return cv2.mean(region, region_mask)[0]


def _unclip(box, unclip_ratio):
    """
    Grow a polygon outwards by an offset derived from its area and perimeter.

    The offset distance is ``area * unclip_ratio / perimeter``; the polygon is
    offset with ``pyclipper`` using round joins.

    :param box: Sequence of polygon vertices accepted by both
        ``shapely.Polygon`` and ``PyclipperOffset.AddPath``.
    :param unclip_ratio: Multiplier applied to ``area / perimeter``.
    :return: ``np.array`` built from the paths returned by
        ``PyclipperOffset.Execute``.
    """
    polygon = Polygon(box)
    grow_by = polygon.area * unclip_ratio / polygon.length

    offsetter = pyclipper.PyclipperOffset()
    offsetter.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
    return np.array(offsetter.Execute(grow_by))


def _get_mini_boxes(contour):
    """
    Return the minimum-area rectangle of ``contour`` with ordered corners.

    The four corners from ``cv2.boxPoints`` are sorted by x. Within the two
    lowest-x corners and within the two highest-x corners, the one with the
    smaller y comes first. The output order is: low-x/small-y,
    high-x/small-y, high-x/large-y, low-x/large-y.

    :param contour: Point set accepted by ``cv2.minAreaRect``.
    :return: Tuple ``(corners, short_side)`` where ``corners`` is a list of
        four points and ``short_side`` is the smaller of the rectangle's two
        side lengths.
    """
    rect = cv2.minAreaRect(contour)
    corners = sorted(list(cv2.boxPoints(rect)), key=lambda pt: pt[0])
    low_a, low_b, high_a, high_b = corners

    # Order the two lowest-x corners by y.
    if low_b[1] > low_a[1]:
        first, fourth = low_a, low_b
    else:
        first, fourth = low_b, low_a

    # Order the two highest-x corners by y.
    if high_b[1] > high_a[1]:
        second, third = high_a, high_b
    else:
        second, third = high_b, high_a

    return [first, second, third, fourth], min(rect[1])


def _boxes_from_bitmap(pred, _bitmap, dest_width, dest_height,
                       box_threshold=0.7, max_candidates=1000, unclip_ratio=2.0):
    """
    Extract scored quadrilateral boxes from a binarised heat map.

    Contours are found on ``_bitmap``; each contour is reduced to its
    minimum-area rectangle, scored against ``pred``, expanded with
    ``_unclip``, reduced to a minimum-area rectangle again and finally
    rescaled from bitmap coordinates to ``dest_width`` x ``dest_height``.

    :param pred: Score map passed to ``_box_score_fast`` for box scoring.
    :param _bitmap: 2-D mask; it is multiplied by 255 and cast to ``uint8``
        before contour extraction.
    :param dest_width: Width of the coordinate system the boxes are scaled to.
    :param dest_height: Height of the coordinate system the boxes are scaled to.
    :param box_threshold: Boxes whose score is below this value are dropped.
    :param max_candidates: At most this many contours are examined.
    :param unclip_ratio: Expansion ratio forwarded to ``_unclip``.
    :return: Tuple ``(boxes, scores)``; ``boxes`` is an ``int32`` array of the
        kept boxes (four ``(x, y)`` corners each) and ``scores`` is the list
        of their scores in the same order.
    """
    bitmap = _bitmap
    height, width = bitmap.shape

    outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    # OpenCV 3.x returns (image, contours, hierarchy) while 4.x returns
    # (contours, hierarchy); the contours are second-to-last in both layouts.
    contours = outs[-2]

    boxes = []
    scores = []
    # ``max(..., 0)`` keeps a negative ``max_candidates`` meaning "no candidates".
    for contour in contours[:max(max_candidates, 0)]:
        points, sside = _get_mini_boxes(contour)
        if sside < _MIN_SIZE:
            continue
        points = np.array(points)
        score = _box_score_fast(pred, points.reshape(-1, 2))
        if box_threshold > score:
            continue

        # Expand the rectangle, then fit a rectangle around the expanded shape.
        box = _unclip(points, unclip_ratio).reshape(-1, 1, 2)
        box, sside = _get_mini_boxes(box)
        if sside < _MIN_SIZE + 2:
            continue
        box = np.array(box)

        # Rescale from bitmap coordinates to the destination size and clamp.
        box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width)
        box[:, 1] = np.clip(np.round(box[:, 1] / height * dest_height), 0, dest_height)
        boxes.append(box.astype("int32"))
        scores.append(score)
    return np.array(boxes, dtype="int32"), scores


def _normalize(data, mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711)):
    mean, std = np.asarray(mean), np.asarray(std)
    return (data - mean[None, :, None, None]) / std[None, :, None, None]


# ``_get_text_points`` pads the input's width and height up to the next
# multiple of this value before running the detection model.
_ALIGN = 64


def _get_text_points(image: ImageTyping, model: str = 'ch_PP-OCRv4_det_infer',
                     heat_threshold: float = 0.3, box_threshold: float = 0.7,
                     max_candidates: int = 1000, unclip_ratio: float = 2.0):
    """
    Run the detection model on ``image`` and return scored text polygons.

    The image is converted to a channel-first float array in ``[0, 1]``,
    padded on the bottom/right so both sides are multiples of ``_ALIGN``,
    normalised and fed to the ONNX model. The resulting heat map is cropped
    back to the original size and converted to boxes by ``_boxes_from_bitmap``.

    :param image: Image object exposing ``.size``; its array form must have
        the channel axis last (it is transposed with ``(2, 0, 1)``).
    :param model: Name of the detection model to load.
    :param heat_threshold: Heat-map values ``>=`` this are treated as text.
    :param box_threshold: Minimum score for a box to be kept.
    :param max_candidates: Maximum number of contours examined.
    :param unclip_ratio: Expansion ratio applied to each detected box.
    :return: List of ``(points, score)`` tuples.
    """
    origin_width, origin_height = image.size
    # Round each side up to the next multiple of ``_ALIGN``.
    width = origin_width + (-origin_width) % _ALIGN
    height = origin_height + (-origin_height) % _ALIGN

    pixels = np.array(image).transpose((2, 0, 1)).astype(np.float32) / 255.0
    padding = ((0, 0), (0, 0), (0, height - origin_height), (0, width - origin_width))
    # noinspection PyTypeChecker
    batch = np.pad(pixels[None, ...], padding)

    session = _open_ocr_detection_model(model)

    batch = _normalize(batch).astype(np.float32)
    input_name = session.get_inputs()[0].name
    output_name = session.get_outputs()[0].name
    output_, = session.run([output_name], {input_name: batch})

    # Take the first map of the first batch item and drop the padded area.
    heatmap = output_[0][0][:origin_height, :origin_width]

    boxes, scores = _boxes_from_bitmap(
        heatmap, heatmap >= heat_threshold, origin_width, origin_height,
        box_threshold, max_candidates, unclip_ratio,
    )
    return list(zip(boxes, scores))


def _detect_text(image: ImageTyping, model: str = 'ch_PP-OCRv4_det_infer',
                 heat_threshold: float = 0.3, box_threshold: float = 0.7,
                 max_candidates: int = 1000, unclip_ratio: float = 2.0):
    """
    Detect text regions and return them as axis-aligned bounding boxes.

    Each polygon from ``_get_text_points`` is reduced to the rectangle
    spanned by its minimum and maximum x / y coordinates.

    :param image: Image forwarded to ``_get_text_points``.
    :param model: Name of the detection model to load.
    :param heat_threshold: Heat-map threshold forwarded to ``_get_text_points``.
    :param box_threshold: Minimum box score forwarded to ``_get_text_points``.
    :param max_candidates: Contour limit forwarded to ``_get_text_points``.
    :param unclip_ratio: Expansion ratio forwarded to ``_get_text_points``.
    :return: List of ``((x0, y0, x1, y1), 'text', score)`` tuples.
    """
    detections = _get_text_points(image, model, heat_threshold, box_threshold, max_candidates, unclip_ratio)

    results = []
    for points, score in detections:
        xs, ys = points[:, 0], points[:, 1]
        bbox = (xs.min().item(), ys.min().item(), xs.max().item(), ys.max().item())
        results.append((bbox, 'text', score))

    return results

imgutils/ocr/entry.py

0 → 100644
+39 −0
Original line number Diff line number Diff line
from typing import List, Tuple

from .detect import _detect_text
from .recognize import _text_recognize
from ..data import ImageTyping, load_image

# Model name used by ``detect_text_with_ocr`` and ``ocr`` when the caller does
# not pass ``model`` explicitly.
_DEFAULT_MODEL = 'ch_PP-OCRv4_det_infer'


def detect_text_with_ocr(image: ImageTyping, model: str = _DEFAULT_MODEL,
                         heat_threshold: float = 0.3, box_threshold: float = 0.7,
                         max_candidates: int = 1000, unclip_ratio: float = 2.0) \
        -> List[Tuple[Tuple[int, int, int, int], str, float]]:
    """
    Detect text regions in an image.

    :param image: Image to analyse; it is normalised with ``load_image``
        first, the same way ``ocr`` does.
    :param model: Name of the detection model to load.
    :param heat_threshold: Heat-map values at or above this are treated as text.
    :param box_threshold: Minimum score for a detected box to be kept.
    :param max_candidates: Maximum number of contours examined.
    :param unclip_ratio: Expansion ratio applied to each detected box.
    :return: List of ``((x0, y0, x1, y1), 'text', score)`` tuples, sorted by
        ``score`` in descending order.
    """
    # The detector reads ``image.size`` directly, so the input has to be
    # turned into an image object before it is passed on.
    image = load_image(image)
    detections = _detect_text(image, model, heat_threshold, box_threshold, max_candidates, unclip_ratio)
    retval = [(box, 'text', score) for box, _, score in detections]
    retval.sort(key=lambda x: x[2], reverse=True)
    return retval


def ocr(image: ImageTyping, model: str = _DEFAULT_MODEL,
        heat_threshold: float = 0.3, box_threshold: float = 0.7,
        max_candidates: int = 1000, unclip_ratio: float = 2.0,
        is_remove_duplicate: bool = False):
    """
    Detect text regions in an image and recognise the text in each of them.

    :param image: Image to analyse; it is normalised with ``load_image``.
    :param model: Model name used for both detection and recognition.
    :param heat_threshold: Heat-map values at or above this are treated as text.
    :param box_threshold: Minimum score for a detected box to be kept.
    :param max_candidates: Maximum number of contours examined.
    :param unclip_ratio: Expansion ratio applied to each detected box.
    :param is_remove_duplicate: Forwarded to ``_text_recognize``.
    :return: List of ``((x0, y0, x1, y1), text, score)`` tuples sorted by the
        detection ``score`` in descending order.
    """
    image = load_image(image)
    retval = []
    for (x0, y0, x1, y1), _, score in _detect_text(image, model, heat_threshold,
                                                   box_threshold, max_candidates, unclip_ratio):
        width, height = x1 - x0, y1 - y0
        area = image.crop((x0, y0, x1, y1))
        if height >= width * 1.5:
            # Tall regions are turned on their side before recognition.
            # ``expand=True`` is required: without it the rotated image keeps
            # the original (tall) canvas and most of the content is cut off.
            area = area.rotate(90, expand=True)

        text, _ = _text_recognize(area, model, is_remove_duplicate)
        retval.append(((x0, y0, x1, y1), text, score))

    retval.sort(key=lambda x: x[2], reverse=True)
    return retval
+75 −0
Original line number Diff line number Diff line
from functools import lru_cache
from typing import List, Tuple

import numpy as np
from huggingface_hub import hf_hub_download

from ..data import ImageTyping
from ..utils import open_onnx_model


@lru_cache()
def _open_ocr_recognition_model(model):
    """
    Open the ONNX text-recognition model belonging to ``model``.

    The file ``{model}/recognition.onnx`` is obtained from the
    ``deepghs/paddleocr`` repository via ``hf_hub_download`` and then handed
    to ``open_onnx_model``. The result is memoized per ``model`` value by
    ``lru_cache``.

    :param model: Name of the model directory inside the repository.
    :return: The object returned by ``open_onnx_model`` for the downloaded file.
    """
    onnx_path = hf_hub_download('deepghs/paddleocr', f'{model}/recognition.onnx')
    return open_onnx_model(onnx_path)


@lru_cache()
def _open_ocr_recognition_dictionary(model) -> List[str]:
    """
    Load the character dictionary that maps recognition indices to text.

    The file ``{model}/dict.txt`` is obtained from the ``deepghs/paddleocr``
    repository via ``hf_hub_download``. The result is memoized per ``model``
    value by ``lru_cache``, so callers share one list and must not mutate it.

    :param model: Name of the model directory inside the repository.
    :return: ``['<blank>', <one entry per file line, stripped>, ' ']``.
    """
    dict_path = hf_hub_download(
        'deepghs/paddleocr',
        f'{model}/dict.txt',
    )
    # Read as UTF-8 explicitly: relying on the platform default encoding
    # breaks (or silently mis-decodes) non-ASCII entries on systems whose
    # locale encoding is not UTF-8.
    with open(dict_path, 'r', encoding='utf-8') as f:
        dict_ = [line.strip() for line in f]

    return ['<blank>', *dict_, ' ']


def decode(text_index, model: str, text_prob=None, is_remove_duplicate=False):
    """
    Turn per-step character indices into text with a mean confidence.

    For every batch item, index ``0`` is dropped and, when
    ``is_remove_duplicate`` is set, so is every step whose index equals the
    previous step's index. The remaining indices are looked up in the
    model's dictionary and joined.

    :param text_index: Batch of 1-D index arrays (one per batch item).
    :param model: Model name whose dictionary is used for the lookup.
    :param text_prob: Optional batch of per-step confidences aligned with
        ``text_index``; when omitted every kept step counts as confidence 1.
    :param is_remove_duplicate: Collapse consecutive repeated indices.
    :return: List of ``(text, confidence)`` tuples, one per batch item;
        ``confidence`` is the mean over the kept steps, or ``0`` when no
        step was kept.
    """
    # Fetch the dictionary once instead of once per decoded character.
    charset = _open_ocr_recognition_dictionary(model)
    ignored_tokens = [0]

    retval = []
    for batch_idx, indices in enumerate(text_index):
        selection = np.ones(len(indices), dtype=bool)
        if is_remove_duplicate:
            selection[1:] = indices[1:] != indices[:-1]
        for ignored_token in ignored_tokens:
            selection &= indices != ignored_token

        text = ''.join(charset[text_id.item()] for text_id in indices[selection])

        if text_prob is not None:
            conf_list = text_prob[batch_idx][selection]
        else:
            # One placeholder confidence per *kept* step, so an empty result
            # reaches the zero-confidence fallback below, exactly as it does
            # when real probabilities are supplied.
            conf_list = [1] * int(selection.sum())
        if len(conf_list) == 0:
            conf_list = [0]

        retval.append((text, np.mean(conf_list).tolist()))

    return retval


def _text_recognize(image: ImageTyping, model: str = 'ch_PP-OCRv4_det_infer',
                    is_remove_duplicate: bool = False) -> Tuple[str, float]:
    """
    Recognise the text contained in a single cropped image.

    The image is resized to a height of 48 pixels with its aspect ratio
    preserved, scaled to ``[-1, 1]``, and run through the recognition model.
    The per-step arg-max indices and maxima are decoded with ``decode``.

    :param image: Image object exposing ``.width``, ``.height`` and
        ``.resize``; its array form must have the channel axis last.
    :param model: Name of the recognition model / dictionary to load.
    :param is_remove_duplicate: Forwarded to ``decode``.
    :return: ``(text, confidence)`` for the image.
    """
    target_height = 48
    # Integer arithmetic avoids the float round-trip ``int(h * (48 / h))``,
    # which can truncate to 47, and the floor of 1 keeps very narrow crops
    # from collapsing to a zero-width image.
    new_width = max(1, image.width * target_height // image.height)
    image = image.resize((new_width, target_height))

    input_ = np.array(image).transpose((2, 0, 1)).astype(np.float32) / 255.0

    # Map [0, 1] to [-1, 1] and add the batch axis.
    input_ = ((input_ - 0.5) / 0.5)[None, ...].astype(np.float32)
    _ort_session = _open_ocr_recognition_model(model)
    _input_name = _ort_session.get_inputs()[0].name
    _output_name = _ort_session.get_outputs()[0].name
    output, = _ort_session.run([_output_name], {_input_name: input_})

    indices = output.argmax(axis=2)
    confs = output.max(axis=2)
    return decode(indices, model, confs, is_remove_duplicate)[0]
+3 −1
Original line number Diff line number Diff line
@@ -9,3 +9,5 @@ pandas
scipy
emoji>=2.5.0
pilmoji>=1.3.0
shapely
pyclipper
 No newline at end of file