Unverified Commit 6e47db65 authored by dmMaze's avatar dmMaze Committed by GitHub
Browse files

Merge pull request #200 from tak2hu/dev

Add Windows' and MacOS' OCR, Working But With Small Bugs
parents a92e54d8 a6982787
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -36,3 +36,4 @@ config/config.json
*.7z
venv
pip
.DS_Store
 No newline at end of file
+106 −6
Original line number Diff line number Diff line
@@ -246,10 +246,110 @@ class OCRMIT48pxCTC(OCRBase):
        self.chunk_size = chunk_size
        self.model.max_chunk_size = chunk_size
    
import platform
# The macOS OCR backend is only registered on macOS 10.15 or newer.
# Version components are compared numerically: as plain strings,
# '10.9' >= '10.15' is True, which would wrongly enable the backend on older
# systems. When mac_ver() reports an empty version string the parsed tuple is
# empty and compares below (10, 15), so the backend stays disabled.
if tuple(int(part) for part in platform.mac_ver()[0].split('.') if part.isdigit()) >= (10, 15):
    from .macos_ocr import get_supported_languages
    # Shared AppleOCR instance; stays None until an OCR object creates it.
    APPLEVISIONFRAMEWORK = None
    @register_OCR('macos_ocr')
    class OCRApple(OCRBase):
        """OCR backend that delegates recognition to an ``AppleOCR`` object.

        A single ``AppleOCR`` instance is shared through the module-level
        ``APPLEVISIONFRAMEWORK`` global.
        """

        params = {
            'language': {
                'type':'selector',
                'options': list(get_supported_languages()[0]),
                'select': 'en-US',
            },
            # A 'recognition_level' selector (options 'accurate' / 'fast',
            # default 'accurate') is intentionally disabled: it does appear,
            # but changing it does not update the available languages, and
            # each recognition level has its own set of available languages.
            'confidence_level': '0.1',
        }
        # Class-level defaults mirroring the initial parameter values.
        language = 'en-US'
        recognition = 'accurate'
        confidence = '0.1'

        def setup_ocr(self):
            """Bind ``self.model`` to the shared AppleOCR, creating it if needed."""
            global APPLEVISIONFRAMEWORK
            from .macos_ocr import AppleOCR
            if APPLEVISIONFRAMEWORK is None:
                APPLEVISIONFRAMEWORK = AppleOCR(lang=[self.language])
            self.model = APPLEVISIONFRAMEWORK

        def ocr_img(self, img: np.ndarray) -> str:
            """Run the model on ``img`` and return whatever it produces."""
            recognized = self.model(img)
            return recognized

        def ocr_blk_list(self, img: np.ndarray, blk_list: List[TextBlock]):
            """Fill ``blk.text`` for every block by running OCR on its crop.

            A block whose box is not strictly inside the image (or is empty)
            is skipped with a warning and gets ``['']`` as its text.
            """
            img_height, img_width = img.shape[:2]
            for block in blk_list:
                left, top, right, bottom = block.xyxy
                box_is_valid = (
                    0 < left < right < img_width
                    and 0 < top < bottom < img_height
                )
                if not box_is_valid:
                    logging.warning('invalid textbbox to target img')
                    block.text = ['']
                    continue
                block.text = self.model(img[top:bottom, left:right])

        def updateParam(self, param_key: str, param_content):
            """Store the parameter, then push language and confidence to the model."""
            super().updateParam(param_key, param_content)

            selected_language = self.params['language']['select']
            self.language = selected_language
            self.model.lang = [selected_language]

            # Syncing a 'recognition_level' parameter (and refreshing the
            # language options for it) is disabled together with the selector.

            threshold = self.params['confidence_level']
            self.confidence = threshold
            self.model.min_confidence = threshold

# The Windows OCR backend is only registered on Windows 10 build 10240 or
# newer. Version components are compared numerically: as plain strings, a
# Windows 8 version such as '6.2.9200' sorts above '10.0.10240.0' and would
# wrongly enable the backend.
if platform.system() == 'Windows' and tuple(
        int(part) for part in platform.version().split('.') if part.isdigit()) >= (10, 0, 10240):
    from .windows_ocr import get_supported_language_packs

    # Query the language packs once so the two lists below are built from the
    # same snapshot and stay index-aligned (display name i <-> tag i).
    _language_packs = get_supported_language_packs()
    languages_display_name = [lang.display_name for lang in _language_packs]
    languages_tag = [lang.language_tag for lang in _language_packs]
    # Shared WindowsOCR instance; stays None until an OCR object creates it.
    WINDOWSOCRENGINE = None
    @register_OCR('windows_ocr')
    class OCRWindows(OCRBase):
        """OCR backend that delegates recognition to a ``WindowsOCR`` object.

        A single ``WindowsOCR`` instance is shared through the module-level
        ``WINDOWSOCRENGINE`` global. Languages are presented by display name
        and translated to language tags via the parallel module-level lists
        ``languages_display_name`` / ``languages_tag``.
        """

        params = {
            'language': {
                'type':'selector',
                'options': languages_display_name,
                'select': languages_display_name[0],
            }
        }
        # Class-level default mirroring the initial selector value.
        language = languages_display_name[0]

        def setup_ocr(self):
            """Bind ``self.engine`` to the shared WindowsOCR, creating it if needed."""
            global WINDOWSOCRENGINE
            from .windows_ocr import WindowsOCR
            if WINDOWSOCRENGINE is None:
                self.engine = WINDOWSOCRENGINE = WindowsOCR()
            else:
                self.engine = WINDOWSOCRENGINE

        def ocr_img(self, img: np.ndarray) -> str:
            """Run the engine on ``img`` and return the recognized text."""
            # The result must be returned; previously it was computed and
            # dropped, so callers always received None.
            return self.engine(img)

        def ocr_blk_list(self, img: np.ndarray, blk_list: List[TextBlock]) -> None:
            """Fill ``blk.text`` for every block by running OCR on its crop.

            A block whose box is not strictly inside the image (or is empty)
            is skipped with a warning and gets ``['']`` as its text.
            """
            im_h, im_w = img.shape[:2]
            for blk in blk_list:
                x1, y1, x2, y2 = blk.xyxy
                if y2 < im_h and x2 < im_w and \
                    x1 > 0 and y1 > 0 and x1 < x2 and y1 < y2:
                    blk.text = self.engine(img[y1:y2, x1:x2])
                else:
                    logging.warning('invalid textbbox to target img')
                    blk.text = ['']

        def updateParam(self, param_key: str, param_content):
            """Store the parameter, then push the selected language to the engine."""
            super().updateParam(param_key, param_content)
            self.language = self.params['language']['select']
            # The selector holds display names; the engine expects the
            # language tag found at the same index of the parallel tag list.
            tag_name = languages_tag[languages_display_name.index(self.language)]
            self.engine.lang = tag_name
 No newline at end of file
+86 −0
Original line number Diff line number Diff line
import Vision
import objc
import platform
from typing import Tuple
import numpy as np
from PIL import Image
from io import BytesIO

def get_revision_level():
    """Return the ``VNRecognizeTextRequest`` revision for the running macOS.

    Returns:
        ``Vision.VNRecognizeTextRequestRevision3`` for macOS 13 and newer,
        ``Vision.VNRecognizeTextRequestRevision2`` for macOS 10.16 / 11 and
        newer, ``Vision.VNRecognizeTextRequestRevision1`` for macOS 10.15.

    Raises:
        RuntimeError: if the reported macOS version is older than 10.15 or
            empty. Previously this case failed with an UnboundLocalError.
    """
    reported = platform.mac_ver()[0]
    # Compare numeric components: as plain strings, '10.9' >= '10.15' is True
    # and an old release would be mistaken for a supported one.
    ver = tuple(int(part) for part in reported.split('.') if part.isdigit())
    if ver < (10, 15):
        raise RuntimeError(
            f'Vision text recognition requires macOS 10.15 or newer '
            f'(detected version: {reported!r})'
        )

    with objc.autorelease_pool():
        if ver >= (13,):
            return Vision.VNRecognizeTextRequestRevision3
        # python might return 10.16 instead of 11.0 for Big Sur and above
        if ver >= (10, 16):
            return Vision.VNRecognizeTextRequestRevision2
        return Vision.VNRecognizeTextRequestRevision1

def get_supported_languages(recognition_level='accurate', revision=get_revision_level()) -> Tuple[Tuple[str], Tuple[str]]:
    """Ask the Vision framework which languages its text recognizer supports.

    Args:
        recognition_level: ``'fast'`` selects recognition level 1; any other
            value selects level 0.
        revision: request revision handed to Vision. The default is computed
            once, when this function is defined.

    Returns: Tuple of ((language code), (error))
    """
    level_code = 1 if recognition_level == 'fast' else 0
    request_cls = Vision.VNRecognizeTextRequest
    return request_cls.supportedRecognitionLanguagesForTextRecognitionLevel_revision_error_(
        level_code, revision, None
    )

def text_from_image(image: np.ndarray, language_preference=None, recognition_level='accurate'):
    """Run Vision text recognition on an image array.

    Args:
        image: array accepted by ``PIL.Image.fromarray``; it is PNG-encoded
            in memory before being handed to Vision.
        language_preference: value passed to ``setRecognitionLanguages_``.
            ``None`` or the string ``'Auto'`` leaves the request untouched.
        recognition_level: ``'fast'`` (case-insensitive) selects recognition
            level 1; any other value selects level 0.

    Returns:
        A list of ``(text, confidence)`` tuples, one per recognition result.
        The list is empty when the request fails or produces no results.
    """
    recognition_level = recognition_level.lower()
    if language_preference == 'Auto':
        language_preference = None

    img_buf = BytesIO()
    Image.fromarray(image).save(img_buf, format='PNG')

    with objc.autorelease_pool():
        req = Vision.VNRecognizeTextRequest.alloc().init()
        req.setRecognitionLevel_(1 if recognition_level == 'fast' else 0)

        if language_preference is not None:
            req.setRecognitionLanguages_(language_preference)

        handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(
            img_buf.getvalue(), None
        )

        try:
            outcome = handler.performRequests_error_([req], None)
            # PyObjC reports methods with an error out-parameter as a
            # (return value, error) tuple; a non-empty tuple is always truthy,
            # so the success flag has to be unpacked before it is tested.
            success = outcome[0] if isinstance(outcome, tuple) else outcome

            res = []
            if success:
                # results() may be None when nothing was recognized.
                # Only text and confidence are collected; bounding boxes
                # are not part of the returned tuples.
                for result in req.results() or ():
                    res.append((result.text(), result.confidence()))
        finally:
            # Release the Vision objects explicitly, even when recognition
            # raised, so they are not leaked.
            req.dealloc()
            handler.dealloc()

        return res


class AppleOCR:
    """Callable OCR wrapper that filters recognition results by confidence.

    Attributes are plain and may be reassigned between calls:
        lang: languages forwarded to ``text_from_image``.
        recog_level: recognition level forwarded to ``text_from_image``.
        min_confidence: minimum confidence (anything ``float()`` accepts)
            a result needs in order to be kept.
    """

    def __init__(self, lang=None, recog_level='accurate', min_confidence='0.1'):
        # A fresh list per instance: a mutable ``[]`` default would be shared
        # by every instance created without an explicit ``lang``.
        self.lang = [] if lang is None else lang
        self.recog_level = recog_level
        self.min_confidence = min_confidence

    def __call__(self, img: np.ndarray) -> str:
        """Recognize text in ``img`` and return the kept lines joined by newlines.

        Raises:
            ValueError: if there are results and ``min_confidence`` cannot be
                converted to ``float``.
        """
        results = text_from_image(img, self.lang, self.recog_level)
        if not results:
            return ''
        # Convert once instead of once per result.
        threshold = float(self.min_confidence)
        return '\n'.join(entry[0] for entry in results if entry[1] >= threshold)
 No newline at end of file
+28 −0
Original line number Diff line number Diff line
# https://learn.microsoft.com/en-us/windows/powertoys/text-extractor#how-to-query-for-ocr-language-packs
from winsdk.windows.media.ocr import OcrEngine
from winsdk.windows.globalization import Language
from winsdk.windows.storage.streams import DataWriter
from winsdk.windows.graphics.imaging import SoftwareBitmap, BitmapPixelFormat

import numpy as np
import cv2, asyncio

def get_supported_language_packs():
    """Return ``OcrEngine.available_recognizer_languages`` as a list."""
    available = OcrEngine.available_recognizer_languages
    return list(available)

def ocr(byte, width, height, lang='en'):
    """Start Windows OCR on raw pixel data and return the pending operation.

    Args:
        byte: pixel bytes, interpreted as RGBA8.
        width: bitmap width in pixels.
        height: bitmap height in pixels.
        lang: language tag used to create the OCR engine.

    Returns:
        The object returned by ``recognize_async``; it has to be awaited to
        obtain the recognition result.

    Raises:
        RuntimeError: if no OCR engine can be created for ``lang``.
    """
    # try_create_from_language yields None for a language without OCR support;
    # fail with a clear message instead of an AttributeError on None.
    engine = OcrEngine.try_create_from_language(Language(lang))
    if engine is None:
        raise RuntimeError(
            f'Windows OCR engine is not available for language {lang!r}; '
            'check that the matching OCR language pack is installed'
        )

    writer = DataWriter()
    writer.write_bytes(byte)
    sb = SoftwareBitmap.create_copy_from_buffer(writer.detach_buffer(), BitmapPixelFormat.RGBA8, width, height)
    return engine.recognize_async(sb)

async def coroutine(awaitable):
    """Await ``awaitable`` and return its result.

    Wraps an arbitrary awaitable in a coroutine object.
    """
    outcome = await awaitable
    return outcome

class WindowsOCR:
    """Callable that runs Windows OCR on an image array and returns its text."""

    # Default language tag: the first available recognizer language,
    # looked up once when the class body is executed.
    lang = get_supported_language_packs()[0].language_tag

    def __call__(self, img: np.ndarray) -> str:
        """Convert ``img`` to RGBA, run OCR with ``self.lang``, return ``.text``."""
        rgba = cv2.cvtColor(img, cv2.COLOR_BGR2RGBA)
        height, width = rgba.shape[0], rgba.shape[1]
        pending = ocr(rgba.tobytes(), width, height, self.lang)
        recognized = asyncio.run(coroutine(pending))
        return recognized.text
 No newline at end of file
+7 −1
Original line number Diff line number Diff line
@@ -37,4 +37,10 @@ openai
pyyaml
httpx[socks,brotli]
langdetect
# pywin32 required on windows
pywin32; sys_platform == 'win32'
winsdk; sys_platform == 'win32'
pyobjc-core; sys_platform == 'darwin'
pyobjc-framework-cocoa; sys_platform == 'darwin'
pyobjc-framework-coreml; sys_platform == 'darwin'
pyobjc-framework-quartz; sys_platform == 'darwin'
pyobjc-framework-vision; sys_platform == 'darwin'
 No newline at end of file
Loading