Loading modules/ocr/__init__.py +1 −139 Original line number Diff line number Diff line from typing import List import numpy as np from .base import OCRBase, register_OCR, DEVICE_SELECTOR, DEFAULT_DEVICE, TextBlock, OCR No newline at end of file from .model_32px import OCR32pxModel @register_OCR('mit32px') class OCRMIT32px(OCRBase): params = { 'chunk_size': { 'type': 'selector', 'options': [8, 16, 24, 32], 'value': 16 }, 'device': DEVICE_SELECTOR(), 'description': 'OCRMIT32px' } device = DEFAULT_DEVICE chunk_size = 16 download_file_list = [{ 'url': 'https://github.com/zyddnys/manga-image-translator/releases/download/beta-0.3/ocr.zip', 'files': ['ocr.ckpt'], 'sha256_pre_calculated': ['d9f619a9dccce8ce88357d1b17d25f07806f225c033ea42c64e86c45446cfe71'], 'save_files': ['data/models/mit32px_ocr.ckpt'], 'archived_files': 'ocr.zip', 'archive_sha256_pre_calculated': '47405638b96fa2540a5ee841a4cd792f25062c09d9458a973362d40785f95d7a', }] _load_model_keys = {'model'} def __init__(self, **params) -> None: super().__init__(**params) self.device = self.params['device']['value'] self.chunk_size = int(self.params['chunk_size']['value']) self.model: OCR32pxModel = None def _load_model(self): self.model = OCR32pxModel(r'data/models/mit32px_ocr.ckpt', self.device, self.chunk_size) def ocr_img(self, img: np.ndarray) -> str: return self.model.ocr_img(img) def _ocr_blk_list(self, img: np.ndarray, blk_list: List[TextBlock]): return self.model(img, blk_list) def updateParam(self, param_key: str, param_content): super().updateParam(param_key, param_content) device = self.params['device']['value'] chunk_size = int(self.params['chunk_size']['value']) if self.device != device: self.model.to(device) self.chunk_size = chunk_size self.model.max_chunk_size = chunk_size from .mit48px_ctc import OCR48pxCTC @register_OCR('mit48px_ctc') class OCRMIT48pxCTC(OCRBase): params = { 'chunk_size': { 'type': 'selector', 'options': [8,16,24,32], 'value': 16 }, 'device': DEVICE_SELECTOR(), 'description': 'mit48px_ctc' } device = DEFAULT_DEVICE chunk_size = 16 download_file_list = [{ 'url': 'https://github.com/zyddnys/manga-image-translator/releases/download/beta-0.3/ocr-ctc.zip', 'files': ['ocr-ctc.ckpt', 'alphabet-all-v5.txt'], 'sha256_pre_calculated': ['8b0837a24da5fde96c23ca47bb7abd590cd5b185c307e348c6e0b7238178ed89', None], 'save_files': ['data/models/mit48pxctc_ocr.ckpt', 'data/alphabet-all-v5.txt'], 'archived_files': 'ocr-ctc.zip', 'archive_sha256_pre_calculated': 'fc61c52f7a811bc72c54f6be85df814c6b60f63585175db27cb94a08e0c30101', }] _load_model_keys = {'model'} def __init__(self, **params) -> None: super().__init__(**params) self.device = self.params['device']['value'] self.chunk_size = int(self.params['chunk_size']['value']) self.model: OCR48pxCTC = None def _load_model(self): self.model = OCR48pxCTC(r'data/models/mit48pxctc_ocr.ckpt', self.device, self.chunk_size) def ocr_img(self, img: np.ndarray) -> str: return self.model.ocr_img(img) def _ocr_blk_list(self, img: np.ndarray, blk_list: List[TextBlock]): return self.model(img, blk_list) def updateParam(self, param_key: str, param_content): super().updateParam(param_key, param_content) device = self.params['device']['value'] chunk_size = int(self.params['chunk_size']['value']) if self.device != device: self.model.to(device) self.chunk_size = chunk_size self.model.max_chunk_size = chunk_size from .mit48px import Model48pxOCR OCR48PXMODEL_PATH = r'data/models/ocr_ar_48px.ckpt' @register_OCR('mit48px') class OCRMIT48px(OCRBase): params = { 'device': DEVICE_SELECTOR(), 'description': 'mit48px' } device = DEFAULT_DEVICE download_file_list = [{ 'url': 'https://huggingface.co/zyddnys/manga-image-translator/resolve/main/', 'files': [OCR48PXMODEL_PATH, 'data/alphabet-all-v7.txt'], 'sha256_pre_calculated': ['29daa46d080818bb4ab239a518a88338cbccff8f901bef8c9db191a7cb97671d', None], 'concatenate_url_filename': 2, }] _load_model_keys = {'model'} def __init__(self, **params) -> None: super().__init__(**params) self.device = self.params['device']['value'] self.model: Model48pxOCR = None def _load_model(self): self.model = Model48pxOCR(OCR48PXMODEL_PATH, self.device) def _ocr_blk_list(self, img: np.ndarray, blk_list: List[TextBlock]): return self.model(img, blk_list) def updateParam(self, param_key: str, param_content): super().updateParam(param_key, param_content) device = self.params['device']['value'] if self.device != device: self.model.to(device) No newline at end of file modules/ocr/base.py +3 −9 Original line number Diff line number Diff line Loading @@ -14,6 +14,7 @@ class OCRBase(BaseModule): _postprocess_hooks = OrderedDict() _preprocess_hooks = OrderedDict() _line_only: bool = False def __init__(self, **params) -> None: super().__init__(**params) Loading Loading @@ -41,17 +42,10 @@ class OCRBase(BaseModule): self._ocr_blk_list(img, blk_list) for callback_name, callback in self._postprocess_hooks.items(): callback(textblocks=blk_list, img=img, ocr_module=self) # for blk in blk_list: # if isinstance(blk.text, List): # for ii, t in enumerate(blk.text): # for callback in self.postprocess_hooks: # blk.text[ii] = callback(t, blk=blk) # else: # for callback in self.postprocess_hooks: # blk.text = callback(blk.text, blk=blk) return blk_list def _ocr_blk_list(self, img: np.ndarray, blk_list: List[TextBlock]) -> None: def _ocr_blk_list(self, img: np.ndarray, blk_list: List[TextBlock], *args, **kwargs) -> None: raise NotImplementedError def ocr_img(self, img: np.ndarray) -> str: Loading modules/ocr/mit48px.py +17 −19 Original line number Diff line number Diff line Loading @@ -119,12 +119,11 @@ class Model48pxOCR: }, } def __init__(self, model_path: str, device='cpu', max_chunk_size=16, *args, **kwargs): def __init__(self, model_path: str, device='cpu', *args, **kwargs): super().__init__(*args, **kwargs) self.device = device self.max_chunk_size = max_chunk_size with open('data/alphabet-all-v7.txt', 'r', encoding = 'utf-8') as fp: dictionary = [s[:-1] for s in fp.readlines()] Loading @@ -139,32 +138,31 @@ class Model48pxOCR: self.model.to(device) self.device = device def __call__(self, img: np.ndarray, textblk_lst: List[TextBlock], verbose: bool = False, ignore_bubble: int = 0) -> List[TextBlock]: def __call__(self, img: np.ndarray, textblk_lst: List[TextBlock], chunk_size = 16, regions: List = None, textblk_lst_indices: List = None) -> None: if isinstance(textblk_lst, TextBlock): textblk_lst = [textblk_lst] text_height = 48 max_chunk_size = 16 region_imgs = [] if regions is None or textblk_lst_indices is None: regions = [] textblk_lst_indices = [] region_idx = 0 for blk_idx, textblk in enumerate(textblk_lst): for ii in range(len(textblk)): textblk_lst_indices.append(blk_idx) region_imgs.append(textblk.get_transformed_region(img, ii, 48, maxwidth=8100)) region_idx += 1 region = textblk.get_transformed_region(img, ii, 48, maxwidth=8100) regions.append(region) perm = range(len(region_imgs)) perm = range(len(regions)) chunck_idx = 0 for indices in chunks(perm, max_chunk_size): for indices in chunks(perm, chunk_size): N = len(indices) widths = [region_imgs[i].shape[1] for i in indices] widths = [regions[i].shape[1] for i in indices] max_width = 4 * (max(widths) + 7) // 4 region = np.zeros((N, text_height, max_width, 3), dtype = np.uint8) for i, idx in enumerate(indices): W = region_imgs[idx].shape[1] region[i, :, : W, :]=region_imgs[idx] W = regions[idx].shape[1] region[i, :, : W, :]=regions[idx] image_tensor = (torch.from_numpy(region).float() - 127.5) / 127.5 image_tensor = einops.rearrange(image_tensor, 'N H W C -> N C H W') Loading modules/ocr/mit48px_ctc.py +12 −12 Original line number Diff line number Diff line Loading @@ -392,11 +392,10 @@ class AvgMeter() : class OCR48pxCTC: def __init__(self, model_path: str, device='cpu', max_chunk_size=16): def __init__(self, model_path: str, device='cpu'): with open('data/alphabet-all-v5.txt', 'r', encoding = 'utf-8') as fp : dictionary = [s[:-1] for s in fp.readlines()] self.device = device self.max_chunk_size = max_chunk_size model = OCR(dictionary, 768) sd = torch.load(model_path, map_location = 'cpu') Loading @@ -414,21 +413,22 @@ class OCR48pxCTC: self.device = device @torch.no_grad() def __call__(self, img: np.ndarray, textblk_lst: List[TextBlock]) : def __call__(self, img: np.ndarray, textblk_lst: List[TextBlock], chunk_size = 16, regions: List = None, textblk_lst_indices: List = None) -> None: if isinstance(textblk_lst, TextBlock): textblk_lst = [textblk_lst] if regions is None or textblk_lst_indices is None: regions = [] textblk_lst_indices = [] region_idx = 0 for blk_idx, textblk in enumerate(textblk_lst): for ii in range(len(textblk)): textblk_lst_indices.append(blk_idx) regions.append(textblk.get_transformed_region(img, ii, 48, maxwidth=8100)) region_idx += 1 region = textblk.get_transformed_region(img, ii, 48, maxwidth=8100) regions.append(region) perm = range(len(regions)) chunck_idx = 0 for indices in chunks(perm, self.max_chunk_size) : for indices in chunks(perm, chunk_size) : N = len(indices) widths = [regions[i].shape[1] for i in indices] # max_width = 4 * (max(widths) + 7) // 4 Loading modules/ocr/model_32px.py +13 −14 Original line number Diff line number Diff line Loading @@ -541,9 +541,8 @@ def chunks(lst, n): class OCR32pxModel: def __init__(self, model_path, device='cpu', max_chunk_size=16) -> None: def __init__(self, model_path, device='cpu') -> None: self.device = device self.max_chunk_size = max_chunk_size self.text_height = 32 self.net = None Loading @@ -562,22 +561,22 @@ class OCR32pxModel: self.device = device @torch.no_grad() def __call__(self, img: np.ndarray, textblk_lst: List[TextBlock]) -> None: def __call__(self, img: np.ndarray, textblk_lst: List[TextBlock], chunk_size = 16, regions: List = None, textblk_lst_indices: List = None) -> None: if isinstance(textblk_lst, TextBlock): textblk_lst = [textblk_lst] if regions is None or textblk_lst_indices is None: regions = [] textblk_lst_indices = [] region_idx = 0 for blk_idx, textblk in enumerate(textblk_lst): for ii in range(len(textblk)): textblk_lst_indices.append(blk_idx) region = textblk.get_transformed_region(img, ii, self.text_height, maxwidth=3064) regions.append(region) region_idx += 1 # regions = [textblk.get_transformed_region(img, idx, self.text_height) for idx in range(len(textblk))] perm = range(len(regions)) chunck_idx = 0 for indices in chunks(perm, self.max_chunk_size) : for indices in chunks(perm, chunk_size) : N = len(indices) widths = [regions[i].shape[1] for i in indices] max_width = 4 * (max(widths) + 7) // 4 Loading Loading
modules/ocr/__init__.py +1 −139 Original line number Diff line number Diff line from typing import List import numpy as np from .base import OCRBase, register_OCR, DEVICE_SELECTOR, DEFAULT_DEVICE, TextBlock, OCR No newline at end of file from .model_32px import OCR32pxModel @register_OCR('mit32px') class OCRMIT32px(OCRBase): params = { 'chunk_size': { 'type': 'selector', 'options': [8, 16, 24, 32], 'value': 16 }, 'device': DEVICE_SELECTOR(), 'description': 'OCRMIT32px' } device = DEFAULT_DEVICE chunk_size = 16 download_file_list = [{ 'url': 'https://github.com/zyddnys/manga-image-translator/releases/download/beta-0.3/ocr.zip', 'files': ['ocr.ckpt'], 'sha256_pre_calculated': ['d9f619a9dccce8ce88357d1b17d25f07806f225c033ea42c64e86c45446cfe71'], 'save_files': ['data/models/mit32px_ocr.ckpt'], 'archived_files': 'ocr.zip', 'archive_sha256_pre_calculated': '47405638b96fa2540a5ee841a4cd792f25062c09d9458a973362d40785f95d7a', }] _load_model_keys = {'model'} def __init__(self, **params) -> None: super().__init__(**params) self.device = self.params['device']['value'] self.chunk_size = int(self.params['chunk_size']['value']) self.model: OCR32pxModel = None def _load_model(self): self.model = OCR32pxModel(r'data/models/mit32px_ocr.ckpt', self.device, self.chunk_size) def ocr_img(self, img: np.ndarray) -> str: return self.model.ocr_img(img) def _ocr_blk_list(self, img: np.ndarray, blk_list: List[TextBlock]): return self.model(img, blk_list) def updateParam(self, param_key: str, param_content): super().updateParam(param_key, param_content) device = self.params['device']['value'] chunk_size = int(self.params['chunk_size']['value']) if self.device != device: self.model.to(device) self.chunk_size = chunk_size self.model.max_chunk_size = chunk_size from .mit48px_ctc import OCR48pxCTC @register_OCR('mit48px_ctc') class OCRMIT48pxCTC(OCRBase): params = { 'chunk_size': { 'type': 'selector', 'options': [8,16,24,32], 'value': 16 }, 'device': DEVICE_SELECTOR(), 'description': 'mit48px_ctc' } device = DEFAULT_DEVICE chunk_size = 16 download_file_list = [{ 'url': 'https://github.com/zyddnys/manga-image-translator/releases/download/beta-0.3/ocr-ctc.zip', 'files': ['ocr-ctc.ckpt', 'alphabet-all-v5.txt'], 'sha256_pre_calculated': ['8b0837a24da5fde96c23ca47bb7abd590cd5b185c307e348c6e0b7238178ed89', None], 'save_files': ['data/models/mit48pxctc_ocr.ckpt', 'data/alphabet-all-v5.txt'], 'archived_files': 'ocr-ctc.zip', 'archive_sha256_pre_calculated': 'fc61c52f7a811bc72c54f6be85df814c6b60f63585175db27cb94a08e0c30101', }] _load_model_keys = {'model'} def __init__(self, **params) -> None: super().__init__(**params) self.device = self.params['device']['value'] self.chunk_size = int(self.params['chunk_size']['value']) self.model: OCR48pxCTC = None def _load_model(self): self.model = OCR48pxCTC(r'data/models/mit48pxctc_ocr.ckpt', self.device, self.chunk_size) def ocr_img(self, img: np.ndarray) -> str: return self.model.ocr_img(img) def _ocr_blk_list(self, img: np.ndarray, blk_list: List[TextBlock]): return self.model(img, blk_list) def updateParam(self, param_key: str, param_content): super().updateParam(param_key, param_content) device = self.params['device']['value'] chunk_size = int(self.params['chunk_size']['value']) if self.device != device: self.model.to(device) self.chunk_size = chunk_size self.model.max_chunk_size = chunk_size from .mit48px import Model48pxOCR OCR48PXMODEL_PATH = r'data/models/ocr_ar_48px.ckpt' @register_OCR('mit48px') class OCRMIT48px(OCRBase): params = { 'device': DEVICE_SELECTOR(), 'description': 'mit48px' } device = DEFAULT_DEVICE download_file_list = [{ 'url': 'https://huggingface.co/zyddnys/manga-image-translator/resolve/main/', 'files': [OCR48PXMODEL_PATH, 'data/alphabet-all-v7.txt'], 'sha256_pre_calculated': ['29daa46d080818bb4ab239a518a88338cbccff8f901bef8c9db191a7cb97671d', None], 'concatenate_url_filename': 2, }] _load_model_keys = {'model'} def __init__(self, **params) -> None: super().__init__(**params) self.device = self.params['device']['value'] self.model: Model48pxOCR = None def _load_model(self): self.model = Model48pxOCR(OCR48PXMODEL_PATH, self.device) def _ocr_blk_list(self, img: np.ndarray, blk_list: List[TextBlock]): return self.model(img, blk_list) def updateParam(self, param_key: str, param_content): super().updateParam(param_key, param_content) device = self.params['device']['value'] if self.device != device: self.model.to(device) No newline at end of file
modules/ocr/base.py +3 −9 Original line number Diff line number Diff line Loading @@ -14,6 +14,7 @@ class OCRBase(BaseModule): _postprocess_hooks = OrderedDict() _preprocess_hooks = OrderedDict() _line_only: bool = False def __init__(self, **params) -> None: super().__init__(**params) Loading Loading @@ -41,17 +42,10 @@ class OCRBase(BaseModule): self._ocr_blk_list(img, blk_list) for callback_name, callback in self._postprocess_hooks.items(): callback(textblocks=blk_list, img=img, ocr_module=self) # for blk in blk_list: # if isinstance(blk.text, List): # for ii, t in enumerate(blk.text): # for callback in self.postprocess_hooks: # blk.text[ii] = callback(t, blk=blk) # else: # for callback in self.postprocess_hooks: # blk.text = callback(blk.text, blk=blk) return blk_list def _ocr_blk_list(self, img: np.ndarray, blk_list: List[TextBlock]) -> None: def _ocr_blk_list(self, img: np.ndarray, blk_list: List[TextBlock], *args, **kwargs) -> None: raise NotImplementedError def ocr_img(self, img: np.ndarray) -> str: Loading
modules/ocr/mit48px.py +17 −19 Original line number Diff line number Diff line Loading @@ -119,12 +119,11 @@ class Model48pxOCR: }, } def __init__(self, model_path: str, device='cpu', max_chunk_size=16, *args, **kwargs): def __init__(self, model_path: str, device='cpu', *args, **kwargs): super().__init__(*args, **kwargs) self.device = device self.max_chunk_size = max_chunk_size with open('data/alphabet-all-v7.txt', 'r', encoding = 'utf-8') as fp: dictionary = [s[:-1] for s in fp.readlines()] Loading @@ -139,32 +138,31 @@ class Model48pxOCR: self.model.to(device) self.device = device def __call__(self, img: np.ndarray, textblk_lst: List[TextBlock], verbose: bool = False, ignore_bubble: int = 0) -> List[TextBlock]: def __call__(self, img: np.ndarray, textblk_lst: List[TextBlock], chunk_size = 16, regions: List = None, textblk_lst_indices: List = None) -> None: if isinstance(textblk_lst, TextBlock): textblk_lst = [textblk_lst] text_height = 48 max_chunk_size = 16 region_imgs = [] if regions is None or textblk_lst_indices is None: regions = [] textblk_lst_indices = [] region_idx = 0 for blk_idx, textblk in enumerate(textblk_lst): for ii in range(len(textblk)): textblk_lst_indices.append(blk_idx) region_imgs.append(textblk.get_transformed_region(img, ii, 48, maxwidth=8100)) region_idx += 1 region = textblk.get_transformed_region(img, ii, 48, maxwidth=8100) regions.append(region) perm = range(len(region_imgs)) perm = range(len(regions)) chunck_idx = 0 for indices in chunks(perm, max_chunk_size): for indices in chunks(perm, chunk_size): N = len(indices) widths = [region_imgs[i].shape[1] for i in indices] widths = [regions[i].shape[1] for i in indices] max_width = 4 * (max(widths) + 7) // 4 region = np.zeros((N, text_height, max_width, 3), dtype = np.uint8) for i, idx in enumerate(indices): W = region_imgs[idx].shape[1] region[i, :, : W, :]=region_imgs[idx] W = regions[idx].shape[1] region[i, :, : W, :]=regions[idx] image_tensor = (torch.from_numpy(region).float() - 127.5) / 127.5 image_tensor = einops.rearrange(image_tensor, 'N H W C -> N C H W') Loading
modules/ocr/mit48px_ctc.py +12 −12 Original line number Diff line number Diff line Loading @@ -392,11 +392,10 @@ class AvgMeter() : class OCR48pxCTC: def __init__(self, model_path: str, device='cpu', max_chunk_size=16): def __init__(self, model_path: str, device='cpu'): with open('data/alphabet-all-v5.txt', 'r', encoding = 'utf-8') as fp : dictionary = [s[:-1] for s in fp.readlines()] self.device = device self.max_chunk_size = max_chunk_size model = OCR(dictionary, 768) sd = torch.load(model_path, map_location = 'cpu') Loading @@ -414,21 +413,22 @@ class OCR48pxCTC: self.device = device @torch.no_grad() def __call__(self, img: np.ndarray, textblk_lst: List[TextBlock]) : def __call__(self, img: np.ndarray, textblk_lst: List[TextBlock], chunk_size = 16, regions: List = None, textblk_lst_indices: List = None) -> None: if isinstance(textblk_lst, TextBlock): textblk_lst = [textblk_lst] if regions is None or textblk_lst_indices is None: regions = [] textblk_lst_indices = [] region_idx = 0 for blk_idx, textblk in enumerate(textblk_lst): for ii in range(len(textblk)): textblk_lst_indices.append(blk_idx) regions.append(textblk.get_transformed_region(img, ii, 48, maxwidth=8100)) region_idx += 1 region = textblk.get_transformed_region(img, ii, 48, maxwidth=8100) regions.append(region) perm = range(len(regions)) chunck_idx = 0 for indices in chunks(perm, self.max_chunk_size) : for indices in chunks(perm, chunk_size) : N = len(indices) widths = [regions[i].shape[1] for i in indices] # max_width = 4 * (max(widths) + 7) // 4 Loading
modules/ocr/model_32px.py +13 −14 Original line number Diff line number Diff line Loading @@ -541,9 +541,8 @@ def chunks(lst, n): class OCR32pxModel: def __init__(self, model_path, device='cpu', max_chunk_size=16) -> None: def __init__(self, model_path, device='cpu') -> None: self.device = device self.max_chunk_size = max_chunk_size self.text_height = 32 self.net = None Loading @@ -562,22 +561,22 @@ class OCR32pxModel: self.device = device @torch.no_grad() def __call__(self, img: np.ndarray, textblk_lst: List[TextBlock]) -> None: def __call__(self, img: np.ndarray, textblk_lst: List[TextBlock], chunk_size = 16, regions: List = None, textblk_lst_indices: List = None) -> None: if isinstance(textblk_lst, TextBlock): textblk_lst = [textblk_lst] if regions is None or textblk_lst_indices is None: regions = [] textblk_lst_indices = [] region_idx = 0 for blk_idx, textblk in enumerate(textblk_lst): for ii in range(len(textblk)): textblk_lst_indices.append(blk_idx) region = textblk.get_transformed_region(img, ii, self.text_height, maxwidth=3064) regions.append(region) region_idx += 1 # regions = [textblk.get_transformed_region(img, idx, self.text_height) for idx in range(len(textblk))] perm = range(len(regions)) chunck_idx = 0 for indices in chunks(perm, self.max_chunk_size) : for indices in chunks(perm, chunk_size) : N = len(indices) widths = [regions[i].shape[1] for i in indices] max_width = 4 * (max(widths) + 7) // 4 Loading