Unverified Commit 195d5f12 authored by dmMaze's avatar dmMaze Committed by GitHub
Browse files

Merge pull request #20 from Snowad14/master

Add CT2 Offline Sugoi Translator.
parents e25c82a0 59379ff5
Loading
Loading
Loading
Loading
+38 −5
Original line number Diff line number Diff line
import urllib.request
from typing import Dict, List, Union
import time, requests, re, uuid, base64, hmac
import functools
import json
import time, requests, re, uuid, base64, hmac, functools, json, deepl
import ctranslate2, sentencepiece as spm
from .exceptions import InvalidSourceOrTargetLanguage, TranslatorSetupFailure, MissingTranslatorParams, TranslatorNotValid
from ..textdetector.textblock import TextBlock
from ..moduleparamparser import ModuleParamParser
from utils.registry import Registry
from utils.io_utils import text_is_empty
import deepl

TRANSLATORS = Registry('translators')
register_translator = TRANSLATORS.register_module
@@ -345,6 +343,41 @@ class DeeplTranslator(TranslatorBase):
        result = translator.translate_text(text, source_lang=source, target_lang=target)
        return [i.text for i in result]
    
# Directory holding the converted CTranslate2 Sugoi model files.
SUGOIMODEL_TRANSLATOR_DIRPATH = 'data/models/sugoi_translator'
# SentencePiece model used to tokenize the source text. A forward slash is
# used as separator because it is accepted on Windows, Linux and macOS alike,
# whereas a literal backslash only works on Windows.
SUGOIMODEL_TOKENIZATOR_PATH = SUGOIMODEL_TRANSLATOR_DIRPATH + "/spm.ja.nopretok.model"

@register_translator('Sugoi')
class SugoiTranslator(TranslatorBase):
    """Offline translator that runs a Sugoi model through CTranslate2.

    The source text is tokenized with a SentencePiece model loaded from
    ``SUGOIMODEL_TOKENIZATOR_PATH`` and translated by a
    ``ctranslate2.Translator`` loaded from ``SUGOIMODEL_TRANSLATOR_DIRPATH``.
    The inference device is taken from ``setup_params['device']['select']``.
    """

    concate_text = False
    setup_params: Dict = {
        'device': {
            'type': 'selector',
            'options': ['cpu', 'cuda'],
            'select': 'cpu'
        }
    }

    def _create_ct2_translator(self):
        """Build a CTranslate2 translator on the currently selected device."""
        return ctranslate2.Translator(
            SUGOIMODEL_TRANSLATOR_DIRPATH,
            device=self.setup_params['device']['select'],
        )

    def _setup_translator(self):
        """Register the language codes and load the model and tokenizer."""
        self.lang_map['日本語'] = 'ja'
        self.lang_map['English'] = 'en'

        self.translator = self._create_ct2_translator()
        self.tokenizator = spm.SentencePieceProcessor(model_file=SUGOIMODEL_TOKENIZATOR_PATH)

    def _translate(self, text: Union[str, List]) -> Union[str, List]:
        """Translate one string or a batch of strings.

        Args:
            text: A single source string, or a list of source strings.

        Returns:
            A translated string when ``text`` is a string, otherwise a list
            with one translated string per input entry, in the same order.
        """
        if isinstance(text, str):
            # A bare string would otherwise be iterated character by character.
            return self._translate([text])[0]
        if not text:
            # Nothing to translate; skip the model call.
            return []

        # Periods are swapped for '@' before tokenization and restored below.
        # NOTE(review): presumably the model handles '.' poorly - confirm.
        # Side effect: any '@' produced by the model also becomes '.'.
        masked = [sentence.replace(".", "@") for sentence in text]
        # enable_sampling/alpha/nbest_size turn on SentencePiece's sampled
        # segmentation, so the tokenization (and thus the translation) of the
        # same input may differ between calls.
        tokenized_text = self.tokenizator.encode(
            masked, out_type=str, enable_sampling=True, alpha=0.1, nbest_size=-1)
        tokenized_translated = self.translator.translate_batch(tokenized_text)
        # Take the first hypothesis of every result and glue its pieces back
        # together. '\u2581' is the SentencePiece word-boundary marker; it is
        # written as an escape so the character cannot be lost by tools that
        # mangle non-ASCII text (replacing '' instead would insert a space
        # between every single character).
        return [
            ''.join(result[0]["tokens"]).replace('\u2581', ' ').replace("@", ".")
            for result in tokenized_translated
        ]

    def updateParam(self, param_key: str, param_content):
        """Update a parameter and reload the model when the device changes.

        Args:
            param_key: Name of the parameter being updated.
            param_content: New value, forwarded to the base implementation.
        """
        super().updateParam(param_key, param_content)
        if param_key == 'device':
            # Drop the reference to the previous model before creating the
            # new one (presumably so its memory can be released first).
            if hasattr(self, 'translator'):
                delattr(self, 'translator')
            self.translator = self._create_ct2_translator()

# # "dummy translator" is the name shown in the app
# @register_translator('dummy translator')
+5 −1
Original line number Diff line number Diff line
# To install the PyTorch CUDA (GPU) version, please see https://pytorch.org/
urllib3==1.25.11 # https://github.com/psf/requests/issues/5740
jaconv
torch
torchvision
transformers
fugashi
unidic_lite
@@ -15,3 +17,5 @@ bs4
deepl
qtpy
pkuseg
sentencepiece
ctranslate2