Loading pykospacing/kospacing.py +32 −21 Original line number Diff line number Diff line # -*- coding: utf-8 -*- from keras.models import load_model from pykospacing.embedding_maker import load_vocab, encoding_and_padding import numpy as np import os import pkg_resources import re import warnings import numpy as np import pkg_resources from keras.models import load_model from pykospacing.embedding_maker import encoding_and_padding, load_vocab __all__ = ['spacing', ] os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' model_path = pkg_resources.resource_filename('pykospacing', os.path.join('resources', 'models', 'kospacing')) dic_path = pkg_resources.resource_filename('pykospacing', os.path.join('resources', 'dicts', 'c2v.dic')) model = load_model(model_path) model.make_predict_function() w2idx, _ = load_vocab(dic_path) model_path = pkg_resources.resource_filename( 'pykospacing', os.path.join('resources', 'models', 'kospacing')) dic_path = pkg_resources.resource_filename( 'pykospacing', os.path.join('resources', 'dicts', 'c2v.dic')) MODEL = load_model(model_path) MODEL.make_predict_function() W2IDX, _ = load_vocab(dic_path) class pred_spacing: class PredSpacing: """predict spacing for input string """ def __init__(self, model, w2idx): self.model = model self.w2idx = w2idx self._model = model self._w2idx = w2idx self.pattern = re.compile(r'\s+') def get_spaced_sent(self, raw_sent): raw_sent_ = "«" + raw_sent + "»" raw_sent_ = raw_sent_.replace(' ', '^') sents_in = [raw_sent_, ] mat_in = encoding_and_padding(word2idx_dic=self.w2idx, sequences=sents_in, maxlen=200, padding='post', truncating='post') results = self.model.predict(mat_in) mat_in = encoding_and_padding( word2idx_dic=self._w2idx, sequences=sents_in, maxlen=200, padding='post', truncating='post') results = self._model.predict(mat_in) mat_set = results[0, ] preds = np.array(['1' if i > 0.5 else '0' for i in mat_set[:len(raw_sent_)]]) preds = np.array( ['1' if i > 0.5 else '0' for i in mat_set[:len(raw_sent_)]]) return self.make_pred_sents(raw_sent_, preds) def make_pred_sents(self, x_sents, y_pred): Loading @@ -48,11 +54,16 @@ class pred_spacing: return subs pred_spacing = pred_spacing(model, w2idx) PredSpacing = PredSpacing(MODEL, W2IDX) MAX_LEN = 198 def spacing(sent): if len(sent) > 198: warnings.warn('One sentence can not contain more than 198 characters. : {}'.format(sent)) spaced_sent = pred_spacing.get_spaced_sent(sent) return(spaced_sent.strip()) if len(sent) > MAX_LEN: splitted_sent = [sent[y-MAX_LEN:y] for y in range(MAX_LEN, len(sent)+MAX_LEN, MAX_LEN)] spaced_sent = ''.join([PredSpacing.get_spaced_sent(ss) for ss in splitted_sent]) else: spaced_sent = PredSpacing.get_spaced_sent(sent) return spaced_sent.strip() Loading
pykospacing/kospacing.py +32 −21 Original line number Diff line number Diff line # -*- coding: utf-8 -*- from keras.models import load_model from pykospacing.embedding_maker import load_vocab, encoding_and_padding import numpy as np import os import pkg_resources import re import warnings import numpy as np import pkg_resources from keras.models import load_model from pykospacing.embedding_maker import encoding_and_padding, load_vocab __all__ = ['spacing', ] os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' model_path = pkg_resources.resource_filename('pykospacing', os.path.join('resources', 'models', 'kospacing')) dic_path = pkg_resources.resource_filename('pykospacing', os.path.join('resources', 'dicts', 'c2v.dic')) model = load_model(model_path) model.make_predict_function() w2idx, _ = load_vocab(dic_path) model_path = pkg_resources.resource_filename( 'pykospacing', os.path.join('resources', 'models', 'kospacing')) dic_path = pkg_resources.resource_filename( 'pykospacing', os.path.join('resources', 'dicts', 'c2v.dic')) MODEL = load_model(model_path) MODEL.make_predict_function() W2IDX, _ = load_vocab(dic_path) class pred_spacing: class PredSpacing: """predict spacing for input string """ def __init__(self, model, w2idx): self.model = model self.w2idx = w2idx self._model = model self._w2idx = w2idx self.pattern = re.compile(r'\s+') def get_spaced_sent(self, raw_sent): raw_sent_ = "«" + raw_sent + "»" raw_sent_ = raw_sent_.replace(' ', '^') sents_in = [raw_sent_, ] mat_in = encoding_and_padding(word2idx_dic=self.w2idx, sequences=sents_in, maxlen=200, padding='post', truncating='post') results = self.model.predict(mat_in) mat_in = encoding_and_padding( word2idx_dic=self._w2idx, sequences=sents_in, maxlen=200, padding='post', truncating='post') results = self._model.predict(mat_in) mat_set = results[0, ] preds = np.array(['1' if i > 0.5 else '0' for i in mat_set[:len(raw_sent_)]]) preds = np.array( ['1' if i > 0.5 else '0' for i in mat_set[:len(raw_sent_)]]) return self.make_pred_sents(raw_sent_, preds) def make_pred_sents(self, x_sents, y_pred): Loading @@ -48,11 +54,16 @@ class pred_spacing: return subs pred_spacing = pred_spacing(model, w2idx) PredSpacing = PredSpacing(MODEL, W2IDX) MAX_LEN = 198 def spacing(sent): if len(sent) > 198: warnings.warn('One sentence can not contain more than 198 characters. : {}'.format(sent)) spaced_sent = pred_spacing.get_spaced_sent(sent) return(spaced_sent.strip()) if len(sent) > MAX_LEN: splitted_sent = [sent[y-MAX_LEN:y] for y in range(MAX_LEN, len(sent)+MAX_LEN, MAX_LEN)] spaced_sent = ''.join([PredSpacing.get_spaced_sent(ss) for ss in splitted_sent]) else: spaced_sent = PredSpacing.get_spaced_sent(sent) return spaced_sent.strip()