Commit 99335e9f authored by haven-jeon's avatar haven-jeon
Browse files

Apply rules for non-spacing words

parent ddf7fe59
Loading
Loading
Loading
Loading
+9 −1
Original line number Diff line number Diff line
@@ -65,9 +65,17 @@ To install from GitHub, use
#### Example 


    >>> from pykospacing import spacing
    >>> from pykospacing import Spacing
    >>> spacing = Spacing()
    >>> spacing("김형호영화시장분석가는'1987'의네이버영화정보네티즌10점평에서언급된단어들을지난해12월27일부터올해1월10일까지통계프로그램R과KoNLP패키지로텍스트마이닝하여분석했다.")
    "김형호 영화시장 분석가는 '1987'의 네이버 영화 정보 네티즌 10점 평에서 언급된 단어들을 지난해 12월 27일부터 올해 1월 10일까지 통계 프로그램 R과 KoNLP 패키지로 텍스트마이닝하여 분석했다."
    >>> # Apply a list of words that must not be split by spacing
    >>> spacing('귀밑에서턱까지잇따라난수염을구레나룻이라고한다.')
    '귀 밑에서 턱까지 잇따라 난 수염을 구레나 룻이라고 한다.'
    >>> spacing = Spacing(rules=['구레나룻'])
    >>> spacing('귀밑에서턱까지잇따라난수염을구레나룻이라고한다.')
    '귀 밑에서 턱까지 잇따라 난 수염을 구레나룻이라고 한다.'



Run on the command line (thanks to [lqez](https://github.com/lqez)).
+22 −18
Original line number Diff line number Diff line
@@ -7,7 +7,7 @@ import pkg_resources
from tensorflow.keras.models import load_model
from pykospacing.embedding_maker import encoding_and_padding, load_vocab

__all__ = ['spacing', ]
__all__ = ['Spacing', ]
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

model_path = pkg_resources.resource_filename(
@@ -17,15 +17,18 @@ dic_path = pkg_resources.resource_filename(
MODEL = load_model(model_path)
MODEL.make_predict_function()
W2IDX, _ = load_vocab(dic_path)
MAX_LEN = 198


class PredSpacing:
class Spacing:
    """predict spacing for input string
    """
    def __init__(self, model, w2idx):
        self._model = model
        self._w2idx = w2idx
    def __init__(self, rules=None):
        """Set up the spacing predictor and its optional word rules.

        Args:
            rules: optional iterable of words that must stay unspaced in
                the output. Each word is stored as a
                ``(compiled pattern, word)`` pair, where the pattern
                matches the word's characters with any amount of
                whitespace between them.
        """
        self._model = MODEL
        self._w2idx = W2IDX
        self.max_len = MAX_LEN
        self.pattern = re.compile(r'\s+')
        # ``None`` replaces the shared mutable ``[]`` default.
        # Every character is escaped so that words containing regex
        # metacharacters (e.g. "C++") are matched literally instead of
        # failing to compile or matching unintended text.
        self.rules = [(re.compile(r'\s*'.join(map(re.escape, word))), word)
                      for word in (rules or ())]

    def get_spaced_sent(self, raw_sent):
        raw_sent_ = "«" + raw_sent + "»"
@@ -53,17 +56,18 @@ class PredSpacing:
        subs = subs.replace('»', '')
        return subs

    def apply_rules(self, spaced_sent):
        for rgx, word in self.rules:
            spaced_sent = rgx.sub(word, spaced_sent)
        return spaced_sent

PredSpacing = PredSpacing(MODEL, W2IDX)

MAX_LEN = 198


def spacing(sent):
    if len(sent) > MAX_LEN:
        splitted_sent = [sent[y-MAX_LEN:y] for y in range(MAX_LEN, len(sent)+MAX_LEN, MAX_LEN)]
        spaced_sent = ''.join([PredSpacing.get_spaced_sent(ss)
    def __call__(self, sent):
        if len(sent) > self.max_len:
            splitted_sent = [sent[y-self.max_len:y] for y in range(self.max_len, len(sent)+self.max_len, self.max_len)]
            spaced_sent = ''.join([self.get_spaced_sent(ss)
                                for ss in splitted_sent])
        else:
        spaced_sent = PredSpacing.get_spaced_sent(sent)
            spaced_sent = self.get_spaced_sent(sent)
        if len(self.rules) > 0:
            spaced_sent = self.apply_rules(spaced_sent)
        return spaced_sent.strip()
+3 −5
Original line number Diff line number Diff line
# -*- coding: utf-8 -*-
import sys
import argparse
from pykospacing import spacing
from pykospacing import Spacing


def get_parser():
@@ -22,12 +22,10 @@ def main(args=sys.argv[1:]):

    source = args.infile.read()
    
    limit = 198
    result = '\n'
    spacing = Spacing()
    for line in source.splitlines():
        while len(line) > limit:
            result += spacing(line[0:limit])
            line = line[limit:]
        result += spacing(line)
        result += '\n'

    if args.overwrite:
+1 −1
Original line number Diff line number Diff line
@@ -3,7 +3,7 @@ from setuptools import setup

setup(name='pykospacing',
      python_requires='>=3.6',
      version=0.4,
      version=0.5,
      url='https://github.com/haven-jeon/PyKoSpacing',
      license='GPL-3',
      author='Heewon Jeon',