Loading README.md +9 −1 Original line number Diff line number Diff line Loading @@ -65,9 +65,17 @@ To install from GitHub, use #### Example >>> from pykospacing import spacing >>> from pykospacing import Spacing >>> spacing = Spacing() >>> spacing("김형호영화시장분석가는'1987'의네이버영화정보네티즌10점평에서언급된단어들을지난해12월27일부터올해1월10일까지통계프로그램R과KoNLP패키지로텍스트마이닝하여분석했다.") "김형호 영화시장 분석가는 '1987'의 네이버 영화 정보 네티즌 10점 평에서 언급된 단어들을 지난해 12월 27일부터 올해 1월 10일까지 통계 프로그램 R과 KoNLP 패키지로 텍스트마이닝하여 분석했다." >>> # Apply a list of words that must be non-spaceing >>> spacing('귀밑에서턱까지잇따라난수염을구레나룻이라고한다.') '귀 밑에서 턱까지 잇따라 난 수염을 구레나 룻이라고 한다.' >>> spacing = Spacing(rules=['구레나룻']) >>> spacing('귀밑에서턱까지잇따라난수염을구레나룻이라고한다.') '귀 밑에서 턱까지 잇따라 난 수염을 구레나룻이라고 한다.' Run on command line(thanks [lqez](https://github.com/lqez)). Loading pykospacing/kospacing.py +22 −18 Original line number Diff line number Diff line Loading @@ -7,7 +7,7 @@ import pkg_resources from tensorflow.keras.models import load_model from pykospacing.embedding_maker import encoding_and_padding, load_vocab __all__ = ['spacing', ] __all__ = ['Spacing', ] os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' model_path = pkg_resources.resource_filename( Loading @@ -17,15 +17,18 @@ dic_path = pkg_resources.resource_filename( MODEL = load_model(model_path) MODEL.make_predict_function() W2IDX, _ = load_vocab(dic_path) MAX_LEN = 198 class PredSpacing: class Spacing: """predict spacing for input string """ def __init__(self, model, w2idx): self._model = model self._w2idx = w2idx def __init__(self, rules=[]): self._model = MODEL self._w2idx = W2IDX self.max_len = MAX_LEN self.pattern = re.compile(r'\s+') self.rules = [(re.compile('\s*'.join(r)), r) for r in rules] def get_spaced_sent(self, raw_sent): raw_sent_ = "«" + raw_sent + "»" Loading Loading @@ -53,17 +56,18 @@ class PredSpacing: subs = subs.replace('»', '') return subs def apply_rules(self, spaced_sent): for rgx, word in self.rules: spaced_sent = rgx.sub(word, spaced_sent) return spaced_sent PredSpacing = PredSpacing(MODEL, W2IDX) 
MAX_LEN = 198 def spacing(sent): if len(sent) > MAX_LEN: splitted_sent = [sent[y-MAX_LEN:y] for y in range(MAX_LEN, len(sent)+MAX_LEN, MAX_LEN)] spaced_sent = ''.join([PredSpacing.get_spaced_sent(ss) def __call__(self, sent): if len(sent) > self.max_len: splitted_sent = [sent[y-self.max_len:y] for y in range(self.max_len, len(sent)+self.max_len, self.max_len)] spaced_sent = ''.join([self.get_spaced_sent(ss) for ss in splitted_sent]) else: spaced_sent = PredSpacing.get_spaced_sent(sent) spaced_sent = self.get_spaced_sent(sent) if len(self.rules) > 0: spaced_sent = self.apply_rules(spaced_sent) return spaced_sent.strip() pykospacing/pykos.py +3 −5 Original line number Diff line number Diff line # -*- coding: utf-8 -*- import sys import argparse from pykospacing import spacing from pykospacing import Spacing def get_parser(): Loading @@ -22,12 +22,10 @@ def main(args=sys.argv[1:]): source = args.infile.read() limit = 198 result = '\n' spacing = Spacing() for line in source.splitlines(): while len(line) > limit: result += spacing(line[0:limit]) line = line[limit:] result += spacing(line) result += '\n' if args.overwrite: Loading setup.py +1 −1 Original line number Diff line number Diff line Loading @@ -3,7 +3,7 @@ from setuptools import setup setup(name='pykospacing', python_requires='>=3.6', version=0.4, version=0.5, url='https://github.com/haven-jeon/PyKoSpacing', license='GPL-3', author='Heewon Jeon', Loading Loading
README.md +9 −1 Original line number Diff line number Diff line Loading @@ -65,9 +65,17 @@ To install from GitHub, use #### Example >>> from pykospacing import spacing >>> from pykospacing import Spacing >>> spacing = Spacing() >>> spacing("김형호영화시장분석가는'1987'의네이버영화정보네티즌10점평에서언급된단어들을지난해12월27일부터올해1월10일까지통계프로그램R과KoNLP패키지로텍스트마이닝하여분석했다.") "김형호 영화시장 분석가는 '1987'의 네이버 영화 정보 네티즌 10점 평에서 언급된 단어들을 지난해 12월 27일부터 올해 1월 10일까지 통계 프로그램 R과 KoNLP 패키지로 텍스트마이닝하여 분석했다." >>> # Apply a list of words that must be non-spacing >>> spacing('귀밑에서턱까지잇따라난수염을구레나룻이라고한다.') '귀 밑에서 턱까지 잇따라 난 수염을 구레나 룻이라고 한다.' >>> spacing = Spacing(rules=['구레나룻']) >>> spacing('귀밑에서턱까지잇따라난수염을구레나룻이라고한다.') '귀 밑에서 턱까지 잇따라 난 수염을 구레나룻이라고 한다.' Run on command line (thanks [lqez](https://github.com/lqez)). Loading
pykospacing/kospacing.py +22 −18 Original line number Diff line number Diff line Loading @@ -7,7 +7,7 @@ import pkg_resources from tensorflow.keras.models import load_model from pykospacing.embedding_maker import encoding_and_padding, load_vocab __all__ = ['spacing', ] __all__ = ['Spacing', ] os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' model_path = pkg_resources.resource_filename( Loading @@ -17,15 +17,18 @@ dic_path = pkg_resources.resource_filename( MODEL = load_model(model_path) MODEL.make_predict_function() W2IDX, _ = load_vocab(dic_path) MAX_LEN = 198 class PredSpacing: class Spacing: """predict spacing for input string """ def __init__(self, model, w2idx): self._model = model self._w2idx = w2idx def __init__(self, rules=[]): self._model = MODEL self._w2idx = W2IDX self.max_len = MAX_LEN self.pattern = re.compile(r'\s+') self.rules = [(re.compile('\s*'.join(r)), r) for r in rules] def get_spaced_sent(self, raw_sent): raw_sent_ = "«" + raw_sent + "»" Loading Loading @@ -53,17 +56,18 @@ class PredSpacing: subs = subs.replace('»', '') return subs def apply_rules(self, spaced_sent): for rgx, word in self.rules: spaced_sent = rgx.sub(word, spaced_sent) return spaced_sent PredSpacing = PredSpacing(MODEL, W2IDX) MAX_LEN = 198 def spacing(sent): if len(sent) > MAX_LEN: splitted_sent = [sent[y-MAX_LEN:y] for y in range(MAX_LEN, len(sent)+MAX_LEN, MAX_LEN)] spaced_sent = ''.join([PredSpacing.get_spaced_sent(ss) def __call__(self, sent): if len(sent) > self.max_len: splitted_sent = [sent[y-self.max_len:y] for y in range(self.max_len, len(sent)+self.max_len, self.max_len)] spaced_sent = ''.join([self.get_spaced_sent(ss) for ss in splitted_sent]) else: spaced_sent = PredSpacing.get_spaced_sent(sent) spaced_sent = self.get_spaced_sent(sent) if len(self.rules) > 0: spaced_sent = self.apply_rules(spaced_sent) return spaced_sent.strip()
pykospacing/pykos.py +3 −5 Original line number Diff line number Diff line # -*- coding: utf-8 -*- import sys import argparse from pykospacing import spacing from pykospacing import Spacing def get_parser(): Loading @@ -22,12 +22,10 @@ def main(args=sys.argv[1:]): source = args.infile.read() limit = 198 result = '\n' spacing = Spacing() for line in source.splitlines(): while len(line) > limit: result += spacing(line[0:limit]) line = line[limit:] result += spacing(line) result += '\n' if args.overwrite: Loading
setup.py +1 −1 Original line number Diff line number Diff line Loading @@ -3,7 +3,7 @@ from setuptools import setup setup(name='pykospacing', python_requires='>=3.6', version=0.4, version=0.5, url='https://github.com/haven-jeon/PyKoSpacing', license='GPL-3', author='Heewon Jeon', Loading