Commit 2fb74c05 authored by Park Hyunwoo
Browse files

Clean up python codes

parent 13061236
Loading
Loading
Loading
Loading
+1 −3
Original line number Diff line number Diff line


from pykospacing.kospacing import *

# Package version string; the packaging script reads it via
# `from pykospacing import __version__`.
__version__ = '0.1'
+9 −15
Original line number Diff line number Diff line
# Names exported by a star-import of this module.
# NOTE(review): the identical assignment appears again after the import block;
# one of the two copies is redundant.
__all__ = ['load_embedding', 'load_vocab', 'encoding_and_padding']



import os
from keras.preprocessing import sequence
import json
import numpy as np
from keras.preprocessing import sequence


# Public API: the only names exported by a star-import of this module.
__all__ = ['load_embedding', 'load_vocab', 'encoding_and_padding']


def load_embedding(embeddings_file):
    """Load an embedding array stored in NumPy's binary format.

    :embeddings_file: path or file-like object accepted by ``numpy.load``
    :return: the object produced by ``numpy.load`` for that file
    """
    embedding = np.load(embeddings_file)
    return embedding


def load_vocab(vocab_path):
    with open(vocab_path, 'r') as f:
        data = json.loads(f.read())
@@ -25,7 +23,6 @@ def encoding_and_padding(word2idx_dic, sequences, **params):
    1. making item to idx
    2. padding

    
    :word2idx_dic
    :sequences: list of lists where each element is a sequence
    :maxlen: int, maximum length
@@ -38,6 +35,3 @@ def encoding_and_padding(word2idx_dic, sequences, **params):
    seq_idx = [[word2idx_dic.get(a, word2idx_dic['__ETC__']) for a in i] for i in sequences]
    params['value'] = word2idx_dic['__PAD__']
    return(sequence.pad_sequences(seq_idx, **params))
    

+22 −29
Original line number Diff line number Diff line
# -*- coding: utf-8 -*-
# Public API: `spacing` is the only name exported by a star-import.
# NOTE(review): the same assignment is repeated after the import block below;
# one of the two copies is redundant.
__all__ = ['spacing',]

import json, os, re, sys

# Set ahead of the Keras import that follows so the variable is already in the
# environment when the backend loads.
# NOTE(review): '2' is presumably meant to hide TensorFlow's INFO and WARNING
# log output — confirm against the TensorFlow version in use.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np
from keras.models import load_model

from pykospacing.embedding_maker import load_vocab, encoding_and_padding
import pkg_resources, warnings
import numpy as np
import os
import pkg_resources
import re
import warnings


# Public API: `spacing` is the only name exported by a star-import.
__all__ = ['spacing', ]
# NOTE(review): '2' is presumably meant to hide TensorFlow's INFO and WARNING
# log output — confirm against the TensorFlow version in use.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Resolve the bundled model and dictionary files relative to the installed
# `pykospacing` package rather than the current working directory.
model_path = pkg_resources.resource_filename('pykospacing', os.path.join('resources', 'models', 'kospacing'))
dic_path = pkg_resources.resource_filename('pykospacing', os.path.join('resources', 'dicts', 'c2v.dic'))

# The model and vocabulary are loaded once, when this module is imported, and
# shared by every later call.
model = load_model(model_path)
# NOTE(review): private Keras API; presumably builds the prediction function
# eagerly so that later predict() calls do not have to — confirm it still
# exists in the Keras version being targeted.
model._make_predict_function()
# load_vocab returns a pair; only its first element is used here.
w2idx, _ = load_vocab(dic_path)

class pred_spacing:

class pred_spacing:
    def __init__(self, model, w2idx):
        self.model = model
        self.w2idx = w2idx
        self.pattern = re.compile(r'\s+')

        
    def get_spaced_sent(self, raw_sent):
        #print('sent : {}'.format(raw_sent), file=sys.stdout)
        raw_sent_ = "«" + raw_sent + "»"
        raw_sent_ = raw_sent_.replace(' ', '^')
        sents_in = [raw_sent_, ]
@@ -52,14 +48,11 @@ class pred_spacing:
        return subs



# Module-level predictor built from the model and vocabulary loaded above.
# This rebinds the name `pred_spacing` from the class to an instance of it, so
# the class is no longer reachable under that name after this line.
pred_spacing = pred_spacing(model, w2idx)


def spacing(sent):
    """Return `sent` with automatic word spacing applied.

    :sent: str, the sentence to re-space
    :return: str, the result of the predictor with leading and trailing
        whitespace stripped

    A warning is issued when `sent` is longer than 198 characters; the
    sentence is still passed to the predictor afterwards.
    """
    if len(sent) > 198:
        # Warn once per call; issuing the same message twice only adds noise.
        warnings.warn('One sentence can not contain more than 198 characters. : {}'.format(sent))
    spaced_sent = pred_spacing.get_spaced_sent(sent)
    return spaced_sent.strip()


+5 −17
Original line number Diff line number Diff line
from setuptools import setup, find_packages
from setuptools import setup
from pykospacing import __version__


# Read the README inside a context manager so the file handle is closed as
# soon as its contents have been loaded.
with open('README.md', encoding='utf-8') as readme_file:
    long_description = readme_file.read()

setup(name='pykospacing',
      # The version is taken from the package itself so it is defined in one
      # place only; passing `version` a second time would be a SyntaxError.
      version=__version__,
      url='https://github.com/haven-jeon/PyKoSpacing',
      license='GPL-3',
      author='Heewon Jeon',
      author_email='madjakarta@gmail.com',
      description='Python package for automatic Korean word spacing.',
      packages=['pykospacing', ],
      long_description=long_description,
      zip_safe=False,
      # Ship the non-Python files declared as package data with the package.
      include_package_data=True,
      install_requires=['tensorflow >= 1.4.0, <= 1.6.0', 'keras >= 2.1.5', 'h5py >= 2.7.1'],
      )