diff --git a/chars2vec/__init__.py b/chars2vec/__init__.py
index 71a1d85..70368ea 100644
--- a/chars2vec/__init__.py
+++ b/chars2vec/__init__.py
@@ -1 +1 @@
-from .model import *
\ No newline at end of file
+from .model import Chars2Vec, load_model, save_model, train_model
\ No newline at end of file
diff --git a/chars2vec/model.py b/chars2vec/model.py
index a98215e..0ed0127 100644
--- a/chars2vec/model.py
+++ b/chars2vec/model.py
@@ -1,18 +1,17 @@
-import numpy as np
+import os
 import pickle
+
 import keras
-import os
+import numpy as np
 
 
 class Chars2Vec:
-
     def __init__(self, emb_dim, char_to_ix):
-        '''
+        """
         Creates chars2vec model.
-
         :param emb_dim: int, dimension of embeddings.
         :param char_to_ix: dict, keys are characters, values are sequence numbers of characters.
-        '''
+        """
 
         if not isinstance(emb_dim, int) or emb_dim < 1:
             raise TypeError("parameter 'emb_dim' must be a positive integer")
@@ -25,14 +24,19 @@ def __init__(self, emb_dim, char_to_ix):
         self.vocab_size = len(self.char_to_ix)
         self.dim = emb_dim
         self.cache = {}
+        self.embedding_model = self._get_embedding_model()
+        self.model = self._get_model()
+        self.model.compile(optimizer="adam", loss="mae")
 
-        lstm_input = keras.layers.Input(shape=(None, self.vocab_size))
+    def _get_embedding_model(self):
+        inputs = keras.layers.Input(shape=(None, self.vocab_size))
 
-        x = keras.layers.LSTM(emb_dim, return_sequences=True)(lstm_input)
-        x = keras.layers.LSTM(emb_dim)(x)
+        outputs = keras.layers.LSTM(self.dim, return_sequences=True)(inputs)
+        outputs = keras.layers.LSTM(self.dim)(outputs)
 
-        self.embedding_model = keras.models.Model(inputs=[lstm_input], outputs=x)
+        return keras.models.Model(inputs=[inputs], outputs=outputs)
 
+    def _get_model(self):
         model_input_1 = keras.layers.Input(shape=(None, self.vocab_size))
         model_input_2 = keras.layers.Input(shape=(None, self.vocab_size))
@@ -40,116 +44,101 @@ def __init__(self, emb_dim, char_to_ix):
         embedding_2 = self.embedding_model(model_input_2)
         x = keras.layers.Subtract()([embedding_1, embedding_2])
         x = keras.layers.Dot(1)([x, x])
-        model_output = keras.layers.Dense(1, activation='sigmoid')(x)
-
-        self.model = keras.models.Model(inputs=[model_input_1, model_input_2], outputs=model_output)
-        self.model.compile(optimizer='adam', loss='mae')
-
-
-    def fit(self, word_pairs, targets,
-            max_epochs, patience, validation_split, batch_size):
-        '''
+        model_output = keras.layers.Dense(1, activation="sigmoid")(x)
+
+        return keras.models.Model(
+            inputs=[model_input_1, model_input_2], outputs=model_output
+        )
+
+    def _create_word_embedding(self, word):
+        word_embedding = []
+        for char in word.lower():
+            char_embedding = np.zeros(self.vocab_size)
+            if char in self.char_to_ix:
+                char_embedding[self.char_to_ix[char]] = 1
+            word_embedding.append(char_embedding)
+        return word_embedding
+
+    def fit(
+        self, word_pairs, targets, max_epochs, patience, validation_split, batch_size
+    ):
+        """
         Fits model.
-
         :param word_pairs: list or numpy.ndarray of word pairs.
         :param targets: list or numpy.ndarray of targets.
         :param max_epochs: parameter 'epochs' of keras model.
         :param patience: parameter 'patience' of callback in keras model.
         :param validation_split: parameter 'validation_split' of keras model.
         :param batch_size: parameter 'batch_size' of keras model.
-        '''
+        """
 
-        if not isinstance(word_pairs, list) and not isinstance(word_pairs, np.ndarray):
+        if not isinstance(word_pairs, (list, np.ndarray)):
             raise TypeError("parameters 'word_pairs' must be a list or numpy.ndarray")
 
-        if not isinstance(targets, list) and not isinstance(targets, np.ndarray):
+        if not isinstance(targets, (list, np.ndarray)):
             raise TypeError("parameters 'targets' must be a list or numpy.ndarray")
 
         x_1, x_2 = [], []
+
+        for word_pair in word_pairs:
+            if len(word_pair) != 2:
+                raise ValueError(
+                    "`word_pairs` contains an entry that is not a pair of two words."
+                )
 
-        for pair_words in word_pairs:
-            emb_list_1 = []
-            emb_list_2 = []
-
-            if not isinstance(pair_words[0], str) or not isinstance(pair_words[1], str):
-                raise TypeError("word must be a string")
-
-            first_word = pair_words[0].lower()
-            second_word = pair_words[1].lower()
-
-            for t in range(len(first_word)):
-
-                if first_word[t] in self.char_to_ix:
-                    x = np.zeros(self.vocab_size)
-                    x[self.char_to_ix[first_word[t]]] = 1
-                    emb_list_1.append(x)
-
-                else:
-                    emb_list_1.append(np.zeros(self.vocab_size))
-
-            x_1.append(np.array(emb_list_1))
+            if not all(isinstance(word, str) for word in word_pair):
+                raise TypeError("Both words must be strings.")
 
-            for t in range(len(second_word)):
+            first_word, second_word = word_pair
 
-                if second_word[t] in self.char_to_ix:
-                    x = np.zeros(self.vocab_size)
-                    x[self.char_to_ix[second_word[t]]] = 1
-                    emb_list_2.append(x)
+            first_word_embedding = self._create_word_embedding(word=first_word.lower())
+            x_1.append(np.array(first_word_embedding))
 
-                else:
-                    emb_list_2.append(np.zeros(self.vocab_size))
-
-            x_2.append(np.array(emb_list_2))
+            second_word_embedding = self._create_word_embedding(
+                word=second_word.lower()
+            )
+            x_2.append(np.array(second_word_embedding))
 
         x_1_pad_seq = keras.preprocessing.sequence.pad_sequences(x_1)
         x_2_pad_seq = keras.preprocessing.sequence.pad_sequences(x_2)
 
-        self.model.fit([x_1_pad_seq, x_2_pad_seq], targets,
-                       batch_size=batch_size, epochs=max_epochs,
-                       validation_split=validation_split,
-                       callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience)])
+        self.model.fit(
+            [x_1_pad_seq, x_2_pad_seq],
+            targets,
+            batch_size=batch_size,
+            epochs=max_epochs,
+            validation_split=validation_split,
+            callbacks=[
+                keras.callbacks.EarlyStopping(monitor="val_loss", patience=patience)
+            ],
+        )
 
     def vectorize_words(self, words, maxlen_padseq=None):
-        '''
+        """
         Returns embeddings for list of words. Uses cache of word embeddings to vectorization speed up.
-
         :param words: list or numpy.ndarray of strings.
         :param maxlen_padseq: parameter 'maxlen' for keras pad_sequences transform.
-
         :return word_vectors: numpy.ndarray, word embeddings.
-        '''
+        """
 
-        if not isinstance(words, list) and not isinstance(words, np.ndarray):
+        if not isinstance(words, (list, np.ndarray)):
             raise TypeError("parameter 'words' must be a list or numpy.ndarray")
 
         words = [w.lower() for w in words]
         unique_words = np.unique(words)
         new_words = [w for w in unique_words if w not in self.cache]
 
-        if len(new_words) > 0:
-
+        if new_words:
             list_of_embeddings = []
-
-            for current_word in new_words:
-
-                if not isinstance(current_word, str):
+            for word in new_words:
+                if not isinstance(word, str):
                     raise TypeError("word must be a string")
 
-                current_embedding = []
-
-                for t in range(len(current_word)):
-
-                    if current_word[t] in self.char_to_ix:
-                        x = np.zeros(self.vocab_size)
-                        x[self.char_to_ix[current_word[t]]] = 1
-                        current_embedding.append(x)
-
-                    else:
-                        current_embedding.append(np.zeros(self.vocab_size))
+                word_embedding = self._create_word_embedding(word=word.lower())
+                list_of_embeddings.append(np.array(word_embedding))
 
-                list_of_embeddings.append(np.array(current_embedding))
-
-            embeddings_pad_seq = keras.preprocessing.sequence.pad_sequences(list_of_embeddings, maxlen=maxlen_padseq)
+            embeddings_pad_seq = keras.preprocessing.sequence.pad_sequences(
+                list_of_embeddings, maxlen=maxlen_padseq
+            )
             new_words_vectors = self.embedding_model.predict([embeddings_pad_seq])
 
             for i in range(len(new_words)):
@@ -161,54 +150,60 @@ def vectorize_words(self, words, maxlen_padseq=None):
 
 
 def save_model(c2v_model, path_to_model):
-    '''
+    """
     Saves trained model to directory.
-
     :param c2v_model: Chars2Vec object, trained model.
     :param path_to_model: str, path to save model.
-    '''
+    """
 
     if not os.path.exists(path_to_model):
         os.makedirs(path_to_model)
 
-    c2v_model.embedding_model.save_weights(path_to_model + '/weights.h5')
+    c2v_model.embedding_model.save_weights(path_to_model + "/weights.h5")
 
-    with open(path_to_model + '/model.pkl', 'wb') as f:
+    with open(path_to_model + "/model.pkl", "wb") as f:
         pickle.dump([c2v_model.dim, c2v_model.char_to_ix], f, protocol=2)
 
 
 def load_model(path):
-    '''
+    """
     Loads trained model.
-
     :param path: str, if it is 'eng_50', 'eng_100', 'eng_150', 'eng_200' or 'eng_300' then loads one of default models, else loads model from `path`.
-
     :return c2v_model: Chars2Vec object, trained model.
-    '''
+    """
 
-    if path in ['eng_50', 'eng_100', 'eng_150', 'eng_200', 'eng_300']:
-        path_to_model = os.path.dirname(os.path.abspath(__file__)) + '/trained_models/' + path
+    if path in ["eng_50", "eng_100", "eng_150", "eng_200", "eng_300"]:
+        path_to_model = (
+            os.path.dirname(os.path.abspath(__file__)) + "/trained_models/" + path
+        )
     else:
         path_to_model = path
 
-    with open(path_to_model + '/model.pkl', 'rb') as f:
+    with open(path_to_model + "/model.pkl", "rb") as f:
         structure = pickle.load(f)
         emb_dim, char_to_ix = structure[0], structure[1]
 
     c2v_model = Chars2Vec(emb_dim, char_to_ix)
-    c2v_model.embedding_model.load_weights(path_to_model + '/weights.h5')
-    c2v_model.embedding_model.compile(optimizer='adam', loss='mae')
+    c2v_model.embedding_model.load_weights(path_to_model + "/weights.h5")
+    c2v_model.embedding_model.compile(optimizer="adam", loss="mae")
 
     return c2v_model
 
 
-def train_model(emb_dim, X_train, y_train, model_chars,
-                max_epochs=200, patience=10, validation_split=0.05, batch_size=64):
-    '''
+def train_model(
+    emb_dim,
+    X_train,
+    y_train,
+    model_chars,
+    max_epochs=200,
+    patience=10,
+    validation_split=0.05,
+    batch_size=64,
+):
+    """
     Creates and trains chars2vec model using given training data.
-
     :param emb_dim: int, dimension of embeddings.
     :param X_train: list or numpy.ndarray of word pairs.
     :param y_train: list or numpy.ndarray of target values that describe the proximity of words.
@@ -217,23 +212,20 @@ def train_model(emb_dim, X_train, y_train, model_chars,
     :param patience: parameter 'patience' of callback in keras model.
     :param validation_split: parameter 'validation_split' of keras model.
     :param batch_size: parameter 'batch_size' of keras model.
-
     :return c2v_model: Chars2Vec object, trained model.
-    '''
+    """
 
-    if not isinstance(X_train, list) and not isinstance(X_train, np.ndarray):
-        raise TypeError("parameter 'X_train' must be a list or numpy.ndarray")\
-
-    if not isinstance(y_train, list) and not isinstance(y_train, np.ndarray):
+    if not isinstance(X_train, (list, np.ndarray)):
+        raise TypeError("parameter 'X_train' must be a list or numpy.ndarray")
+    if not isinstance(y_train, (list, np.ndarray)):
         raise TypeError("parameter 'y_train' must be a list or numpy.ndarray")
-
-    if not isinstance(model_chars, list) and not isinstance(model_chars, np.ndarray):
+    if not isinstance(model_chars, (list, np.ndarray)):
         raise TypeError("parameter 'model_chars' must be a list or numpy.ndarray")
 
     char_to_ix = {ch: i for i, ch in enumerate(model_chars)}
    c2v_model = Chars2Vec(emb_dim, char_to_ix)
 
-    targets = [float(el) for el in y_train]
+    targets = np.array(y_train)
 
     c2v_model.fit(X_train, targets, max_epochs, patience, validation_split, batch_size)
 
     return c2v_model
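Note: the model.py refactor above keeps the public behaviour intact while moving graph construction into _get_embedding_model/_get_model and deduplicating the one-hot character encoding in _create_word_embedding. For reference, a minimal usage sketch of the API re-exported in __init__.py (it assumes the bundled 'eng_50' pretrained weights are present under trained_models/, as load_model's docstring describes):

    import chars2vec

    # Load one of the bundled pretrained English models (embedding dimension 50).
    c2v_model = chars2vec.load_model('eng_50')

    # Similarly spelled words should map to nearby vectors; repeated words
    # are served from the model's internal cache.
    word_vectors = c2v_model.vectorize_words(['word', 'wrod', 'word!'])
    print(word_vectors.shape)  # expected: (3, 50)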
diff --git a/setup.py b/setup.py
index 63f3c0b..65ae0e0 100644
--- a/setup.py
+++ b/setup.py
@@ -1,37 +1,31 @@
-import sys
-import subprocess
-PY_VER = sys.version[0]
-subprocess.call(["pip{:} install -r requirements.txt".format(PY_VER)], shell=True)
-
 from setuptools import setup
+
+with open("README.md") as readme_file:
+    readme = readme_file.read()
+
+with open("requirements.txt") as f:
+    install_requires = f.read().splitlines()
+
+
 setup(
-    name='chars2vec',
-    version='0.1.7',
-    author='Vladimir Chikin',
-    author_email='v4@intuition.engineering',
-    packages=['chars2vec'],
+    name="chars2vec",
+    version="0.1.8",
+    author="Vladimir Chikin",
+    author_email="v4@intuition.engineering",
+    packages=["chars2vec"],
     include_package_data=True,
-    package_data={'chars2vec': ['trained_models/*']},
-    description='Character-based word embeddings model based on RNN',
-    maintainer='Intuition',
-    maintainer_email='dev@intuition.engineering',
-    url='https://github.com/IntuitionEngineeringTeam/chars2vec',
-    download_url='https://github.com/IntuitionEngineeringTeam/chars2vec/archive/master.zip',
-    license='Apache License 2.0',
-    long_description='Chars2vec library could be very useful if you are dealing with the texts \
-        containing abbreviations, slang, typos, or some other specific textual dataset. \
-        Chars2vec language model is based on the symbolic representation of words – \
-        the model maps each word to a vector of a fixed length. \
-        These vector representations are obtained with a custom neural netowrk while \
-        the latter is being trained on pairs of similar and non-similar words. \
-        This custom neural net includes LSTM, reading sequences of characters in words, as its part. \
-        The model maps similarly written words to proximal vectors. \
-        This approach enables creation of an embedding in vector space for any sequence of characters. \
-        Chars2vec models does not keep any dictionary of embeddings, \
-        but generates embedding vectors inplace using pretrained model. \
-        There are pretrained models of dimensions 50, 100, 150, 200 and 300 for the English language. \
-        The library provides convenient user API to train a model for an arbitrary set of characters.',
-    classifiers=['Programming Language :: Python :: 2.7',
-                 'Programming Language :: Python :: 3']
+    package_data={"chars2vec": ["trained_models/*"]},
+    description="Character-based word embeddings model based on RNN",
+    maintainer="Intuition",
+    maintainer_email="dev@intuition.engineering",
+    url="https://github.com/IntuitionEngineeringTeam/chars2vec",
+    download_url="https://github.com/IntuitionEngineeringTeam/chars2vec/archive/master.zip",
+    license="Apache License 2.0",
+    long_description=readme,
+    classifiers=[
+        "Programming Language :: Python :: 2.7",
+        "Programming Language :: Python :: 3",
+    ],
+    install_requires=install_requires,
 )
\ No newline at end of file
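For completeness, a sketch of training and saving a custom model through the refactored train_model/save_model. The word pairs, targets, and output path are hypothetical toy values; a realistic run needs enough pairs for validation_split=0.05 to yield a non-empty validation set, since EarlyStopping monitors 'val_loss':

    import chars2vec

    # Hypothetical toy data: each pair gets a target in [0, 1],
    # 0 for similarly spelled words, 1 for dissimilar ones.
    X_train = [('color', 'colour'), ('color', 'piano'),
               ('night', 'nite'), ('night', 'apple')]
    y_train = [0, 1, 0, 1]

    # Characters the model should recognize; any other character
    # is encoded as an all-zero vector by _create_word_embedding.
    model_chars = list('abcdefghijklmnopqrstuvwxyz')

    c2v_model = chars2vec.train_model(emb_dim=50, X_train=X_train,
                                      y_train=y_train, model_chars=model_chars)
    chars2vec.save_model(c2v_model, 'path/to/custom_model')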