From 6837323677beadca172a4edcdc49e93fea640508 Mon Sep 17 00:00:00 2001
From: Zax
Date: Thu, 31 Dec 2020 10:01:18 -0600
Subject: [PATCH 1/5] hotfix: install requirements via pip install

---
 setup.py | 45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/setup.py b/setup.py
index 63f3c0b..90cbdd1 100644
--- a/setup.py
+++ b/setup.py
@@ -1,25 +1,25 @@
-import sys
-import subprocess
-PY_VER = sys.version[0]
-subprocess.call(["pip{:} install -r requirements.txt".format(PY_VER)], shell=True)
-
 from setuptools import setup
+
+with open("requirements.txt") as f:
+    install_requires = f.read().splitlines()
+
+
 setup(
-    name='chars2vec',
-    version='0.1.7',
-    author='Vladimir Chikin',
-    author_email='v4@intuition.engineering',
-    packages=['chars2vec'],
+    name="chars2vec",
+    version="0.1.8",
+    author="Vladimir Chikin",
+    author_email="v4@intuition.engineering",
+    packages=["chars2vec"],
     include_package_data=True,
-    package_data={'chars2vec': ['trained_models/*']},
-    description='Character-based word embeddings model based on RNN',
-    maintainer='Intuition',
-    maintainer_email='dev@intuition.engineering',
-    url='https://github.com/IntuitionEngineeringTeam/chars2vec',
-    download_url='https://github.com/IntuitionEngineeringTeam/chars2vec/archive/master.zip',
-    license='Apache License 2.0',
-    long_description='Chars2vec library could be very useful if you are dealing with the texts \
+    package_data={"chars2vec": ["trained_models/*"]},
+    description="Character-based word embeddings model based on RNN",
+    maintainer="Intuition",
+    maintainer_email="dev@intuition.engineering",
+    url="https://github.com/IntuitionEngineeringTeam/chars2vec",
+    download_url="https://github.com/IntuitionEngineeringTeam/chars2vec/archive/master.zip",
+    license="Apache License 2.0",
+    long_description="Chars2vec library could be very useful if you are dealing with the texts \
     containing abbreviations, slang, typos, or some other specific textual dataset. \
     Chars2vec language model is based on the symbolic representation of words – \
     the model maps each word to a vector of a fixed length. \
     These vector representations are obtained with a custom neural netowrk while \
     the latter is being trained on pairs of similar and non-similar words. \
     This custom neural net includes LSTM, reading sequences of characters in words, as its part. \
     The model maps similarly written words to proximal vectors. \
     This approach enables creation of an embedding in vector space for any sequence of characters.\
@@ -31,7 +31,10 @@
     Chars2vec models does not keep any dictionary of embeddings, \
     but generates embedding vectors inplace using pretrained model. \
     There are pretrained models of dimensions 50, 100, 150, 200 and 300 for the English language.\
-    The library provides convenient user API to train a model for an arbitrary set of characters.',
-    classifiers=['Programming Language :: Python :: 2.7',
-                 'Programming Language :: Python :: 3']
+    The library provides convenient user API to train a model for an arbitrary set of characters.",
+    classifiers=[
+        "Programming Language :: Python :: 2.7",
+        "Programming Language :: Python :: 3",
+    ],
+    install_requires=install_requires,
 )
\ No newline at end of file

From 75c2c9723be7ab7709ff9b8d6e9fe3a3c4413631 Mon Sep 17 00:00:00 2001
From: Zax
Date: Thu, 31 Dec 2020 10:02:29 -0600
Subject: [PATCH 2/5] import readme to setup.py as long_description

---
 setup.py | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/setup.py b/setup.py
index 90cbdd1..65ae0e0 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,9 @@
 from setuptools import setup
 
+with open("README.md") as readme_file:
+    readme = readme_file.read()
+
 with open("requirements.txt") as f:
     install_requires = f.read().splitlines()
 
@@ -19,19 +22,7 @@
     url="https://github.com/IntuitionEngineeringTeam/chars2vec",
     download_url="https://github.com/IntuitionEngineeringTeam/chars2vec/archive/master.zip",
     license="Apache License 2.0",
-    long_description="Chars2vec library could be very useful if you are dealing with the texts \
-    containing abbreviations, slang, typos, or some other specific textual dataset. \
-    Chars2vec language model is based on the symbolic representation of words – \
-    the model maps each word to a vector of a fixed length. \
-    These vector representations are obtained with a custom neural netowrk while \
-    the latter is being trained on pairs of similar and non-similar words. \
-    This custom neural net includes LSTM, reading sequences of characters in words, as its part. \
-    The model maps similarly written words to proximal vectors. \
-    This approach enables creation of an embedding in vector space for any sequence of characters.\
-    Chars2vec models does not keep any dictionary of embeddings, \
-    but generates embedding vectors inplace using pretrained model. \
-    There are pretrained models of dimensions 50, 100, 150, 200 and 300 for the English language.\
-    The library provides convenient user API to train a model for an arbitrary set of characters.",
+    long_description=readme,
     classifiers=[
         "Programming Language :: Python :: 2.7",

From 4694d756857ad78e5178b43c3c4de63825e20a75 Mon Sep 17 00:00:00 2001
From: Zax
Date: Thu, 31 Dec 2020 10:05:15 -0600
Subject: [PATCH 3/5] hotfix custom model training; closes PR #8

---
 chars2vec/model.py | 90 +++++++++++++++++++++++++++-------------------
 1 file changed, 54 insertions(+), 36 deletions(-)

diff --git a/chars2vec/model.py b/chars2vec/model.py
index a98215e..19d265a 100644
--- a/chars2vec/model.py
+++ b/chars2vec/model.py
@@ -5,14 +5,13 @@
 
 
 class Chars2Vec:
-
     def __init__(self, emb_dim, char_to_ix):
-        '''
+        """
         Creates chars2vec model.
 
         :param emb_dim: int, dimension of embeddings.
         :param char_to_ix: dict, keys are characters, values are sequence numbers of characters.
-        '''
+        """
 
         if not isinstance(emb_dim, int) or emb_dim < 1:
             raise TypeError("parameter 'emb_dim' must be a positive integer")
@@ -40,15 +39,17 @@ def __init__(self, emb_dim, char_to_ix):
         embedding_2 = self.embedding_model(model_input_2)
         x = keras.layers.Subtract()([embedding_1, embedding_2])
         x = keras.layers.Dot(1)([x, x])
-        model_output = keras.layers.Dense(1, activation='sigmoid')(x)
-
-        self.model = keras.models.Model(inputs=[model_input_1, model_input_2], outputs=model_output)
-        self.model.compile(optimizer='adam', loss='mae')
+        model_output = keras.layers.Dense(1, activation="sigmoid")(x)
 
+        self.model = keras.models.Model(
+            inputs=[model_input_1, model_input_2], outputs=model_output
+        )
+        self.model.compile(optimizer="adam", loss="mae")
 
-    def fit(self, word_pairs, targets,
-            max_epochs, patience, validation_split, batch_size):
-        '''
+    def fit(
+        self, word_pairs, targets, max_epochs, patience, validation_split, batch_size
+    ):
+        """
         Fits model.
 
         :param word_pairs: list or numpy.ndarray of word pairs.
@@ -57,7 +58,7 @@ def fit(self, word_pairs, targets,
         :param patience: parameter 'patience' of callback in keras model.
         :param validation_split: parameter 'validation_split' of keras model.
         :param batch_size: parameter 'batch_size' of keras model.
-        '''
+        """
 
         if not isinstance(word_pairs, list) and not isinstance(word_pairs, np.ndarray):
             raise TypeError("parameters 'word_pairs' must be a list or numpy.ndarray")
@@ -104,20 +105,26 @@ def fit(self, word_pairs, targets,
         x_1_pad_seq = keras.preprocessing.sequence.pad_sequences(x_1)
         x_2_pad_seq = keras.preprocessing.sequence.pad_sequences(x_2)
 
-        self.model.fit([x_1_pad_seq, x_2_pad_seq], targets,
-                       batch_size=batch_size, epochs=max_epochs,
-                       validation_split=validation_split,
-                       callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience)])
+        self.model.fit(
+            [x_1_pad_seq, x_2_pad_seq],
+            targets,
+            batch_size=batch_size,
+            epochs=max_epochs,
+            validation_split=validation_split,
+            callbacks=[
+                keras.callbacks.EarlyStopping(monitor="val_loss", patience=patience)
+            ],
+        )
 
     def vectorize_words(self, words, maxlen_padseq=None):
-        '''
+        """
         Returns embeddings for list of words. Uses cache of word embeddings to vectorization speed up.
 
         :param words: list or numpy.ndarray of strings.
         :param maxlen_padseq: parameter 'maxlen' for keras pad_sequences transform.
 
         :return word_vectors: numpy.ndarray, word embeddings.
-        '''
+        """
 
         if not isinstance(words, list) and not isinstance(words, np.ndarray):
             raise TypeError("parameter 'words' must be a list or numpy.ndarray")
@@ -149,7 +156,9 @@ def vectorize_words(self, words, maxlen_padseq=None):
 
             list_of_embeddings.append(np.array(current_embedding))
 
-        embeddings_pad_seq = keras.preprocessing.sequence.pad_sequences(list_of_embeddings, maxlen=maxlen_padseq)
+        embeddings_pad_seq = keras.preprocessing.sequence.pad_sequences(
+            list_of_embeddings, maxlen=maxlen_padseq
+        )
         new_words_vectors = self.embedding_model.predict([embeddings_pad_seq])
 
         for i in range(len(new_words)):
@@ -161,52 +170,62 @@ def vectorize_words(self, words, maxlen_padseq=None):
 
 
 def save_model(c2v_model, path_to_model):
-    '''
+    """
     Saves trained model to directory.
 
     :param c2v_model: Chars2Vec object, trained model.
     :param path_to_model: str, path to save model.
-    '''
+    """
 
     if not os.path.exists(path_to_model):
         os.makedirs(path_to_model)
 
-    c2v_model.embedding_model.save_weights(path_to_model + '/weights.h5')
+    c2v_model.embedding_model.save_weights(path_to_model + "/weights.h5")
 
-    with open(path_to_model + '/model.pkl', 'wb') as f:
+    with open(path_to_model + "/model.pkl", "wb") as f:
         pickle.dump([c2v_model.dim, c2v_model.char_to_ix], f, protocol=2)
 
 
 def load_model(path):
-    '''
+    """
     Loads trained model.
 
     :param path: str, if it is 'eng_50', 'eng_100', 'eng_150', 'eng_200' or 'eng_300'
         then loads one of default models, else loads model from `path`.
 
     :return c2v_model: Chars2Vec object, trained model.
-    '''
+    """
 
-    if path in ['eng_50', 'eng_100', 'eng_150', 'eng_200', 'eng_300']:
-        path_to_model = os.path.dirname(os.path.abspath(__file__)) + '/trained_models/' + path
+    if path in ["eng_50", "eng_100", "eng_150", "eng_200", "eng_300"]:
+        path_to_model = (
+            os.path.dirname(os.path.abspath(__file__)) + "/trained_models/" + path
+        )
     else:
         path_to_model = path
 
-    with open(path_to_model + '/model.pkl', 'rb') as f:
+    with open(path_to_model + "/model.pkl", "rb") as f:
         structure = pickle.load(f)
         emb_dim, char_to_ix = structure[0], structure[1]
 
     c2v_model = Chars2Vec(emb_dim, char_to_ix)
-    c2v_model.embedding_model.load_weights(path_to_model + '/weights.h5')
-    c2v_model.embedding_model.compile(optimizer='adam', loss='mae')
+    c2v_model.embedding_model.load_weights(path_to_model + "/weights.h5")
+    c2v_model.embedding_model.compile(optimizer="adam", loss="mae")
 
     return c2v_model
 
 
-def train_model(emb_dim, X_train, y_train, model_chars,
-                max_epochs=200, patience=10, validation_split=0.05, batch_size=64):
-    '''
+def train_model(
+    emb_dim,
+    X_train,
+    y_train,
+    model_chars,
+    max_epochs=200,
+    patience=10,
+    validation_split=0.05,
+    batch_size=64,
+):
+    """
     Creates and trains chars2vec model using given training data.
 
     :param emb_dim: int, dimension of embeddings.
@@ -219,11 +238,10 @@ def train_model(emb_dim, X_train, y_train, model_chars,
     :param batch_size: parameter 'batch_size' of keras model.
 
     :return c2v_model: Chars2Vec object, trained model.
-    '''
+    """
 
     if not isinstance(X_train, list) and not isinstance(X_train, np.ndarray):
-        raise TypeError("parameter 'X_train' must be a list or numpy.ndarray")\
-
+        raise TypeError("parameter 'X_train' must be a list or numpy.ndarray")
 
     if not isinstance(y_train, list) and not isinstance(y_train, np.ndarray):
         raise TypeError("parameter 'y_train' must be a list or numpy.ndarray")
@@ -233,7 +251,7 @@ def train_model(emb_dim, X_train, y_train, model_chars,
 
     char_to_ix = {ch: i for i, ch in enumerate(model_chars)}
     c2v_model = Chars2Vec(emb_dim, char_to_ix)
-    targets = [float(el) for el in y_train]
+    targets = np.array(y_train)
     c2v_model.fit(X_train, targets, max_epochs, patience, validation_split, batch_size)
 
     return c2v_model

From 0ed7a2633fb3af44d36c1ac537e892cf65a85f54 Mon Sep 17 00:00:00 2001
From: Zax
Date: Thu, 31 Dec 2020 10:07:04 -0600
Subject: [PATCH 4/5] hotfix namespace collisions w/ keras/tensorflow

---
 chars2vec/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chars2vec/__init__.py b/chars2vec/__init__.py
index 71a1d85..70368ea 100644
--- a/chars2vec/__init__.py
+++ b/chars2vec/__init__.py
@@ -1 +1 @@
-from .model import *
\ No newline at end of file
+from .model import Chars2Vec, load_model, save_model, train_model
\ No newline at end of file

From 9a5005f05c945d85c953d52ea2406ab2666fcb99 Mon Sep 17 00:00:00 2001
From: Zax
Date: Thu, 31 Dec 2020 10:07:57 -0600
Subject: [PATCH 5/5] refactor code for simplicity and to keep it DRY

---
 chars2vec/model.py | 120 ++++++++++++++++++---------------------------
 1 file changed, 47 insertions(+), 73 deletions(-)

diff --git a/chars2vec/model.py b/chars2vec/model.py
index 19d265a..0ed0127 100644
--- a/chars2vec/model.py
+++ b/chars2vec/model.py
@@ -1,14 +1,14 @@
-import numpy as np
+import os
 import pickle
+
 import keras
-import os
+import numpy as np
 
 
 class Chars2Vec:
     def __init__(self, emb_dim, char_to_ix):
         """
         Creates chars2vec model.
-
         :param emb_dim: int, dimension of embeddings.
         :param char_to_ix: dict, keys are characters, values are sequence numbers of characters.
""" @@ -24,14 +24,19 @@ def __init__(self, emb_dim, char_to_ix): self.vocab_size = len(self.char_to_ix) self.dim = emb_dim self.cache = {} + self.embedding_model = self._get_embedding_model() + self.model = self._get_model() + self.model.compile(optimizer="adam", loss="mae") - lstm_input = keras.layers.Input(shape=(None, self.vocab_size)) + def _get_embedding_model(self): + inputs = keras.layers.Input(shape=(None, self.vocab_size)) - x = keras.layers.LSTM(emb_dim, return_sequences=True)(lstm_input) - x = keras.layers.LSTM(emb_dim)(x) + outputs = keras.layers.LSTM(self.dim, return_sequences=True)(inputs) + outputs = keras.layers.LSTM(self.dim)(outputs) - self.embedding_model = keras.models.Model(inputs=[lstm_input], outputs=x) + return keras.models.Model(inputs=[inputs], outputs=outputs) + def _get_model(self): model_input_1 = keras.layers.Input(shape=(None, self.vocab_size)) model_input_2 = keras.layers.Input(shape=(None, self.vocab_size)) @@ -41,17 +46,24 @@ def __init__(self, emb_dim, char_to_ix): x = keras.layers.Dot(1)([x, x]) model_output = keras.layers.Dense(1, activation="sigmoid")(x) - self.model = keras.models.Model( + return keras.models.Model( inputs=[model_input_1, model_input_2], outputs=model_output ) - self.model.compile(optimizer="adam", loss="mae") + + def _create_word_embedding(self, word): + word_embedding = [] + for char in word.lower(): + char_embedding = np.zeros(self.vocab_size) + if char in self.char_to_ix: + char_embedding[self.char_to_ix[char]] = 1 + word_embedding.append(char_embedding) + return word_embedding def fit( self, word_pairs, targets, max_epochs, patience, validation_split, batch_size ): """ Fits model. - :param word_pairs: list or numpy.ndarray of word pairs. :param targets: list or numpy.ndarray of targets. :param max_epochs: parameter 'epochs' of keras model. @@ -60,47 +72,31 @@ def fit( :param batch_size: parameter 'batch_size' of keras model. """ - if not isinstance(word_pairs, list) and not isinstance(word_pairs, np.ndarray): + if not isinstance(word_pairs, (list, np.ndarray)): raise TypeError("parameters 'word_pairs' must be a list or numpy.ndarray") - if not isinstance(targets, list) and not isinstance(targets, np.ndarray): + if not isinstance(targets, (list, np.ndarray)): raise TypeError("parameters 'targets' must be a list or numpy.ndarray") x_1, x_2 = [], [] + for word_pair in word_pairs: + if len(word_pair) != 2: + raise ValueError( + "`word_pairs` contains a 'pair' with more than two words." 
+                )
 
+            if not all(isinstance(word, str) for word in word_pair):
+                raise TypeError("Both words must be strings.")
 
+            first_word, second_word = word_pair
 
+            first_word_embedding = self._create_word_embedding(word=first_word.lower())
+            x_1.append(np.array(first_word_embedding))
 
+            second_word_embedding = self._create_word_embedding(
+                word=second_word.lower()
+            )
+            x_2.append(np.array(second_word_embedding))
 
         x_1_pad_seq = keras.preprocessing.sequence.pad_sequences(x_1)
         x_2_pad_seq = keras.preprocessing.sequence.pad_sequences(x_2)
@@ -119,42 +115,26 @@ def fit(
     def vectorize_words(self, words, maxlen_padseq=None):
         """
         Returns embeddings for list of words. Uses cache of word embeddings to vectorization speed up.
-
         :param words: list or numpy.ndarray of strings.
         :param maxlen_padseq: parameter 'maxlen' for keras pad_sequences transform.
-
         :return word_vectors: numpy.ndarray, word embeddings.
         """
 
-        if not isinstance(words, list) and not isinstance(words, np.ndarray):
+        if not isinstance(words, (list, np.ndarray)):
             raise TypeError("parameter 'words' must be a list or numpy.ndarray")
 
         words = [w.lower() for w in words]
         unique_words = np.unique(words)
         new_words = [w for w in unique_words if w not in self.cache]
 
-        if len(new_words) > 0:
-
+        if new_words:
             list_of_embeddings = []
-
-            for current_word in new_words:
-
-                if not isinstance(current_word, str):
+            for word in new_words:
+                if not isinstance(word, str):
                     raise TypeError("word must be a string")
 
-                current_embedding = []
-
-                for t in range(len(current_word)):
-
-                    if current_word[t] in self.char_to_ix:
-                        x = np.zeros(self.vocab_size)
-                        x[self.char_to_ix[current_word[t]]] = 1
-                        current_embedding.append(x)
-
-                    else:
-                        current_embedding.append(np.zeros(self.vocab_size))
-
-                list_of_embeddings.append(np.array(current_embedding))
+                word_embedding = self._create_word_embedding(word=word.lower())
+                list_of_embeddings.append(np.array(word_embedding))
 
             embeddings_pad_seq = keras.preprocessing.sequence.pad_sequences(
                 list_of_embeddings, maxlen=maxlen_padseq
             )
             new_words_vectors = self.embedding_model.predict([embeddings_pad_seq])
 
             for i in range(len(new_words)):
@@ -172,7 +152,6 @@ def vectorize_words(self, words, maxlen_padseq=None):
 def save_model(c2v_model, path_to_model):
     """
     Saves trained model to directory.
-
     :param c2v_model: Chars2Vec object, trained model.
     :param path_to_model: str, path to save model.
     """
@@ -189,10 +168,8 @@ def save_model(c2v_model, path_to_model):
 def load_model(path):
     """
     Loads trained model.
-
     :param path: str, if it is 'eng_50', 'eng_100', 'eng_150', 'eng_200' or 'eng_300'
         then loads one of default models, else loads model from `path`.
-
     :return c2v_model: Chars2Vec object, trained model.
     """
@@ -227,7 +204,6 @@ def train_model(
     """
     Creates and trains chars2vec model using given training data.
-
     :param emb_dim: int, dimension of embeddings.
     :param X_train: list or numpy.ndarray of word pairs.
     :param y_train: list or numpy.ndarray of target values that describe the proximity of words.
     :param model_chars: list or numpy.ndarray of basic chars in model.
     :param max_epochs: parameter 'epochs' of keras model.
     :param patience: parameter 'patience' of callback in keras model.
     :param validation_split: parameter 'validation_split' of keras model.
     :param batch_size: parameter 'batch_size' of keras model.
-
     :return c2v_model: Chars2Vec object, trained model.
     """
 
-    if not isinstance(X_train, list) and not isinstance(X_train, np.ndarray):
+    if not isinstance(X_train, (list, np.ndarray)):
         raise TypeError("parameter 'X_train' must be a list or numpy.ndarray")
 
-    if not isinstance(y_train, list) and not isinstance(y_train, np.ndarray):
+    if not isinstance(y_train, (list, np.ndarray)):
         raise TypeError("parameter 'y_train' must be a list or numpy.ndarray")
-
-    if not isinstance(model_chars, list) and not isinstance(model_chars, np.ndarray):
+    if not isinstance(model_chars, (list, np.ndarray)):
         raise TypeError("parameter 'model_chars' must be a list or numpy.ndarray")
 
     char_to_ix = {ch: i for i, ch in enumerate(model_chars)}
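
-- 
A minimal usage sketch of the public API this series settles on (train_model,
save_model, load_model, vectorize_words, all re-exported by patch 4). The
words, labels, character set, and save path below are illustrative only, not
taken from the repo; the 0/1 labeling (0 for a similar pair, 1 for a
dissimilar pair) is an assumed convention based on the distance-style
similarity head built in model.py.

    import chars2vec

    # Train a tiny custom model; real training needs far more pairs.
    X_train = [("look", "lok"), ("look", "book"), ("sweet", "sweat")]
    y_train = [0, 1, 0]  # assumed convention: 0 = similar, 1 = dissimilar
    model_chars = ["a", "b", "e", "k", "l", "o", "s", "t", "w"]

    c2v_model = chars2vec.train_model(50, X_train, y_train, model_chars)
    chars2vec.save_model(c2v_model, "path/to/model")  # hypothetical path

    # Or load a bundled English model (eng_50 ... eng_300) and embed words;
    # similarly spelled words should land on nearby vectors.
    c2v_model = chars2vec.load_model("eng_50")
    embeddings = c2v_model.vectorize_words(["natural", "natuaral"])
    print(embeddings.shape)  # (2, 50)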