From 6837323677beadca172a4edcdc49e93fea640508 Mon Sep 17 00:00:00 2001
From: Zax
Date: Thu, 31 Dec 2020 10:01:18 -0600
Subject: [PATCH 1/5] hotfix: install requirements via pip install

---
 setup.py | 45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/setup.py b/setup.py
index 63f3c0b..90cbdd1 100644
--- a/setup.py
+++ b/setup.py
@@ -1,25 +1,25 @@
-import sys
-import subprocess
-PY_VER = sys.version[0]
-subprocess.call(["pip{:} install -r requirements.txt".format(PY_VER)], shell=True)
-
 from setuptools import setup
+
+with open("requirements.txt") as f:
+    install_requires = f.read().splitlines()
+
+
 setup(
-    name='chars2vec',
-    version='0.1.7',
-    author='Vladimir Chikin',
-    author_email='v4@intuition.engineering',
-    packages=['chars2vec'],
+    name="chars2vec",
+    version="0.1.8",
+    author="Vladimir Chikin",
+    author_email="v4@intuition.engineering",
+    packages=["chars2vec"],
     include_package_data=True,
-    package_data={'chars2vec': ['trained_models/*']},
-    description='Character-based word embeddings model based on RNN',
-    maintainer='Intuition',
-    maintainer_email='dev@intuition.engineering',
-    url='https://github.com/IntuitionEngineeringTeam/chars2vec',
-    download_url='https://github.com/IntuitionEngineeringTeam/chars2vec/archive/master.zip',
-    license='Apache License 2.0',
-    long_description='Chars2vec library could be very useful if you are dealing with the texts \
+    package_data={"chars2vec": ["trained_models/*"]},
+    description="Character-based word embeddings model based on RNN",
+    maintainer="Intuition",
+    maintainer_email="dev@intuition.engineering",
+    url="https://github.com/IntuitionEngineeringTeam/chars2vec",
+    download_url="https://github.com/IntuitionEngineeringTeam/chars2vec/archive/master.zip",
+    license="Apache License 2.0",
+    long_description="Chars2vec library could be very useful if you are dealing with the texts \
     containing abbreviations, slang, typos, or some other specific textual dataset. \
     Chars2vec language model is based on the symbolic representation of words – \
     the model maps each word to a vector of a fixed length. \
     These vector representations are obtained with a custom neural netowrk while \
     the latter is being trained on pairs of similar and non-similar words. \
     This custom neural net includes LSTM, reading sequences of characters in words, as its part. \
     The model maps similarly written words to proximal vectors. \
     This approach enables creation of an embedding in vector space for any sequence of characters.\
@@ -31,7 +31,10 @@
     Chars2vec models does not keep any dictionary of embeddings, \
     but generates embedding vectors inplace using pretrained model. \
     There are pretrained models of dimensions 50, 100, 150, 200 and 300 for the English language.\
-    The library provides convenient user API to train a model for an arbitrary set of characters.',
-    classifiers=['Programming Language :: Python :: 2.7',
-                 'Programming Language :: Python :: 3']
+    The library provides convenient user API to train a model for an arbitrary set of characters.",
+    classifiers=[
+        "Programming Language :: Python :: 2.7",
+        "Programming Language :: Python :: 3",
+    ],
+    install_requires=install_requires,
 )
\ No newline at end of file

From 75c2c9723be7ab7709ff9b8d6e9fe3a3c4413631 Mon Sep 17 00:00:00 2001
From: Zax
Date: Thu, 31 Dec 2020 10:02:29 -0600
Subject: [PATCH 2/5] import readme to setup.py as long_description

---
 setup.py | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/setup.py b/setup.py
index 90cbdd1..65ae0e0 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,9 @@
 from setuptools import setup
 
+with open("README.md") as readme_file:
+    readme = readme_file.read()
+
 with open("requirements.txt") as f:
     install_requires = f.read().splitlines()
 
@@ -19,19 +22,7 @@
     url="https://github.com/IntuitionEngineeringTeam/chars2vec",
     download_url="https://github.com/IntuitionEngineeringTeam/chars2vec/archive/master.zip",
     license="Apache License 2.0",
-    long_description="Chars2vec library could be very useful if you are dealing with the texts \
-    containing abbreviations, slang, typos, or some other specific textual dataset. \
-    Chars2vec language model is based on the symbolic representation of words – \
-    the model maps each word to a vector of a fixed length. \
-    These vector representations are obtained with a custom neural netowrk while \
-    the latter is being trained on pairs of similar and non-similar words. \
-    This custom neural net includes LSTM, reading sequences of characters in words, as its part. \
-    The model maps similarly written words to proximal vectors. \
-    This approach enables creation of an embedding in vector space for any sequence of characters.\
-    Chars2vec models does not keep any dictionary of embeddings, \
-    but generates embedding vectors inplace using pretrained model. \
-    There are pretrained models of dimensions 50, 100, 150, 200 and 300 for the English language.\
-    The library provides convenient user API to train a model for an arbitrary set of characters.",
+    long_description=readme,
     classifiers=[
         "Programming Language :: Python :: 2.7",

From 4694d756857ad78e5178b43c3c4de63825e20a75 Mon Sep 17 00:00:00 2001
From: Zax
Date: Thu, 31 Dec 2020 10:05:15 -0600
Subject: [PATCH 3/5] hotfix custom model training; closes PR #8

---
 chars2vec/model.py | 90 +++++++++++++++++++++++++++-------------------
 1 file changed, 54 insertions(+), 36 deletions(-)

diff --git a/chars2vec/model.py b/chars2vec/model.py
index a98215e..19d265a 100644
--- a/chars2vec/model.py
+++ b/chars2vec/model.py
@@ -5,14 +5,13 @@
 
 
 class Chars2Vec:
-
     def __init__(self, emb_dim, char_to_ix):
-        '''
+        """
         Creates chars2vec model.
 
         :param emb_dim: int, dimension of embeddings.
         :param char_to_ix: dict, keys are characters, values are sequence numbers of characters.
-        '''
+        """
 
         if not isinstance(emb_dim, int) or emb_dim < 1:
             raise TypeError("parameter 'emb_dim' must be a positive integer")
@@ -40,15 +39,17 @@ def __init__(self, emb_dim, char_to_ix):
         embedding_2 = self.embedding_model(model_input_2)
         x = keras.layers.Subtract()([embedding_1, embedding_2])
         x = keras.layers.Dot(1)([x, x])
-        model_output = keras.layers.Dense(1, activation='sigmoid')(x)
-
-        self.model = keras.models.Model(inputs=[model_input_1, model_input_2], outputs=model_output)
-        self.model.compile(optimizer='adam', loss='mae')
+        model_output = keras.layers.Dense(1, activation="sigmoid")(x)
 
+        self.model = keras.models.Model(
+            inputs=[model_input_1, model_input_2], outputs=model_output
+        )
+        self.model.compile(optimizer="adam", loss="mae")
 
-    def fit(self, word_pairs, targets,
-            max_epochs, patience, validation_split, batch_size):
-        '''
+    def fit(
+        self, word_pairs, targets, max_epochs, patience, validation_split, batch_size
+    ):
+        """
         Fits model.
 
         :param word_pairs: list or numpy.ndarray of word pairs.
@@ -57,7 +58,7 @@ def fit(self, word_pairs, targets,
         :param patience: parameter 'patience' of callback in keras model.
         :param validation_split: parameter 'validation_split' of keras model.
         :param batch_size: parameter 'batch_size' of keras model.
-        '''
+        """
 
         if not isinstance(word_pairs, list) and not isinstance(word_pairs, np.ndarray):
             raise TypeError("parameters 'word_pairs' must be a list or numpy.ndarray")
@@ -104,20 +105,26 @@ def fit(self, word_pairs, targets,
         x_1_pad_seq = keras.preprocessing.sequence.pad_sequences(x_1)
         x_2_pad_seq = keras.preprocessing.sequence.pad_sequences(x_2)
 
-        self.model.fit([x_1_pad_seq, x_2_pad_seq], targets,
-                       batch_size=batch_size, epochs=max_epochs,
-                       validation_split=validation_split,
-                       callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience)])
+        self.model.fit(
+            [x_1_pad_seq, x_2_pad_seq],
+            targets,
+            batch_size=batch_size,
+            epochs=max_epochs,
+            validation_split=validation_split,
+            callbacks=[
+                keras.callbacks.EarlyStopping(monitor="val_loss", patience=patience)
+            ],
+        )
 
     def vectorize_words(self, words, maxlen_padseq=None):
-        '''
+        """
         Returns embeddings for list of words. Uses cache of word embeddings to vectorization speed up.
 
         :param words: list or numpy.ndarray of strings.
         :param maxlen_padseq: parameter 'maxlen' for keras pad_sequences transform.
 
         :return word_vectors: numpy.ndarray, word embeddings.
-        '''
+        """
 
         if not isinstance(words, list) and not isinstance(words, np.ndarray):
             raise TypeError("parameter 'words' must be a list or numpy.ndarray")
@@ -149,7 +156,9 @@ def vectorize_words(self, words, maxlen_padseq=None):
 
             list_of_embeddings.append(np.array(current_embedding))
 
-        embeddings_pad_seq = keras.preprocessing.sequence.pad_sequences(list_of_embeddings, maxlen=maxlen_padseq)
+        embeddings_pad_seq = keras.preprocessing.sequence.pad_sequences(
+            list_of_embeddings, maxlen=maxlen_padseq
+        )
         new_words_vectors = self.embedding_model.predict([embeddings_pad_seq])
 
         for i in range(len(new_words)):
@@ -161,52 +170,62 @@ def vectorize_words(self, words, maxlen_padseq=None):
 
 
 def save_model(c2v_model, path_to_model):
-    '''
+    """
     Saves trained model to directory.
 
     :param c2v_model: Chars2Vec object, trained model.
     :param path_to_model: str, path to save model.
-    '''
+    """
 
     if not os.path.exists(path_to_model):
         os.makedirs(path_to_model)
 
-    c2v_model.embedding_model.save_weights(path_to_model + '/weights.h5')
+    c2v_model.embedding_model.save_weights(path_to_model + "/weights.h5")
 
-    with open(path_to_model + '/model.pkl', 'wb') as f:
+    with open(path_to_model + "/model.pkl", "wb") as f:
         pickle.dump([c2v_model.dim, c2v_model.char_to_ix], f, protocol=2)
 
 
 def load_model(path):
-    '''
+    """
     Loads trained model.
 
     :param path: str, if it is 'eng_50', 'eng_100', 'eng_150', 'eng_200' or 'eng_300'
         then loads one of default models, else loads model from `path`.
 
     :return c2v_model: Chars2Vec object, trained model.
-    '''
+    """
 
-    if path in ['eng_50', 'eng_100', 'eng_150', 'eng_200', 'eng_300']:
-        path_to_model = os.path.dirname(os.path.abspath(__file__)) + '/trained_models/' + path
+    if path in ["eng_50", "eng_100", "eng_150", "eng_200", "eng_300"]:
+        path_to_model = (
+            os.path.dirname(os.path.abspath(__file__)) + "/trained_models/" + path
+        )
     else:
         path_to_model = path
 
-    with open(path_to_model + '/model.pkl', 'rb') as f:
+    with open(path_to_model + "/model.pkl", "rb") as f:
         structure = pickle.load(f)
         emb_dim, char_to_ix = structure[0], structure[1]
 
     c2v_model = Chars2Vec(emb_dim, char_to_ix)
-    c2v_model.embedding_model.load_weights(path_to_model + '/weights.h5')
-    c2v_model.embedding_model.compile(optimizer='adam', loss='mae')
+    c2v_model.embedding_model.load_weights(path_to_model + "/weights.h5")
+    c2v_model.embedding_model.compile(optimizer="adam", loss="mae")
 
     return c2v_model
 
 
-def train_model(emb_dim, X_train, y_train, model_chars,
-                max_epochs=200, patience=10, validation_split=0.05, batch_size=64):
-    '''
+def train_model(
+    emb_dim,
+    X_train,
+    y_train,
+    model_chars,
+    max_epochs=200,
+    patience=10,
+    validation_split=0.05,
+    batch_size=64,
+):
+    """
     Creates and trains chars2vec model using given training data.
 
     :param emb_dim: int, dimension of embeddings.
@@ -219,11 +238,10 @@ def train_model(emb_dim, X_train, y_train, model_chars,
     :param batch_size: parameter 'batch_size' of keras model.
 
     :return c2v_model: Chars2Vec object, trained model.
-    '''
+    """
 
     if not isinstance(X_train, list) and not isinstance(X_train, np.ndarray):
-        raise TypeError("parameter 'X_train' must be a list or numpy.ndarray")\
-
+        raise TypeError("parameter 'X_train' must be a list or numpy.ndarray")
 
     if not isinstance(y_train, list) and not isinstance(y_train, np.ndarray):
         raise TypeError("parameter 'y_train' must be a list or numpy.ndarray")
@@ -233,7 +251,7 @@ def train_model(emb_dim, X_train, y_train, model_chars,
 
     char_to_ix = {ch: i for i, ch in enumerate(model_chars)}
     c2v_model = Chars2Vec(emb_dim, char_to_ix)
-    targets = [float(el) for el in y_train]
+    targets = np.array(y_train)
     c2v_model.fit(X_train, targets, max_epochs, patience, validation_split, batch_size)
 
     return c2v_model

From 0ed7a2633fb3af44d36c1ac537e892cf65a85f54 Mon Sep 17 00:00:00 2001
From: Zax
Date: Thu, 31 Dec 2020 10:07:04 -0600
Subject: [PATCH 4/5] hotfix namespace collisions w/ keras/tensorflow

---
 chars2vec/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chars2vec/__init__.py b/chars2vec/__init__.py
index 71a1d85..70368ea 100644
--- a/chars2vec/__init__.py
+++ b/chars2vec/__init__.py
@@ -1 +1 @@
-from .model import *
\ No newline at end of file
+from .model import Chars2Vec, load_model, save_model, train_model
\ No newline at end of file

From 9a5005f05c945d85c953d52ea2406ab2666fcb99 Mon Sep 17 00:00:00 2001
From: Zax
Date: Thu, 31 Dec 2020 10:07:57 -0600
Subject: [PATCH 5/5] refactor code for simplicity and to keep it DRY

---
 chars2vec/model.py | 120 ++++++++++++++++++---------------------------
 1 file changed, 47 insertions(+), 73 deletions(-)

diff --git a/chars2vec/model.py b/chars2vec/model.py
index 19d265a..0ed0127 100644
--- a/chars2vec/model.py
+++ b/chars2vec/model.py
@@ -1,14 +1,14 @@
-import numpy as np
+import os
 import pickle
+
 import keras
-import os
+import numpy as np
 
 
 class Chars2Vec:
     def __init__(self, emb_dim, char_to_ix):
         """
         Creates chars2vec model.
-
         :param emb_dim: int, dimension of embeddings.
         :param char_to_ix: dict, keys are characters, values are sequence numbers of characters.
""" @@ -24,14 +24,19 @@ def __init__(self, emb_dim, char_to_ix): self.vocab_size = len(self.char_to_ix) self.dim = emb_dim self.cache = {} + self.embedding_model = self._get_embedding_model() + self.model = self._get_model() + self.model.compile(optimizer="adam", loss="mae") - lstm_input = keras.layers.Input(shape=(None, self.vocab_size)) + def _get_embedding_model(self): + inputs = keras.layers.Input(shape=(None, self.vocab_size)) - x = keras.layers.LSTM(emb_dim, return_sequences=True)(lstm_input) - x = keras.layers.LSTM(emb_dim)(x) + outputs = keras.layers.LSTM(self.dim, return_sequences=True)(inputs) + outputs = keras.layers.LSTM(self.dim)(outputs) - self.embedding_model = keras.models.Model(inputs=[lstm_input], outputs=x) + return keras.models.Model(inputs=[inputs], outputs=outputs) + def _get_model(self): model_input_1 = keras.layers.Input(shape=(None, self.vocab_size)) model_input_2 = keras.layers.Input(shape=(None, self.vocab_size)) @@ -41,17 +46,24 @@ def __init__(self, emb_dim, char_to_ix): x = keras.layers.Dot(1)([x, x]) model_output = keras.layers.Dense(1, activation="sigmoid")(x) - self.model = keras.models.Model( + return keras.models.Model( inputs=[model_input_1, model_input_2], outputs=model_output ) - self.model.compile(optimizer="adam", loss="mae") + + def _create_word_embedding(self, word): + word_embedding = [] + for char in word.lower(): + char_embedding = np.zeros(self.vocab_size) + if char in self.char_to_ix: + char_embedding[self.char_to_ix[char]] = 1 + word_embedding.append(char_embedding) + return word_embedding def fit( self, word_pairs, targets, max_epochs, patience, validation_split, batch_size ): """ Fits model. - :param word_pairs: list or numpy.ndarray of word pairs. :param targets: list or numpy.ndarray of targets. :param max_epochs: parameter 'epochs' of keras model. @@ -60,47 +72,31 @@ def fit( :param batch_size: parameter 'batch_size' of keras model. """ - if not isinstance(word_pairs, list) and not isinstance(word_pairs, np.ndarray): + if not isinstance(word_pairs, (list, np.ndarray)): raise TypeError("parameters 'word_pairs' must be a list or numpy.ndarray") - if not isinstance(targets, list) and not isinstance(targets, np.ndarray): + if not isinstance(targets, (list, np.ndarray)): raise TypeError("parameters 'targets' must be a list or numpy.ndarray") x_1, x_2 = [], [] + for word_pair in word_pairs: + if len(word_pair) != 2: + raise ValueError( + "`word_pairs` contains a 'pair' with more than two words." 
+                )
 
+            if not all(isinstance(word, str) for word in word_pair):
+                raise TypeError("Both words must be strings.")
 
+            first_word, second_word = word_pair
 
+            first_word_embedding = self._create_word_embedding(word=first_word.lower())
+            x_1.append(np.array(first_word_embedding))
 
+            second_word_embedding = self._create_word_embedding(
+                word=second_word.lower()
+            )
+            x_2.append(np.array(second_word_embedding))
 
         x_1_pad_seq = keras.preprocessing.sequence.pad_sequences(x_1)
         x_2_pad_seq = keras.preprocessing.sequence.pad_sequences(x_2)
@@ -119,42 +115,26 @@ def fit(
     def vectorize_words(self, words, maxlen_padseq=None):
         """
         Returns embeddings for list of words. Uses cache of word embeddings to vectorization speed up.
-
         :param words: list or numpy.ndarray of strings.
         :param maxlen_padseq: parameter 'maxlen' for keras pad_sequences transform.
-
         :return word_vectors: numpy.ndarray, word embeddings.
         """
 
-        if not isinstance(words, list) and not isinstance(words, np.ndarray):
+        if not isinstance(words, (list, np.ndarray)):
             raise TypeError("parameter 'words' must be a list or numpy.ndarray")
 
         words = [w.lower() for w in words]
         unique_words = np.unique(words)
         new_words = [w for w in unique_words if w not in self.cache]
 
-        if len(new_words) > 0:
-
+        if new_words:
             list_of_embeddings = []
-
-            for current_word in new_words:
-
-                if not isinstance(current_word, str):
+            for word in new_words:
+                if not isinstance(word, str):
                     raise TypeError("word must be a string")
 
-                current_embedding = []
-
-                for t in range(len(current_word)):
-
-                    if current_word[t] in self.char_to_ix:
-                        x = np.zeros(self.vocab_size)
-                        x[self.char_to_ix[current_word[t]]] = 1
-                        current_embedding.append(x)
-
-                    else:
-                        current_embedding.append(np.zeros(self.vocab_size))
-
-                list_of_embeddings.append(np.array(current_embedding))
+                word_embedding = self._create_word_embedding(word=word.lower())
+                list_of_embeddings.append(np.array(word_embedding))
 
             embeddings_pad_seq = keras.preprocessing.sequence.pad_sequences(
                 list_of_embeddings, maxlen=maxlen_padseq
             )
             new_words_vectors = self.embedding_model.predict([embeddings_pad_seq])
 
             for i in range(len(new_words)):
@@ -172,7 +152,6 @@ def vectorize_words(self, words, maxlen_padseq=None):
 def save_model(c2v_model, path_to_model):
     """
     Saves trained model to directory.
-
     :param c2v_model: Chars2Vec object, trained model.
     :param path_to_model: str, path to save model.
     """
@@ -189,10 +168,8 @@ def save_model(c2v_model, path_to_model):
 def load_model(path):
     """
     Loads trained model.
-
     :param path: str, if it is 'eng_50', 'eng_100', 'eng_150', 'eng_200' or 'eng_300'
         then loads one of default models, else loads model from `path`.
-
     :return c2v_model: Chars2Vec object, trained model.
     """
@@ -227,7 +204,6 @@ def train_model(
     """
     Creates and trains chars2vec model using given training data.
-
     :param emb_dim: int, dimension of embeddings.
     :param X_train: list or numpy.ndarray of word pairs.
     :param y_train: list or numpy.ndarray of target values that describe the proximity of words.
     :param model_chars: list or numpy.ndarray of basic chars in model.
     :param max_epochs: parameter 'epochs' of keras model.
     :param patience: parameter 'patience' of callback in keras model.
     :param validation_split: parameter 'validation_split' of keras model.
     :param batch_size: parameter 'batch_size' of keras model.
-
     :return c2v_model: Chars2Vec object, trained model.
     """
 
-    if not isinstance(X_train, list) and not isinstance(X_train, np.ndarray):
+    if not isinstance(X_train, (list, np.ndarray)):
         raise TypeError("parameter 'X_train' must be a list or numpy.ndarray")
 
-    if not isinstance(y_train, list) and not isinstance(y_train, np.ndarray):
+    if not isinstance(y_train, (list, np.ndarray)):
         raise TypeError("parameter 'y_train' must be a list or numpy.ndarray")
-
-    if not isinstance(model_chars, list) and not isinstance(model_chars, np.ndarray):
+    if not isinstance(model_chars, (list, np.ndarray)):
         raise TypeError("parameter 'model_chars' must be a list or numpy.ndarray")
 
     char_to_ix = {ch: i for i, ch in enumerate(model_chars)}
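
-- 
A minimal usage sketch of the public API this series settles on (train_model,
save_model, load_model, vectorize_words, all re-exported by patch 4). The
words, labels, character set, and save path below are illustrative only, not
taken from the repo; the 0/1 labeling (0 for a similar pair, 1 for a
dissimilar pair) is an assumed convention based on the distance-style
similarity head built in model.py.

    import chars2vec

    # Train a tiny custom model; real training needs far more pairs.
    X_train = [("look", "lok"), ("look", "book"), ("sweet", "sweat")]
    y_train = [0, 1, 0]  # assumed convention: 0 = similar, 1 = dissimilar
    model_chars = ["a", "b", "e", "k", "l", "o", "s", "t", "w"]

    c2v_model = chars2vec.train_model(50, X_train, y_train, model_chars)
    chars2vec.save_model(c2v_model, "path/to/model")  # hypothetical path

    # Or load a bundled English model (eng_50 ... eng_300) and embed words;
    # similarly spelled words should land on nearby vectors.
    c2v_model = chars2vec.load_model("eng_50")
    embeddings = c2v_model.vectorize_words(["natural", "natuaral"])
    print(embeddings.shape)  # (2, 50)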