27 changes: 21 additions & 6 deletions README.md
@@ -1,5 +1,6 @@
#Sparse Coding
Manaal Faruqui, [email protected]
+.npy file support, [email protected]

This tool implements sparse coding for converting dense word vector representations to highly sparse vectors. The implementation can run on multiple cores in parallel with asynchronous updates. Sparsity is introduced in the word vectors using L1 regularization. For technical details, please refer to Faruqui et al. (2015).

@@ -16,30 +17,44 @@ You need to download the latest Eigen stable release from here: http://eigen.tux
Unzip the folder and provide its path in the makefile:
INCLUDES = -I PATH_TO_EIGEN

+Also get https://github.com/rogersce/cnpy and put it in the same directory (this might be a good use for git submodules); the makefile links against ```cnpy/libcnpy.a```.

After this, just execute the following commands:

-For sparse coding: ```make```
+For everything: ```make all```

+For sparse coding: ```make sparse```

For non-negative sparse coding: ```make nonneg```

###Running the executable

-For sparse coding: ```sparse.o```
+For sparse coding: ```sparse```

-For non-negative sparse coding: ```nonneg.o```
+For non-negative sparse coding: ```nonneg```

-Usage: ```./sparse.o vec_corpus factor l1_reg l2_reg num_cores outfilename```
+Usage: ```./sparse vec_corpus factor l1_reg l2_reg num_cores outfilename```

-Example: ```./sparse.o sample_vecs.txt 10 0.5 1e-5 1 out_vecs.txt```
+Example: ```./sparse sample_vecs.txt 10 0.5 1e-5 1 out_vecs.txt```

This example would expand the vectors in sample_vecs.txt to 10 times their original length.

+###npy files
+Put the .npy vector file in a directory:
+```sample/vector.npy```
+
+(I put the vocab in the same directory as a text file, one line per vector: ```sample/vocab.txt```)

+Usage: ```./sparse-cnpy vec_corpus_dir factor l1_reg l2_reg num_cores outfilename```
+
+Example: ```./sparse-cnpy sample/ 10 0.5 1e-5 1 out_vecs.txt```
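
To generate a compatible ```sample/vector.npy``` from C++, here is a minimal sketch against the same cnpy calls this patch uses (the standalone helper and the toy 3x4 shape are illustrative, not part of the PR):

```cpp
// Hypothetical helper (not part of this PR): writes a toy float32 matrix of
// shape (vocab_size, vec_dim) to sample/vector.npy, the layout the loader expects.
#include "cnpy/cnpy.h"

int main() {
  const unsigned int shape[] = {3, 4};  // 3 words, 4 dimensions
  float data[3 * 4];
  for (unsigned int i = 0; i < 3 * 4; ++i)
    data[i] = 0.1f * i;  // row-major: data[word * vec_dim + dim]
  cnpy::npy_save("sample/vector.npy", data, shape, 2, "w");
  return 0;
}
```

An array saved from Python with numpy.save works the same way, as long as its dtype is float32: the loader casts the raw buffer to ```float*``` without checking the header's dtype.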

###Reference

```
@InProceedings{faruqui:2015:sparse,
author = {Faruqui, Manaal and Tsvetkov, Yulia and Yogatama, Dani and Dyer, Chris and Smith, Noah A.},
title = {Sparse Overcomplete Word Vector Representations},
booktitle = {Proceedings of ACL},
year = {2015},
}
```
25 changes: 14 additions & 11 deletions makefile
@@ -1,15 +1,18 @@
CC = g++
-INCLUDES = -I /opt/tools/eigen-eigen-ffa86ffb5570
-CFLAGS = -std=c++11 -O3 -ffast-math
+INCLUDES = -I/usr/include/eigen3/
+CFLAGS = -std=c++11 -g -O3 -ffast-math
LIBS = -fopenmp
-SRCS = sparse.cc utils.cc
-SRCS_NO = sparse-nonneg.cc utils.cc
-OUTPUT = sparse.o
-NONNEG = nonneg.o
+CNPY = -Lcnpy cnpy/libcnpy.a
+TARGET = sparse nonneg sparse-cnpy nonneg-cnpy
+all: $(TARGET)

-make:
-	$(CC) $(INCLUDES) $(CFLAGS) $(SRCS) -o $(OUTPUT) $(LIBS)
-nonneg:
-	$(CC) $(INCLUDES) $(CFLAGS) $(SRCS_NO) -o $(NONNEG) $(LIBS)
+sparse: sparse.cc utils.cc
+	$(CC) $(INCLUDES) $(CFLAGS) $^ -o $@ $(LIBS)
+nonneg: sparse-nonneg.cc utils.cc
+	$(CC) $(INCLUDES) $(CFLAGS) $^ -o $@ $(LIBS)
+sparse-cnpy: sparse-cnpy.cc utils.cc
+	$(CC) $(INCLUDES) $(CFLAGS) $^ -o $@ $(LIBS) $(CNPY)
+nonneg-cnpy: nonneg-cnpy.cc utils.cc
+	$(CC) $(INCLUDES) $(CFLAGS) $^ -o $@ $(LIBS) $(CNPY)
clean:
-	$(RM) *.o *~
+	$(RM) $(TARGET) *~
271 changes: 271 additions & 0 deletions nonneg-cnpy.cc
@@ -0,0 +1,271 @@
#include <iostream>
#include <vector>
#include <fstream>
#include <functional>
#include <numeric>
#include <cmath>
#include <cstdlib>
#include <time.h>
#include <string>
#include <tr1/unordered_map>
#include <Eigen/Core>
#include <random>

#include "utils.h"
#include "cnpy/cnpy.h"

#define RHO 0.95
#define EPSILON 0.000001
#define RATE 0.05
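/* RATE is the AdaGrad step size used in Train(); EPSILON only seeds the
(unused) _epsilon member below, and RHO appears to be an AdaDelta
leftover that is never read. */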

using namespace std;
using namespace Eigen;

/* Sign of val as -1, 0, or +1. */
template<typename T> int sgn(T val) {
return (T(0) < val) - (val < T(0));
}

void ReadVecsFromNpyFile(const string& vec_file_name, vector<Col>& word_vecs) {
std::cout << vec_file_name + "/vector.npy" << "\n";
cnpy::NpyArray arr = cnpy::npy_load(vec_file_name + "/vector.npy");
/* assumes vector.npy stores float32 data with shape (vocab_size, vector_dim) */
float* loaded_data = reinterpret_cast<float*>(arr.data);

unsigned vocab_size = arr.shape[0];

int itr = 0;
for (unsigned i = 0; i < vocab_size; i++) {
Col word_vec = Col::Zero(arr.shape[1]);
for (unsigned j = 0; j < word_vec.size(); ++j)
word_vec(j, 0) = loaded_data[itr++];
word_vecs.push_back(word_vec);
}
arr.destruct(); /* free the heap buffer allocated by npy_load */
cerr << "Read: " << vec_file_name << endl;
cerr << "Vocab length: " << word_vecs.size() << endl;
cerr << "Vector length: " << word_vecs[0].size() << endl << endl;
}

/* General parameters of the model */
template<typename T>
class Param {

public:
T var;

void Init(const int& rows, const int& cols) {
/* Small random init, scaled down by the layer size. The original cols == 1
branch fell through and was immediately overwritten; made explicit here. */
if (cols == 1)
var = (0.6 / sqrt(rows)) * T::Random(rows, 1);
else
var = (0.6 / sqrt(rows + cols)) * T::Random(rows, cols);
_del_var = T::Zero(rows, cols);
_del_grad = T::Zero(rows, cols);
_grad_sum = T::Zero(rows, cols);
_epsilon = EPSILON * T::Ones(rows, cols);
}

void AdagradUpdate(const double& rate, const T& grad) {
_del_grad += grad.cwiseAbs2();
_grad_sum += grad;
var -= rate * grad.cwiseQuotient(_del_grad.cwiseSqrt());
}
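
/* The two updates below implement AdaGrad with L1 regularization in the
cumulative-gradient (regularized dual averaging) style: with _grad_sum the
running sum of gradients over _update_num steps, each coordinate is re-solved
in closed form as
var(i,j) = -sgn(g_sum) * rate * max(0, |g_sum| - t * l1_reg) / sqrt(sum g^2),
so coordinates whose average gradient stays below l1_reg are exactly zero. */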

void AdagradUpdateWithL1Reg(const double& rate, const T& grad,
const double& l1_reg) {
_update_num += 1;
_del_grad += grad.cwiseAbs2();
_grad_sum += grad;
for (int i = 0; i < var.rows(); ++i) {
for (int j = 0; j < var.cols(); ++j) {
double diff = abs(_grad_sum(i, j)) - _update_num * l1_reg;
if (diff <= 0)
var(i, j) = 0;
else
var(i, j) = -sgn(_grad_sum(i, j)) * rate * diff
/ sqrt(_del_grad(i, j));
}
}
}
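
/* Non-negative variant: same closed-form update, followed by a projection
onto the non-negative orthant (negative coordinates are clamped to 0). */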

void AdagradUpdateWithL1RegNonNeg(const double& rate, const T& grad,
const double& l1_reg) {
_update_num += 1;
_del_grad += grad.cwiseAbs2();
_grad_sum += grad;
for (int i = 0; i < var.rows(); ++i) {
for (int j = 0; j < var.cols(); ++j) {
double diff = abs(_grad_sum(i, j)) - _update_num * l1_reg;
if (diff <= 0)
var(i, j) = 0;
else {
double temp = -sgn(_grad_sum(i, j)) * rate * diff
/ sqrt(_del_grad(i, j));
if (temp >= 0)
var(i, j) = temp;
else
var(i, j) = 0;
}
}
}
}
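
/* Serializes var as a 2-D float32 .npy file (row-major), readable with
numpy.load or cnpy::npy_load. */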

void WriteToFile(const string filename) {
const unsigned int shape[] = { (unsigned int) var.rows(),
(unsigned int) var.cols() };
float* data = new float[shape[0] * shape[1]];
for (unsigned i = 0; i < shape[0]; ++i) {
for (unsigned j = 0; j < shape[1]; ++j)
data[i * shape[1] + j] = var(i, j);
}
std::cout << filename << "\n";
cnpy::npy_save(filename, data, shape, 2, "w");
delete[] data;
}

void ReadFromFile(ifstream& in) {
string line;
getline(in, line);
vector<string> data = split_line(line, ' ');
int rows = stoi(data[0]), cols = stoi(data[1]);
var = T::Zero(rows, cols);
for (int i = 2; i < data.size(); ++i)
var((i - 2) / cols, (i - 2) % cols) = stod(data[i]);
}

private:
T _del_var, _del_grad, _grad_sum; // updates/gradient memory
T _epsilon;
int _update_num = 0;
};

/* Main class definition that learns the word vectors */
class Model {

public:
/* The parameters of the model */
vector<Param<Col> > atom;
Param<Mat> dict;
int vec_len, factor;

Model(const int& times, const int& vector_len, const int& vocab_len) {
vec_len = vector_len;
factor = times;
dict.Init(vec_len, factor * vec_len);
/* Params initialization */
for (int i = 0; i < vocab_len; ++i) {
Param<Col> vec;
vec.Init(factor * vec_len, 1);
atom.push_back(vec);
}
}

template<typename T> void NonLinearity(T* vec) {
ElemwiseHardTanh(vec);
}
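/* Note: NonLinearity is defined but never called in this file; PredictVector
below keeps the reconstruction purely linear. */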

void PredictVector(const Col& word_vec, const int& word_index,
Col* pred_vec) {
*pred_vec = dict.var * atom[word_index].var;
}
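
/* For the loss ||x - D a||^2 + l2_reg ||D||_F^2 + l1_reg |a|_1, the smooth
gradients are d/dD = -2 (x - D a) a^T + 2 l2_reg D and
d/da = -2 D^T (x - D a); the L1 term is handled inside the atom's
AdaGrad update rather than here. */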

void UpdateParams(const int& word_index, const double& rate,
const Col& diff_vec, const double& l1_reg, const double& l2_reg) {
Mat dict_grad = -2 * diff_vec * atom[word_index].var.transpose()
+ 2 * l2_reg * dict.var;
dict.AdagradUpdate(rate, dict_grad);
Col atom_elem_grad = -2 * dict.var.transpose() * diff_vec;
atom[word_index].AdagradUpdateWithL1RegNonNeg(rate, atom_elem_grad,
l1_reg);
}
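
/* Writes all atoms as a single (vocab_size x factor*vec_len) float32 matrix
in .npy format. */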

void WriteVectorsToNpyFile(const string& filename) {
cerr << filename << "\n";
const unsigned int shape[] = {(unsigned int)atom.size(), (unsigned int)atom[0].var.rows()};
float* data = new float[shape[0]*shape[1]];
float* data_ptr = data;
for (unsigned i = 0; i < shape[0]; ++i) {
for (unsigned j = 0; j < shape[1]; ++j) {
*data_ptr++ = atom[i].var[j];
}
}
cerr << "write vectors \n";
cnpy::npy_save(filename, data, shape, 2, "w");
cerr << "vectors written\n";
delete[] data;
}

void WriteDictToFile(const string& filename) {
dict.WriteToFile(filename);
cerr << "\nWritten atom to: " << filename;
}
};

void Train(const string& out_file, const int& factor, const int& cores,
const double& l1_reg, const double& l2_reg,
const vector<Col>& word_vecs, const mapUnsignedStr& vocab) {
Model model(factor, word_vecs[0].size(), word_vecs.size());
double avg_error = 1, prev_avg_err = 0;
int iter = 0;
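/* Run at least 20 epochs; stop after 75, or earlier once the average error
is small or has stopped improving. */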
while (iter < 20
|| (avg_error > 0.05 && iter < 75
&& abs(avg_error - prev_avg_err) > 0.001)) {
iter += 1;
cerr << "\nIteration: " << iter << endl;
unsigned num_words = 0;
double total_error = 0, atom_l1_norm = 0;
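/* Hogwild-style parallelism: threads update the shared dictionary and their
own atoms without locks; only the error bookkeeping below is serialized. */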
#pragma omp parallel num_threads(cores) shared(total_error,atom_l1_norm)
#pragma omp for nowait
for (int word_id = 0; word_id < (int) word_vecs.size(); ++word_id) {
/* Predict the i-th word and compute error */
Col pred_vec;
model.PredictVector(word_vecs[word_id], word_id, &pred_vec);
Col diff_vec = word_vecs[word_id] - pred_vec;
double error = diff_vec.squaredNorm();
#pragma omp critical
{
total_error += error;
num_words += 1;
atom_l1_norm += model.atom[word_id].var.lpNorm<1>();
cerr << num_words << "\r";
}
model.UpdateParams(word_id, RATE, diff_vec, l1_reg, l2_reg);
}
prev_avg_err = avg_error;
avg_error = total_error / num_words;
cerr << "\nError per example: " << total_error / num_words;
cerr << "\nDict L2 norm: " << model.dict.var.lpNorm<2>();
cerr << "\nAvg Atom L1 norm: " << atom_l1_norm / num_words;
}
model.WriteVectorsToNpyFile(out_file);
model.WriteDictToFile(out_file + "_dict");
}

int main(int argc, char **argv) {
mapUnsignedStr vocab;
vector<Col> word_vecs;
if (argc == 7) {
string vec_corpus = argv[1];
int factor = stoi(argv[2]);
double l1_reg = stod(argv[3]), l2_reg = stod(argv[4]);
int num_cores = stoi(argv[5]);
string outfilename = argv[6];

ReadVecsFromNpyFile(vec_corpus, word_vecs);

cerr << "Model specification" << endl;
cerr << "----------------" << endl;
cerr << "Vector length: " << word_vecs[0].size() << endl;
cerr << "Dictionary length: " << factor * word_vecs[0].size() << endl;
cerr << "L2 Reg (Dict): " << l2_reg << endl;
cerr << "L1 Reg (Atom): " << l1_reg << endl;
cerr << "Number of Cores: " << num_cores << endl;
cerr << "----------------" << endl;

Train(outfilename, factor, num_cores, l1_reg, l2_reg, word_vecs, vocab);
} else {
cerr << "Usage: " << argv[0] << " vec_corpus_dir factor l1_reg l2_reg "
<< "num_cores outfilename\n";
return 1;
}
return 0;
}
Binary file added sample/vector.npy