Skip to content

[fix] 修复类型不存在错误, 兼容win #31

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions cutText.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Word segmenter class from the "segment" package.
let Segment = require("segment");
// Single segmenter instance shared by every cutText() call.
const segment = new Segment();
// Part-of-speech tag constants that cutText() compares against each word's `p` field.
const POSTAG = require("segment/lib/POSTAG")
// Result of useDefault(); cutText() awaits it before segmenting.
// NOTE(review): presumably loads the default dictionaries/modules — confirm against the segment docs.
const load = segment.useDefault()
/**
 * Segment a string into words and drop the ones whose part-of-speech tag is
 * POSTAG.D_U, POSTAG.D_P, POSTAG.A_M or POSTAG.D_D.
 *
 * Segmentation runs with the `stripPunctuation` option enabled.
 * NOTE(review): the four excluded tags are presumably auxiliary words,
 * prepositions, numerals and adverbs — confirm against segment's POSTAG table.
 *
 * @param {string} str - Text to segment.
 * @returns {Promise<string[]>} The `w` field of every word that was kept, in
 *   segmentation order.
 */
async function cutText(str) {
  // `load` is whatever segment.useDefault() returned; wait for it before segmenting.
  await load;

  // Tags are matched by strict identity, so a word is removed only when its
  // `p` value is exactly one of these constants.
  const excludedTags = new Set([POSTAG.D_U, POSTAG.D_P, POSTAG.A_M, POSTAG.D_D]);

  const words = segment.doSegment(str, { stripPunctuation: true });

  return words
    .filter(({ p }) => !excludedTags.has(p))
    .map(({ w }) => w);
}

module.exports = { cutText }

// console.log(cutText("明天要去上学"))
97 changes: 77 additions & 20 deletions src/word2vec.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
#include <string.h>
#include <math.h>
#include <pthread.h>

#ifdef _WIN32
#include <malloc.h>
#endif
#define MAX_STRING 100
#define EXP_TABLE_SIZE 1000
#define MAX_EXP 6
Expand All @@ -43,7 +45,7 @@ long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
real alpha = 0.025, starting_alpha, sample = 1e-3;
real *syn0, *syn1, *syn1neg, *expTable;
clock_t start;
int start;

int hs = 0, negative = 5;
const int table_size = 1e8;
Expand Down Expand Up @@ -335,27 +337,82 @@ void ReadVocab() {
fclose(fin);
}

void InitNet() {
void InitNet()
{
long long a, b;
unsigned long long next_random = 1;
a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
if (hs) {
a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
syn1[a * layer1_size + b] = 0;

#ifdef _WIN32
syn0 = (real *)_aligned_malloc((long long)vocab_size * layer1_size * sizeof(real), 128);
#else
if (posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real)) != 0)
{
syn0 = NULL;
}
#endif
if (syn0 == NULL)
{
printf("Memory allocation failed\n");
exit(1);
}

if (hs)
{
#ifdef _WIN32
syn1 = (real *)_aligned_malloc((long long)vocab_size * layer1_size * sizeof(real), 128);
#else
if (posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real)) != 0)
{
syn1 = NULL;
}
#endif
if (syn1 == NULL)
{
printf("Memory allocation failed\n");
exit(1);
}
for (a = 0; a < vocab_size; a++)
{
for (b = 0; b < layer1_size; b++)
{
syn1[a * layer1_size + b] = 0;
}
}
}
if (negative>0) {
a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
syn1neg[a * layer1_size + b] = 0;

if (negative > 0)
{
#ifdef _WIN32
syn1neg = (real *)_aligned_malloc((long long)vocab_size * layer1_size * sizeof(real), 128);
#else
if (posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real)) != 0)
{
syn1neg = NULL;
}
#endif
if (syn1neg == NULL)
{
printf("Memory allocation failed\n");
exit(1);
}
for (a = 0; a < vocab_size; a++)
{
for (b = 0; b < layer1_size; b++)
{
syn1neg[a * layer1_size + b] = 0;
}
}
}
for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {

for (a = 0; a < vocab_size; a++)
{
for (b = 0; b < layer1_size; b++)
{
next_random = next_random * (unsigned long long)25214903917 + 11;
syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
}
}

CreateBinaryTree();
}

Expand All @@ -365,7 +422,7 @@ void *TrainModelThread(void *id) {
long long l1, l2, c, target, label, local_iter = iter;
unsigned long long next_random = (long long)id;
real f, g;
clock_t now;
int now;
real *neu1 = (real *)calloc(layer1_size, sizeof(real));
real *neu1e = (real *)calloc(layer1_size, sizeof(real));
FILE *fi = fopen(train_file, "rb");
Expand All @@ -376,9 +433,9 @@ void *TrainModelThread(void *id) {
last_word_count = word_count;
if ((debug_mode > 1)) {
now=clock();
printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha,
word_count_actual / (real)(iter * train_words + 1) * 100,
word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
// printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha,
// word_count_actual / (real)(iter * train_words + 1) * 100,
// word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
fflush(stdout);
}
alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
Expand Down
61 changes: 61 additions & 0 deletions test/test1.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
const word2vec = require('../');

var w2v = require('./../lib');

const fs = require('fs');
const path = require('path');
const readline = require('readline');
const { cutText } = require('../cutText');

// Run word2phrase on the raw corpus; the second argument is presumably the
// output path (fixtures/phrases1.txt), which is read back below.
// NOTE(review): no completion callback is passed, yet the output file is read
// immediately afterwards — if word2phrase runs asynchronously this is a race.
// Confirm against the library and move the read into its callback if so.
w2v.word2phrase('E:/web/wordApp/tmp/遮天.txt', __dirname + '/fixtures/phrases1.txt', {
  threshold: 5,
  debug: 2,
  minCount: 2
});

const out = fs.createWriteStream(path.join(__dirname, 'fixtures', 'cut.txt'), { flags: 'w+' });

// Segment every phrase line and write its tokens space-separated, one output
// line per input line.
readLine(path.join(__dirname, 'fixtures', 'phrases1.txt'), async (line) => {
  const tokens = await cutText(line);
  out.write(`${tokens.join(' ')}\n`);
})
  .catch((err) => {
    // Report the failure instead of leaving an unhandled rejection.
    console.error(err);
    process.exitCode = 1;
  })
  .finally(() => {
    // Flush and close the output file whether or not processing succeeded.
    out.end();
  });


// // // 训练Word2Vec模型
// word2vec.word2vec( "E:/web/wordApp/tmp/out.txt", './output_model.txt', {
// size: 200,
// window: 5,
// minCount: 4,
// threshold: 90,
// cbow: 1
// }, (error) => {
// if (error) {
// console.error(error);
// return;
// }

// console.log('模型训练完成');
// });



/**
 * Read a text file line by line, awaiting `insert` for each line before
 * moving on to the next one. Logs the running line count every 1000 lines.
 *
 * The line reader and the underlying file stream are always released, even
 * when `insert` throws or rejects; the error is then propagated to the caller.
 *
 * @param {string} filePath - Path of the file to read.
 * @param {(line: string) => (void|Promise<void>)} insert - Called once per
 *   line with the line text (without its line terminator).
 * @returns {Promise<Array>} Always resolves to an empty array; lines are not
 *   collected. The array return is kept for backward compatibility.
 */
async function readLine(filePath, insert) {
  const fileStream = fs.createReadStream(filePath);
  const lineReader = readline.createInterface({
    input: fileStream,
    // Treat "\r\n" as a single line break regardless of timing.
    crlfDelay: Infinity
  });

  let lineCount = 0;
  try {
    for await (const fileLine of lineReader) {
      await insert(fileLine);
      lineCount += 1;
      if (lineCount % 1000 === 0) {
        console.log(lineCount);
      }
    }
  } finally {
    // Closing the interface does not destroy its input, so release the file
    // descriptor explicitly as well.
    lineReader.close();
    fileStream.destroy();
  }

  return [];
}