Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions libfsst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ Symbol concat(Symbol a, Symbol b) {
u32 length = a.length()+b.length();
if (length > Symbol::maxLength) length = Symbol::maxLength;
s.set_code_len(FSST_CODE_MASK, length);
s.val.num = (b.val.num << (8*a.length())) | a.val.num;
s.store_num((b.load_num() << (8*a.length())) | a.load_num());
return s;
}
} // namespace libfsst
Expand All @@ -33,7 +33,7 @@ template <>
class hash<libfsst::QSymbol> {
public:
size_t operator()(const libfsst::QSymbol& q) const {
uint64_t k = q.symbol.val.num;
uint64_t k = q.symbol.load_num();
const uint64_t m = 0xc6a4a7935bd1e995;
const int r = 47;
uint64_t h = 0x8445d61a4e774912 ^ (8*m);
Expand Down Expand Up @@ -125,7 +125,7 @@ SymbolTable *buildSymbolTable(Counters& counters, vector<const u8*> line, const
Symbol s = st->hashTab[idx];
code2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK;
word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
if ((s.icl < FSST_ICL_FREE) & (s.val.num == word)) {
if ((s.icl < FSST_ICL_FREE) & (s.load_num() == word)) {
code2 = s.code();
cur += s.length();
} else if (code2 >= FSST_CODE_BASE) {
Expand Down Expand Up @@ -205,7 +205,7 @@ SymbolTable *buildSymbolTable(Counters& counters, vector<const u8*> line, const
}

// insert candidates into priority queue (by gain)
auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.val.num > q2.symbol.val.num); };
auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.load_num() > q2.symbol.load_num()); };
priority_queue<QSymbol,vector<QSymbol>,decltype(cmpGn)> pq(cmpGn);
for (auto& q : cands)
pq.push(q);
Expand Down Expand Up @@ -337,7 +337,7 @@ static inline size_t compressSIMD(SymbolTable &symbolTable, u8* symbolBase, size
Symbol s = symbolTable.hashTab[idx];
out[1] = (u8) word; // speculatively write out escaped byte
word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
if ((s.icl < FSST_ICL_FREE) && s.val.num == word) {
if ((s.icl < FSST_ICL_FREE) && s.load_num() == word) {
*out++ = (u8) s.code(); cur += s.length();
} else {
// could be a 2-byte or 1-byte code, or miss
Expand Down Expand Up @@ -398,7 +398,7 @@ static inline size_t compressBulk(SymbolTable &symbolTable, size_t nlines, const
Symbol s = symbolTable.hashTab[idx];
out[1] = (u8) word; // speculatively write out escaped byte
word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
if ((s.icl < FSST_ICL_FREE) && s.val.num == word) {
if ((s.icl < FSST_ICL_FREE) && s.load_num() == word) {
*out++ = (u8) s.code(); cur += s.length();
} else if (avoidBranch) {
// could be a 2-byte or 1-byte code, or miss
Expand Down Expand Up @@ -535,6 +535,8 @@ extern "C" u32 fsst_export(fsst_encoder_t *encoder, u8 *buf) {
(((u64) e->symbolTable->nSymbols) << 8) |
FSST_ENDIAN_MARKER; // least significant byte is nonzero

version = swap64_if_be(version); // ensure version is little-endian encoded

/* do not assume unaligned reads here */
memcpy(buf, &version, 8);
buf[8] = e->symbolTable->zeroTerminated;
Expand All @@ -559,6 +561,8 @@ extern "C" u32 fsst_import(fsst_decoder_t *decoder, u8 const *buf) {

// version field (first 8 bytes) is now there just for future-proofness, unused still (skipped)
memcpy(&version, buf, 8);
version = swap64_if_be(version); // version is always little-endian encoded

if ((version>>32) != FSST_VERSION) return 0;
decoder->zeroTerminated = buf[8]&1;
memcpy(lenHisto, buf+9, 8);
Expand Down
25 changes: 18 additions & 7 deletions libfsst.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,18 @@ typedef uint64_t u64;
#define FSST_CODE_MASK (FSST_CODE_MAX-1UL) /* all bits set: indicating a symbol that has not been assigned a code yet */

namespace libfsst {
/// Converts a 64-bit value between host byte order and little-endian byte order.
/// Reversing the bytes is its own inverse, so the same call serves both directions.
/// Big-endian hosts are detected through the __BYTE_ORDER__ / __ORDER_BIG_ENDIAN__
/// predefined macros; when they are absent the value is returned unchanged.
constexpr inline uint64_t swap64_if_be(uint64_t v) noexcept {
#if !defined(__BYTE_ORDER__) || __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
   return v; // little-endian (or undetected) host: nothing to reorder
#else
   return __builtin_bswap64(v); // big-endian host: reverse the eight bytes
#endif
}

inline uint64_t fsst_unaligned_load(u8 const* V) {
uint64_t Ret;
memcpy(&Ret, V, sizeof(uint64_t)); // compiler will generate efficient code (unaligned load, where possible)
return Ret;
return swap64_if_be(Ret);
}

struct Symbol {
Expand All @@ -77,7 +85,7 @@ struct Symbol {

// Default symbol: icl and val.num both start out as zero.
Symbol() : icl(0) {
   val.num = 0;
}

explicit Symbol(u8 c, u16 code) : icl((1<<28)|(code<<16)|56) { val.num = c; } // single-char symbol
explicit Symbol(u8 c, u16 code) : icl((1<<28)|(code<<16)|56) { store_num(c); } // single-char symbol
explicit Symbol(const char* begin, const char* end) : Symbol(begin, (u32) (end-begin)) {}
explicit Symbol(const u8* begin, const u8* end) : Symbol((const char*)begin, (u32) (end-begin)) {}
explicit Symbol(const char* input, u32 len) {
Expand All @@ -92,18 +100,21 @@ struct Symbol {
}
// Packs the symbol metadata into icl: len goes to bit 28 and up, code to bit 16 and up,
// and the lowest bits hold (8-len)*8, i.e. the number of bits not covered by a len-byte symbol.
void set_code_len(u32 code, u32 len) {
   const u32 lengthField = len << 28;
   const u32 codeField = code << 16;
   const u32 ignoredBitCount = (8 - len) * 8;
   icl = lengthField | codeField | ignoredBitCount;
}

// Reads val.num and passes it through swap64_if_be, so the bytes are reversed on
// big-endian hosts and returned as stored otherwise.
u64 load_num() const {
   const u64 stored = val.num;
   return swap64_if_be(stored);
}
// Counterpart of load_num(): passes v through swap64_if_be before writing it to val.num.
void store_num(u64 v) {
   const u64 encoded = swap64_if_be(v);
   val.num = encoded;
}

// Extracts the length field, i.e. the part of icl from bit 28 upwards (see set_code_len).
u32 length() const {
   return static_cast<u32>(icl >> 28);
}
// Extracts the code field: icl shifted down by 16 bits and masked with FSST_CODE_MASK.
u16 code() const {
   return FSST_CODE_MASK & (icl >> 16);
}
// Returns icl converted to u32; the ignored-bit count written by set_code_len
// sits in the lowest bits of that value.
u32 ignoredBits() const {
   return static_cast<u32>(icl);
}

u8 first() const { assert( length() >= 1); return 0xFF & val.num; }
u16 first2() const { assert( length() >= 2); return 0xFFFF & val.num; }
u8 first() const { assert( length() >= 1); return 0xFF & load_num(); }
u16 first2() const { assert( length() >= 2); return 0xFFFF & load_num(); }

#define FSST_HASH_LOG2SIZE 10
#define FSST_HASH_PRIME 2971215073LL
#define FSST_SHIFT 15
#define FSST_HASH(w) (((w)*FSST_HASH_PRIME)^(((w)*FSST_HASH_PRIME)>>FSST_SHIFT))
size_t hash() const { size_t v = 0xFFFFFF & val.num; return FSST_HASH(v); } // hash on the next 3 bytes
size_t hash() const { size_t v = 0xFFFFFF & load_num(); return FSST_HASH(v); } // hash on the next 3 bytes
};

// Symbol that can be put in a queue, ordered on gain
Expand Down Expand Up @@ -218,7 +229,7 @@ struct SymbolTable {
bool taken = (hashTab[idx].icl < FSST_ICL_FREE);
if (taken) return false; // collision in hash table
hashTab[idx].icl = s.icl;
hashTab[idx].val.num = s.val.num & (0xFFFFFFFFFFFFFFFF >> (u8) s.icl);
hashTab[idx].store_num(s.load_num() & (0xFFFFFFFFFFFFFFFF >> (u8) s.icl));
return true;
}
bool add(Symbol s) {
Expand All @@ -239,7 +250,7 @@ struct SymbolTable {
/// Find longest expansion, return code (= position in symbol table)
u16 findLongestSymbol(Symbol s) const {
size_t idx = s.hash() & (hashTabSize-1);
if (hashTab[idx].icl <= s.icl && hashTab[idx].val.num == (s.val.num & (0xFFFFFFFFFFFFFFFF >> ((u8) hashTab[idx].icl)))) {
if (hashTab[idx].icl <= s.icl && hashTab[idx].load_num() == (s.load_num() & (0xFFFFFFFFFFFFFFFF >> ((u8) hashTab[idx].icl)))) {
return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol
}
if (s.length() >= 2) {
Expand Down