diff --git a/include/compact_vector.hpp b/include/compact_vector.hpp index 59c1753..2da67a0 100644 --- a/include/compact_vector.hpp +++ b/include/compact_vector.hpp @@ -14,7 +14,7 @@ struct compact_vector // struct enumerator { using iterator_category = std::random_access_iterator_tag; - enumerator() {} + enumerator() : m_i(0), m_cur_val(0), m_cur_block(0), m_cur_shift(0), m_vec(nullptr) {} enumerator(Vec const* vec, uint64_t i) : m_i(i) diff --git a/include/endpoints_sequence.hpp b/include/endpoints_sequence.hpp new file mode 100644 index 0000000..08cc0be --- /dev/null +++ b/include/endpoints_sequence.hpp @@ -0,0 +1,287 @@ +#pragma once + +#include + +#include "bit_vector.hpp" +#include "darray.hpp" +#include "compact_vector.hpp" + +namespace bits { + +/* + This class encodes with Elias-Fano a sequence with the + following properties: + - all elements are distinct; + - the first element is 0; + - when doing `next_geq(x)`, x is always in [0,U) where + U is the last element of the sequence. + + This is the case when the sequence represents a list of + bin sizes in a cumulative way, assuming that each bin is + non empty and we ask what is the bin that comprises element x. + For example: imagine we have a set of strings that we concatenate + together and we keep the list of string boundaries. + If the have 4 strings of length 3, 10, 7, 5 respectively, + then we encode the sequence S = [0, 3, 13, 20, U=25] + + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 + x x x x x + + where U = 3+10+7+5, and length of i-th string is S[i+1]-S[i], + for i = 0..3. + Given the offset x of a character, we ask what is the string that + comprises that offset. In this case, we have that 0 <= x < U. + + The goal of this class is to support fast next_geq queries at the price + of some space increase compared to an `elias_fano` sequence, so: + - The low bits are always 8. + - We keep an array of H of size ceil(U/2^8), where H[i] indicates the + position of the i-th 0 in the high bit array. Each H[i] is written + in 1+log(n) bits because the high bit array has length <= 2n. +*/ + +template +struct endpoints_sequence { + endpoints_sequence() : m_back(0) {} + + template + void encode(Iterator begin, uint64_t n, uint64_t universe) { + if (n == 0) return; + + const uint64_t num_high_bits = n + (universe >> 8) + 1; + bit_vector::builder bvb_high_bits(num_high_bits); + m_low_bits.reserve(n); + + compact_vector::builder cv_builder((universe + 256 - 1) / 256, // ceil(U/2^8) + util::ceil_log2_uint64(num_high_bits)); + + assert(*begin == 0); + uint64_t prev_pos = 0; + for (uint64_t i = 0; i != n; ++i, ++begin) { + auto v = *begin; + m_low_bits.push_back(v & 255); + uint64_t high_part = v >> 8; + uint64_t pos = high_part + i; + bvb_high_bits.set(pos, 1); + for (uint64_t j = prev_pos + 1; j < pos; ++j) cv_builder.push_back(j); + prev_pos = pos; + m_back = v; + } + cv_builder.push_back(prev_pos + 1); + + bvb_high_bits.build(m_high_bits); + m_high_bits_d1.build(m_high_bits); + cv_builder.build(m_high_bits_d0); + + // for (uint64_t i = 0; i != m_high_bits.num_bits(); ++i) { + // std::cout << m_high_bits.get(i) << ' '; + // } + // std::cout << std::endl; + // auto it = m_high_bits_d0.begin(); + // for (uint64_t i = 0; i != m_high_bits_d0.size(); ++i) { + // std::cout << i << " : " << *it << '\n'; + // ++it; + // } + } + + struct iterator { + iterator() : m_ptr(nullptr), m_pos(0), m_val(0) {} + + iterator(endpoints_sequence const* ptr, uint64_t pos = 0) + : m_ptr(ptr) + , m_pos(pos) + , m_val(0) // + { + if (!has_next() or m_ptr->m_high_bits_d1.num_positions() == 0) return; + uint64_t begin = m_ptr->m_high_bits_d1.select(m_ptr->m_high_bits, m_pos); + m_high_bits_it = m_ptr->m_high_bits.get_iterator_at(begin); + m_low_bits_it = m_ptr->m_low_bits.begin() + m_pos; + read_next_value(); + } + + iterator(endpoints_sequence const* ptr, uint64_t pos, uint64_t pos_in_high_bits /* hint */) + : m_ptr(ptr) + , m_pos(pos) + , m_val(0) // + { + /* + These two assertions are valid because this constructor is only used + in `next_geq`. + */ + assert(has_next()); + assert(pos_in_high_bits == 0 || m_ptr->m_high_bits.get(pos_in_high_bits) == 0); + + m_high_bits_it = m_ptr->m_high_bits.get_iterator_at(pos_in_high_bits); + m_low_bits_it = m_ptr->m_low_bits.begin() + m_pos; + read_next_value(); + } + + bool has_next() const { return m_pos < m_ptr->size(); } + bool has_prev() const { return m_pos > 0; } + uint64_t value() const { return m_val; } + uint64_t position() const { return m_pos; } + + void next() { + ++m_pos; + if (!has_next()) return; + read_next_value(); + } + + /* + Return the value before the current position. + */ + uint64_t prev_value() { + assert(m_pos > 0); + uint64_t pos = m_pos - 1; + /* + `read_next_value()` sets the state ahead of 1 position, + hence must go back by 2 positions to get previous value. + */ + assert(m_high_bits_it.position() >= 2); + uint64_t high = m_high_bits_it.prev(m_high_bits_it.position() - 2); + assert(high == m_ptr->m_high_bits_d1.select(m_ptr->m_high_bits, pos)); + uint64_t low = *(m_low_bits_it - 2); + return ((high - pos) << 8) | low; + } + + private: + endpoints_sequence const* m_ptr; + uint64_t m_pos; + uint64_t m_val; + bit_vector::iterator m_high_bits_it; + std::vector::const_iterator m_low_bits_it; + + void read_next_value() { + assert(m_pos < m_ptr->size()); + uint64_t high = m_high_bits_it.next(); + assert(high == m_ptr->m_high_bits_d1.select(m_ptr->m_high_bits, m_pos)); + uint64_t low = *m_low_bits_it; + m_val = ((high - m_pos) << 8) | low; + ++m_low_bits_it; + } + }; + + iterator get_iterator_at(uint64_t pos) const { return iterator(this, pos); } + iterator begin() const { return get_iterator_at(0); } + + uint64_t access(uint64_t i) const { + assert(i < size()); + return ((m_high_bits_d1.select(m_high_bits, i) - i) << 8) | m_low_bits[i]; + } + + struct return_value { + uint64_t pos; + uint64_t val; + }; + + /* + Return [position,value] of the smallest element that is >= x. + */ + return_value next_geq(const uint64_t x) const { return next_geq_helper(x).first; } + + /* + Determine integers lo and hi, with lo < hi, such that lo <= x < hi, where: + - lo is the largest value that is <= x, + - hi is the smallest value that is > x. + Return the tuple [lo_pos, lo, hi_pos, hi]. + */ + std::pair locate(const uint64_t x) const { + auto [lo, it] = next_geq_helper(x); + return_value hi{lo.pos, lo.val}; + + if (lo.val > x) { + assert(lo.pos > 0); + lo.pos -= 1; + lo.val = it.prev_value(); + } else { + hi.pos += 1; + hi.val = (it.next(), it.value()); + assert(it.position() == hi.pos); + assert(hi.pos < size()); + } + + return {lo, hi}; + } + + uint64_t back() const { return m_back; } + uint64_t size() const { return m_low_bits.size(); } + + uint64_t num_bytes() const { + return sizeof(m_back) + m_high_bits.num_bytes() + m_high_bits_d1.num_bytes() + + m_high_bits_d0.num_bytes() + essentials::vec_bytes(m_low_bits); + } + + void swap(endpoints_sequence& other) { + std::swap(m_back, other.m_back); + m_high_bits.swap(other.m_high_bits); + m_high_bits_d1.swap(other.m_high_bits_d1); + m_high_bits_d0.swap(other.m_high_bits_d0); + m_low_bits.swap(other.m_low_bits); + } + + template + void visit(Visitor& visitor) const { + visit_impl(visitor, *this); + } + + template + void visit(Visitor& visitor) { + visit_impl(visitor, *this); + } + +private: + uint64_t m_back; + bit_vector m_high_bits; + DArray1 m_high_bits_d1; + compact_vector m_high_bits_d0; + std::vector m_low_bits; + + template + static void visit_impl(Visitor& visitor, T&& t) { + visitor.visit(t.m_back); + visitor.visit(t.m_high_bits); + visitor.visit(t.m_high_bits_d1); + visitor.visit(t.m_high_bits_d0); + visitor.visit(t.m_low_bits); + } + + std::pair next_geq_helper(const uint64_t x) const // + { + assert(x >= 0); + assert(x < back()); + + uint64_t h_x = x >> 8; + uint64_t p = 0; + uint64_t begin = 0; + if (h_x > 0) { + p = m_high_bits_d0.access(h_x - 1); + begin = p - h_x + 1; + } + assert(begin < size()); + + /* + `begin` is the position of the elements that have high part >= h_x + and `p` is the position in `m_high_bits` of the (h_x-1)-th 0, + so it is passed to the iterator as a hint to recover the high part + of the element at position `begin` without doing a select_1. + */ + + auto it = iterator(this, begin, p /* position in high_bits hint */); + uint64_t pos = begin; + uint64_t val = it.value(); + while (val < x) { + ++pos; + it.next(); + val = it.value(); + } + /* now pos is the position of the element that is >= x */ + assert(val >= x); + assert(pos < size()); + assert(val == access(pos)); + assert(it.position() == pos); + + return {{pos, val}, it}; + } +}; + +} // namespace bits \ No newline at end of file diff --git a/include/util.hpp b/include/util.hpp index fd16ffd..32ff70a 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -32,6 +32,7 @@ static inline bool msbll(uint64_t x, uint64_t& ret) { } static inline uint64_t ceil_log2_uint32(uint32_t x) { return (x > 1) ? msb(x - 1) + 1 : 0; } +static inline uint64_t ceil_log2_uint64(uint64_t x) { return (x > 1) ? msbll(x - 1) + 1 : 0; } /* return the position of the least significant bit (lsb) */ static inline uint64_t lsb(uint32_t x) { diff --git a/test/common.hpp b/test/common.hpp index dbbaef7..8e2a632 100644 --- a/test/common.hpp +++ b/test/common.hpp @@ -18,6 +18,7 @@ std::vector get_sequence(const uint64_t sequence_length, const uint64_t max_int = 1000, // const uint64_t seed = essentials::get_random_seed()) // { + std::cout << "seed = " << seed << std::endl; srand(seed); double mean = rand() % (max_int + 1); std::mt19937 rng(seed); diff --git a/test/test_endpoints_sequence.cpp b/test/test_endpoints_sequence.cpp new file mode 100644 index 0000000..0460ba0 --- /dev/null +++ b/test/test_endpoints_sequence.cpp @@ -0,0 +1,234 @@ +#include "common.hpp" +#include "include/endpoints_sequence.hpp" + +constexpr uint64_t sequence_length = 10000; + +std::vector get_sequence(const uint64_t sequence_length) { + std::vector s = test::get_sequence(sequence_length); + std::vector seq; + seq.reserve(s.size() + 1); + uint64_t sum = 0; + for (uint64_t i = 0; i != s.size(); ++i) { + seq.push_back(sum); + sum += s[i] + (s[i] == 0); // +1 so that all values are distinct + } + seq.push_back(sum); + assert(std::adjacent_find(seq.begin(), seq.end()) == seq.end()); // must be all distinct + return seq; +} + +endpoints_sequence<> encode(std::vector const& seq) { + endpoints_sequence<> es; + assert(seq.front() == 0); + es.encode(seq.begin(), seq.size(), seq.back()); + REQUIRE(es.size() == seq.size()); + std::cout << "es.size() = " << es.size() << '\n'; + std::cout << "es.back() = " << es.back() << '\n'; + std::cout << "measured bits/int = " << (8.0 * es.num_bytes()) / es.size() << std::endl; + return es; +} + +TEST_CASE("access") { + std::vector seq = get_sequence(sequence_length); + auto es = encode(seq); + std::cout << "checking correctness of random access..." << std::endl; + for (uint64_t i = 0; i != es.size(); ++i) { + uint64_t got = es.access(i); // get the integer at position i + uint64_t expected = seq[i]; + REQUIRE_MESSAGE(got == expected, "got " << got << " at position " << i << "/" << es.size() + << " but expected " << expected); + } + std::cout << "EVERYTHING OK!" << std::endl; +} + +TEST_CASE("iterator::next") { + std::vector seq = get_sequence(sequence_length); + auto es = encode(seq); + + std::cout << "checking correctness of iterator..." << std::endl; + + auto it = es.begin(); + for (uint64_t i = 0; i != es.size(); ++i, it.next()) { + uint64_t got = it.value(); + uint64_t expected = seq[i]; + REQUIRE_MESSAGE(got == expected, "got " << got << " at position " << i << "/" << es.size() + << " but expected " << expected); + REQUIRE(it.position() == i); + REQUIRE(it.has_next() == true); + } + REQUIRE(it.position() == es.size()); + REQUIRE(it.has_next() == false); + + uint64_t i = 0; + it = es.begin(); + while (it.has_next()) { + uint64_t got = it.value(); + uint64_t expected = seq[i]; + REQUIRE_MESSAGE(got == expected, "got " << got << " at position " << i << "/" << es.size() + << " but expected " << expected); + REQUIRE(it.position() == i); + REQUIRE(it.has_next() == true); + it.next(); + ++i; + } + REQUIRE(i == es.size()); + REQUIRE(it.position() == es.size()); + REQUIRE(it.has_next() == false); + + std::cout << "EVERYTHING OK!" << std::endl; +} + +TEST_CASE("next_geq") { + std::vector seq = get_sequence(sequence_length); + auto es = encode(seq); + // test::print(seq); + + std::cout << "checking correctness of next_geq..." << std::endl; + + for (uint64_t i = 0; i != es.size() - 1; ++i) { + auto x = seq[i]; + /* since x is in the sequence, next_geq must return (i,x) */ + + auto p = es.next_geq(x); + uint64_t pos = p.pos; + uint64_t got = p.val; + uint64_t expected = seq[i]; + + // std::cout << "x=" << x << "; pos=" << pos << "; got=" << got << std::endl; + + bool good = (got == expected) && (pos == i); + REQUIRE_MESSAGE(good, "got " << got << " at position " << pos << "/" << seq.size() + << " but expected " << expected << " at position " << i); + } + std::cout << "EVERYTHING OK!" << std::endl; + + std::cout << "checking correctness of next_geq..." << std::endl; + for (uint64_t i = 0; i != 10000; ++i) { + uint64_t x = seq[rand() % seq.size()] + (i % 2 == 0 ? 3 : -3); // get some value + if (x >= seq.back()) x = seq.back() - 1; // lower bound must be x < back() for assumption + + auto p = es.next_geq(x); + uint64_t pos = p.pos; + uint64_t got = p.val; + + // std::cout << "x=" << x << "; pos=" << pos << "; got=" << got << std::endl; + + auto it = std::lower_bound(seq.begin(), seq.end(), x); + assert(it != seq.end()); + uint64_t expected = *it; + uint64_t pos_expected = std::distance(seq.begin(), it); + bool good = (got == expected) && (pos == pos_expected); + REQUIRE_MESSAGE(good, "got " << got << " at position " << pos << "/" << seq.size() + << " but expected " << expected << " at position " + << pos_expected); + } + std::cout << "EVERYTHING OK!" << std::endl; +} + +TEST_CASE("small_next_geq") { + std::cout << "checking correctness of next_geq over a small sequence..." << std::endl; + + std::vector seq{0, 3, 5, 6, 9, 13, 23, 31}; + auto es = encode(seq); + + auto p = es.next_geq(0); + REQUIRE(p.pos == 0); + REQUIRE(p.val == 0); + + p = es.next_geq(2); + REQUIRE(p.pos == 1); + REQUIRE(p.val == 3); + + p = es.next_geq(4); + REQUIRE(p.pos == 2); + REQUIRE(p.val == 5); + + p = es.next_geq(6); + REQUIRE(p.pos == 3); + REQUIRE(p.val == 6); + + p = es.next_geq(23); + REQUIRE(p.pos == 6); + REQUIRE(p.val == 23); + + p = es.next_geq(28); + REQUIRE(p.pos == 7); + REQUIRE(p.val == 31); + + p = es.next_geq(30); + REQUIRE(p.pos == 7); + REQUIRE(p.val == 31); +} + +TEST_CASE("locate") { + std::vector seq = get_sequence(sequence_length); + auto es = encode(seq); + // test::print(seq); + + std::cout << "checking correctness of locate..." << std::endl; + + for (uint64_t i = 0; i != seq.size() - 1; ++i) // + { + auto x = seq[i]; + + /* since x is in the sequence, p.first.pos must be i */ + auto p = es.locate(x); + uint64_t lo_pos = p.first.pos; + uint64_t lo_val = p.first.val; + uint64_t hi_pos = p.second.pos; + uint64_t hi_val = p.second.val; + + uint64_t expected_lo_pos = i; + uint64_t expected_lo_val = seq[i]; + uint64_t expected_hi_pos = i + 1; + uint64_t expected_hi_val = seq[i + 1]; + + // std::cout << "x = " << x << " lo_pos = " << lo_pos << " lo_val = " << lo_val + // << " hi_pos = " << hi_pos << " hi_val = " << hi_val << std::endl; + + bool good = (lo_val == expected_lo_val) && (lo_pos == expected_lo_pos); + REQUIRE_MESSAGE(good, "got " << lo_val << " at position " << lo_pos << "/" << seq.size() + << " but expected " << expected_lo_val << " at position " + << expected_lo_pos); + good = (hi_val == expected_hi_val) && (hi_pos == expected_hi_pos); + REQUIRE_MESSAGE(good, "got " << hi_val << " at position " << hi_pos << "/" << seq.size() + << " but expected " << expected_hi_val << " at position " + << expected_hi_pos); + } + std::cout << "EVERYTHING OK!" << std::endl; + + std::cout << "checking correctness of locate..." << std::endl; + + for (uint64_t i = 0; i != 10000; ++i) // + { + uint64_t x = seq[rand() % es.size()] + (i % 2 == 0 ? 3 : -3); // get some value value + if (x >= seq.back()) x = seq.back() - 1; // lower bound must be x < back() for assumption + + auto p = es.locate(x); + uint64_t lo_pos = p.first.pos; + uint64_t lo_val = p.first.val; + uint64_t hi_pos = p.second.pos; + uint64_t hi_val = p.second.val; + + // std::cout << "x = " << x << " lo_pos = " << lo_pos << " lo_val = " << lo_val + // << " hi_pos = " << hi_pos << " hi_val = " << hi_val << std::endl; + + auto it = std::upper_bound(seq.begin(), seq.end(), x) - 1; + uint64_t expected_lo_pos = std::distance(seq.begin(), it); + assert(expected_lo_pos != es.size() - 1); + uint64_t expected_lo_val = *it; + uint64_t expected_hi_pos = expected_lo_pos + 1; + uint64_t expected_hi_val = *(it + 1); + + bool good = (lo_val == expected_lo_val) && (lo_pos == expected_lo_pos); + REQUIRE_MESSAGE(good, "got " << lo_val << " at position " << lo_pos << "/" << seq.size() + << " but expected " << expected_lo_val << " at position " + << expected_lo_pos); + good = (hi_val == expected_hi_val) && (hi_pos == expected_hi_pos); + REQUIRE_MESSAGE(good, "got " << hi_val << " at position " << hi_pos << "/" << seq.size() + << " but expected " << expected_hi_val << " at position " + << expected_hi_pos); + } + + std::cout << "EVERYTHING OK!" << std::endl; +}