lmori's Library

This documentation is automatically generated by competitive-verifier/competitive-verifier

View the Project on GitHub lmorinn/library

:heavy_check_mark: FM-index
(string/FMindex.hpp)

概要

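An FM-index over a byte string. The constructor builds the Burrows–Wheeler transform of the text from its suffix array and stores it in a wavelet matrix; `substrCount(t)` returns the number of occurrences of `t` in the text using backward search.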

計算量

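Construction: one suffix-array build (SA-IS) plus 8 linear passes over the text for the wavelet matrix. `substrCount(t)`: 2·|t| rank queries, each of which visits the 8 bit levels of the wavelet matrix.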

Depends on

Verified with

Code

#pragma once
#include "../data-structure/wavelet-matrix/WaveletMatrixString.hpp"
#include "../string/SuffixArray.hpp"

class FMindex {
   private:
    int n;
    WaveletMatrix w;
    vector<int> less;

   public:
    FMindex(string &s) {
        n = s.size();
        less.resize(256);
        vector<int> cnt(256);
        string bwt(n + 1, '$');
        vector<int> sa = suffixArray(s);
        reverse(sa.begin(), sa.end());
        sa.emplace_back(s.size());
        reverse(sa.begin(), sa.end());

        for (int i = 0; i < s.size() + 1; i++) {
            if (sa[i] > 0) {
                bwt[i] = s[sa[i] - 1];
            }
            cnt[s[i]]++;
        }

        for (int i = 1; i < 256; i++) {
            less[i] = less[i - 1] + cnt[i - 1];
        }
        w = WaveletMatrix(bwt);
    }

    int substrCount(string &t) {
        int m = t.size();
        int l = 0;
        int r = n + 1;
        for (int i = 0; i < m; i++) {
            unsigned char c = t[m - 1 - i];
            l = less[c] + w.rank(l, c);
            r = less[c] + w.rank(r, c);
            if (l >= r) {
                l = 0;
                r = 0;
                break;
            }
        }
        return r - l;
    }
};
#line 1 "data-structure/wavelet-matrix/WaveletMatrixString.hpp"
// Static bit vector with constant-time rank, using 32-bit words.
//
// The template parameter T is not used by the implementation; it is kept so
// that existing instantiations such as BitVector<unsigned char> still compile.
template <class T>
class BitVector {
   private:
    unsigned n = 0;             // number of bits stored
    std::vector<unsigned> acc;  // acc[w]: number of set bits in all words before word w
    std::vector<unsigned> bit;  // bit[w]: bits 32w .. 32w+31, first bit in the most significant position

   public:
    // Creates an empty vector. No query may be issued before it is replaced
    // by a vector built from data.
    BitVector() {
    }

    // Builds the rank structure for the bits of b.
    BitVector(const std::vector<bool> &b)
        : n(static_cast<unsigned>(b.size())), acc((n >> 5) + 2, 0), bit((n >> 5) + 2, 0) {
        for (unsigned i = 0; i < n; i++) {
            const unsigned word = i >> 5;
            if ((i & 31) == 0) {
                // Starting a new word: carry the running count forward.
                acc[word + 1] = acc[word];
            }
            if (b[i]) {
                acc[word + 1]++;
                bit[word] |= (1U << (31 - (i & 31)));
            }
        }
    }

    // Number of set bits in positions [0, k). Requires k <= size.
    unsigned rank(unsigned k) const {
        if (!(k & 31)) return acc[k >> 5];
        // Keep only the first (k & 31) bits of the word, which are stored in
        // its most significant positions.
        return acc[k >> 5] + __builtin_popcount(bit[k >> 5] >> (32 - (k & 31)));
    }

    // Value of bit k. Requires k < size.
    bool access(unsigned k) const {
        return (rank(k + 1) - rank(k)) == 1;
    }
};

class WaveletMatrix {
   private:
    unsigned n;
    unsigned bitsize;
    vector<BitVector<unsigned char>> b;
    vector<unsigned> zero;
    vector<int> stInd;
    unsigned char MI, MA;

    // v[l,r) の中で値がk未満の個数を返す
    unsigned rank_less(unsigned l, unsigned r, unsigned char k) {
        unsigned less = 0;
        for (unsigned i = 0; i < bitsize and l < r; i++) {
            const unsigned rank1_l = b[i].rank(l);
            const unsigned rank1_r = b[i].rank(r);
            const unsigned rank0_l = l - rank1_l;
            const unsigned rank0_r = r - rank1_r;
            if (k & (1U << (bitsize - i - 1))) {
                less += (rank0_r - rank0_l);
                l = zero[i] + rank1_l;
                r = zero[i] + rank1_r;
            } else {
                l = rank0_l;
                r = rank0_r;
            }
        }
        return less;
    }

   public:
    // コンストラクタ
    WaveletMatrix() {}
    WaveletMatrix(string v) {
        MI = numeric_limits<unsigned char>::min();
        MA = numeric_limits<unsigned char>::max();
        n = v.size();

        vector<unsigned> tmp(n);
        stInd.resize(256, -1);
        bitsize = 8;
        b.resize(bitsize);
        zero.resize(bitsize, 0);
        vector<bool> bit(n, 0);
        for (unsigned i = 0; i < bitsize; i++) {
            for (unsigned j = 0; j < n; j++) {
                bit[j] = v[j] & (1U << (bitsize - i - 1));
                zero[i] += unsigned(!bit[j]);
                tmp[j] = v[j];
            }
            b[i] = BitVector<unsigned char>(bit);
            int cur = 0;
            for (unsigned j = 0; j < n; j++) {
                if (!bit[j]) {
                    v[cur] = tmp[j];
                    cur++;
                }
            }
            for (unsigned j = 0; j < n; j++) {
                if (bit[j]) {
                    v[cur] = tmp[j];
                    cur++;
                }
            }
        }

        for (unsigned i = 0; i < n; i++) {
            if (stInd[v[i]] == -1) {
                stInd[v[i]] = i;
            }
        }
    }

    // get v[k]
    unsigned char access(unsigned k) {
        unsigned char res = 0;
        unsigned cur = k;
        for (unsigned i = 0; i < bitsize; i++) {
            if (b[i].access(cur)) {
                res |= (1U << (bitsize - i - 1));
                cur = zero[i] + b[i].rank(cur);
            } else {
                cur -= b[i].rank(cur);
            }
        }
        return res;
    }

    // v[0,k) 中でのcの出現回数を返す
    unsigned rank(unsigned k, unsigned char c) {
        unsigned cur = k;
        if (stInd[c] == -1) {
            return 0;
        }
        for (unsigned i = 0; i < bitsize; i++) {
            if (c & (1U << (bitsize - i - 1))) {
                cur = zero[i] + b[i].rank(cur);
            } else {
                cur -= b[i].rank(cur);
            }
        }
        return cur - stInd[c];
    }

    // v[l,r) の中でk番目(1-origin)に小さい値を返す
    unsigned char kth_smallest(unsigned l, unsigned r, unsigned k) {
        unsigned char res = 0;
        for (unsigned i = 0; i < bitsize; i++) {
            unsigned num1 = b[i].rank(r) - b[i].rank(l);
            unsigned num0 = r - l - num1;
            if (num0 < k) {
                res |= (1ULL << (bitsize - i - 1));
                l = zero[i] + b[i].rank(l);
                r = zero[i] + b[i].rank(r);
                k -= num0;
            } else {
                l -= b[i].rank(l);
                r -= b[i].rank(r);
            }
        }
        return res;
    }

    // v[l,r) の中でk番目(1-origin)に大きい値を返す
    unsigned char kth_largest(unsigned l, unsigned r, unsigned k) {
        return kth_smallest(l, r, r - l - k + 1);
    }

    // v[l,r) の中で[mink,maxk)に入る値の個数を返す
    unsigned range_freq(unsigned l, unsigned r, unsigned char mink, unsigned char maxk) {
        if (mink == 0) {
            return rank_less(l, r, maxk);
        }
        return rank_less(l, r, maxk) - rank_less(l, r, mink);
    }

    // v[l,r)の中でvalを超えない最大の値を返す
    unsigned char prev_value(unsigned l, unsigned r, unsigned char val) {
        int num = range_freq(l, r, MI, val);
        if (num == 0) {
            return MA;
        } else {
            return kth_smallest(l, r, num);
        }
    }

    // v[l,r)の中でvalより大きい最小の値を返す
    unsigned char next_value(unsigned l, unsigned r, unsigned char val) {
        int num = range_freq(l, r, MI, val + 1);
        if (num == r - l) {
            return MI;
        } else {
            return kth_smallest(l, r, num + 1);
        }
    }
};
#line 2 "string/SuffixArray.hpp"

// Reads bit i of the packed bit array `t` (bit 0 is the most significant bit
// of byte 0). A byte array named `t` must be in scope where the macro is used.
#define tget(i) (((t)[(i) >> 3] >> (7 - ((i) & 7))) & 1)

// Sets bit i of the packed bit array `t` when b is non-zero, clears it
// otherwise. A byte array named `t` must be in scope where the macro is used.
#define tset(i, b)                                    \
    do {                                              \
        if (b)                                        \
            (t)[(i) >> 3] |= (1 << (7 - ((i) & 7)));  \
        else                                          \
            (t)[(i) >> 3] &= ~(1 << (7 - ((i) & 7))); \
    } while (0)

// Reads symbol i of the text `s`: as an int when `cs == sizeof(int)`, as an
// unsigned char otherwise. `s` and `cs` must be in scope where the macro is used.
#define chr(i) (cs == sizeof(int) ? ((int *)s)[i] : ((unsigned char *)s)[i])
// True when i > 0, bit i of `t` is 1 and bit i - 1 of `t` is 0.
// NOTE: the argument is not parenthesized in the `i > 0` test, so only pass
// expressions that bind tighter than `>` (e.g. `pos + d`).
#define isLMS(i) (i > 0 and tget(i) and !tget(i - 1))

// Fills bkt[0..K] with bucket boundaries for the n symbols of s.
// With end == true, bkt[c] is the index one past the last slot of bucket c;
// with end == false, bkt[c] is the index of the first slot of bucket c.
// cs selects the symbol width that chr() uses to read s.
void getBuckets(unsigned char *s, int *bkt, int n, int K, int cs, bool end) {
    // Histogram of the symbols.
    for (int c = 0; c <= K; ++c) {
        bkt[c] = 0;
    }
    for (int pos = 0; pos < n; ++pos) {
        ++bkt[chr(pos)];
    }
    // Turn the counts into boundaries with a running prefix sum.
    int running = 0;
    for (int c = 0; c <= K; ++c) {
        const int start = running;
        running += bkt[c];
        if (end) {
            bkt[c] = running;
        } else {
            bkt[c] = start;
        }
    }
}

// Left-to-right induction pass: for every entry SA[i], if the preceding text
// position exists and its bit in t is 0, that position is written to the next
// free slot at the front of its symbol's bucket. The bucket pointers in bkt
// are recomputed first via getBuckets (callers pass end == false here).
void induceSAl(unsigned char *t, int *SA, unsigned char *s, int *bkt, int n, int K, int cs, bool end) {
    getBuckets(s, bkt, n, K, cs, end);
    for (int idx = 0; idx < n; ++idx) {
        const int pred = SA[idx] - 1;
        if (pred < 0) {
            continue;  // empty slot (-1) or text position 0: nothing precedes it
        }
        if (tget(pred)) {
            continue;  // bit set: not handled by this pass
        }
        int &slot = bkt[chr(pred)];
        SA[slot] = pred;
        ++slot;
    }
}

// Right-to-left induction pass: for every entry SA[i], if the preceding text
// position exists and its bit in t is 1, that position is written to the next
// free slot at the back of its symbol's bucket. The bucket pointers in bkt
// are recomputed first via getBuckets (callers pass end == true here).
void induceSAs(unsigned char *t, int *SA, unsigned char *s, int *bkt, int n, int K, int cs, bool end) {
    getBuckets(s, bkt, n, K, cs, end);
    for (int idx = n - 1; idx >= 0; --idx) {
        const int pred = SA[idx] - 1;
        if (pred >= 0 && tget(pred)) {
            int &slot = bkt[chr(pred)];
            --slot;
            SA[slot] = pred;
        }
    }
}

// Builds the suffix array of s[0..n) into SA[0..n) by induced sorting (SA-IS).
//
//   s  - the text; read through chr(), i.e. as ints when cs == sizeof(int)
//        and as unsigned chars otherwise
//   SA - output array with room for n ints (also used as scratch space)
//   n  - number of symbols in s
//   K  - upper bound on the symbol values; bucket arrays get K + 1 entries
//   cs - size in bytes of one symbol of s
//
// NOTE(review): the type setup below unconditionally marks the last position
// as bit 1 and the one before it as bit 0, which presumably relies on the
// last symbol being a unique, smallest terminator (suffixArray appends a 0).
void SA_IS(unsigned char *s, int *SA, int n, int K, int cs) {
    // Inputs with fewer than two symbols would make the type setup below
    // touch bit index n - 2 < 0, i.e. write before the start of t.
    if (n <= 0) {
        return;
    }
    if (n == 1) {
        SA[0] = 0;
        return;
    }

    // t is a packed bit array with one type bit per position: 1 when the
    // suffix at i is smaller than the suffix at i + 1, 0 otherwise.
    unsigned char *t = (unsigned char *)malloc(n / 8 + 1);
    tset(n - 2, 0);
    tset(n - 1, 1);
    for (int i = n - 3; i >= 0; i--) {
        tset(i, (chr(i) < chr(i + 1) or (chr(i) == chr(i + 1) and tget(i + 1) == 1)) ? 1 : 0);
    }

    // Stage 1: drop every LMS position at the end of its bucket, then induce
    // the order of the remaining positions from them.
    int *bkt = (int *)malloc(sizeof(int) * (K + 1));
    getBuckets(s, bkt, n, K, cs, true);
    for (int i = 0; i < n; i++) {
        SA[i] = -1;
    }
    for (int i = 1; i < n; i++) {
        if (isLMS(i)) {
            SA[--bkt[chr(i)]] = i;
        }
    }
    induceSAl(t, SA, s, bkt, n, K, cs, false);
    induceSAs(t, SA, s, bkt, n, K, cs, true);
    free(bkt);

    // Compact the LMS positions, in their current order, into SA[0..n1).
    int n1 = 0;
    for (int i = 0; i < n; i++) {
        if (isLMS(SA[i])) {
            SA[n1++] = SA[i];
        }
    }

    // Name the LMS substrings: equal substrings (same symbols and same type
    // bits up to the next LMS position) share a name. Names are stored at
    // SA[n1 + pos / 2].
    for (int i = n1; i < n; i++) {
        SA[i] = -1;
    }
    int name = 0;
    int prev = -1;
    for (int i = 0; i < n1; i++) {
        int pos = SA[i];
        bool diff = false;
        for (int d = 0; d < n; d++) {
            if (prev == -1 or chr(pos + d) != chr(prev + d) or tget(pos + d) != tget(prev + d)) {
                diff = true;
                break;
            } else if (d > 0 and (isLMS(pos + d) or isLMS(prev + d))) {
                break;
            }
        }
        if (diff) {
            name++;
            prev = pos;
        }
        pos = (pos % 2 == 0) ? pos / 2 : (pos - 1) / 2;
        SA[n1 + pos] = name - 1;
    }
    // Pack the names, in text order, into the last n1 slots of SA.
    for (int i = n - 1, j = n - 1; i >= n1; i--) {
        if (SA[i] >= 0) {
            SA[j--] = SA[i];
        }
    }

    // Stage 2: sort the reduced string s1 (the names). Recurse when some
    // names repeat; otherwise each name directly gives its rank.
    int *SA1 = SA;
    int *s1 = SA + n - n1;
    if (name < n1) {
        SA_IS((unsigned char *)s1, SA1, n1, name - 1, sizeof(int));
    } else {
        for (int i = 0; i < n1; i++) {
            SA1[s1[i]] = i;
        }
    }

    // Stage 3: map the reduced suffix array back to LMS text positions, place
    // them at the ends of their buckets in sorted order, and induce the full
    // suffix array.
    bkt = (int *)malloc(sizeof(int) * (K + 1));
    getBuckets(s, bkt, n, K, cs, true);
    for (int i = 1, j = 0; i < n; i++) {
        if (isLMS(i)) {
            s1[j++] = i;
        }
    }
    for (int i = 0; i < n1; i++) {
        SA1[i] = s1[SA1[i]];
    }
    for (int i = n1; i < n; i++) {
        SA[i] = -1;
    }
    for (int i = n1 - 1; i >= 0; i--) {
        int j = SA[i];
        SA[i] = -1;
        SA[--bkt[chr(j)]] = j;
    }
    induceSAl(t, SA, s, bkt, n, K, cs, false);
    induceSAs(t, SA, s, bkt, n, K, cs, true);
    free(bkt);
    free(t);
}

vector<int> suffixArray(string &str) {
    int n = str.size() + 1;
    int *sa = (int *)malloc(sizeof(int) * n);
    unsigned char *s = (unsigned char *)malloc(sizeof(unsigned char) * (n + 2));
    int k = 256;
    int cs = sizeof(unsigned char);
    for (int i = 0; i < str.size(); i++) {
        s[i] = str[i];
    }
    s[int(str.size())] = 0;
    SA_IS((unsigned char *)(s), sa, n, k, cs);
    vector<int> res(n - 1);
    for (int i = 0; i < n - 1; i++) {
        res[i] = sa[i + 1];
    }
    free(sa);
    free(s);
    return res;
}
#line 4 "string/FMindex.hpp"

class FMindex {
   private:
    int n;
    WaveletMatrix w;
    vector<int> less;

   public:
    FMindex(string &s) {
        n = s.size();
        less.resize(256);
        vector<int> cnt(256);
        string bwt(n + 1, '$');
        vector<int> sa = suffixArray(s);
        reverse(sa.begin(), sa.end());
        sa.emplace_back(s.size());
        reverse(sa.begin(), sa.end());

        for (int i = 0; i < s.size() + 1; i++) {
            if (sa[i] > 0) {
                bwt[i] = s[sa[i] - 1];
            }
            cnt[s[i]]++;
        }

        for (int i = 1; i < 256; i++) {
            less[i] = less[i - 1] + cnt[i - 1];
        }
        w = WaveletMatrix(bwt);
    }

    int substrCount(string &t) {
        int m = t.size();
        int l = 0;
        int r = n + 1;
        for (int i = 0; i < m; i++) {
            unsigned char c = t[m - 1 - i];
            l = less[c] + w.rank(l, c);
            r = less[c] + w.rank(r, c);
            if (l >= r) {
                l = 0;
                r = 0;
                break;
            }
        }
        return r - l;
    }
};
Back to top page