快速幂与光速幂

简介

本文将介绍：快速幂、龟速乘、矩阵快速幂、欧拉降幂、光速幂。

快速幂在众多数论中有非常重要的地位，也是其中最频繁用到的部分，朴素地求 $a^b$ 时我们可以做到 $O(n)$ 的时间复杂度，而快速幂将其提升到 $O(\log_2{n})$ 。

本文除了介绍快速幂还会介绍相关拓展，如上面列出。

快速幂

其实快速幂的思想很简单，我们可以求出 $a,~a^2,~a^4,~a^8~\dots$ ，而任意所求的 $a^b$ 一定可以表示成若干个 $a^{2^n}$ 的乘积，而这个数量级一定是 $O(\log_2{n})$ 的。

// 不带模数的快速幂, 基本不用
int quick_pow(int base, int exponent) {
    int res = 1;
    while (exponent) {
        if (exponent & 1)
            res *= base;
        base *= base;
        exponent >>= 1;
    }
    return res;
}

// 带模数的快速幂
int quick_pow(int base, int exponent, int modulo = MOD) {
    int res = 1 % modulo;
    base %= modulo;
    while (exponent) {
        if (exponent & 1)
            res = res * base % modulo;
        base = base * base % modulo;
        exponent >>= 1;
    }
    return res;
}

龟速乘

幂是连续的乘法，而乘则可以看作连续的加法，按照这个思路在乘法中也可使用这种做法。

龟速乘非常形象，本身乘法是 $O(1)$ 的，而龟速乘将其降到了 $O(\log_2{n})$ 。但实际有时我们必须要使用这样一种算法，比如计算 $1e12+7$ 与 $1e12+9$ 的乘积对 $1e13+3$ 取模时，直接相乘取模会溢出。

链接：没找到相关模板题。

// 龟速乘 - Useless. 复杂度: O(log(multiplier))
const int MOD = 1000000007;
int slow_mul(int base, int multiplier) {
    int res = 0;
    base %= MOD;
    multiplier %= MOD;
    while (multiplier) {
        if (multiplier & 1)
            res = (res + base) % MOD;
        base = (base << 1) % MOD;
        multiplier >>= 1;
    }
    return res;
}

其实龟速乘很少用，这里只是介绍一个思路。上面讲到的问题有更好的解决方案，我们使用浮点乘，这样做一定要小心浮点数的精度问题。C++ 还提供了 __int128_t 只是有时候用不了。

// 用于解决 1e18 级别取模. 通常使用浮点乘更快
// 考虑效率 __int128_t 更快约 20%
int float_mul(int x, int y, int modulo) {
    int d = (long double)x / modulo * y + 0.5;
    int r = x * y - d * modulo;
    if (r < 0) r += modulo;
    return r;
}

矩阵快速幂

首先你需要了解矩阵乘法的定义，然后你需要知道矩阵乘法同样满足结合律，因此它可以使用快速幂的思想。

其中最典型的问题就是求解斐波那契数列。

链接：洛谷 P3390 【模板】矩阵快速幂

const int MOD = 1000000007;

// 矩阵快速幂. 复杂度: O(log(exponent)), 取模需写入 Matrix类中
class Matrix {
   public:
    vector<vector<int>> data;
    int n, m;
    Matrix() : n(0), m(0) {}
    Matrix(int n, int m) : data(vector<vector<int>>(n, vector<int>(m, 0))), n(n), m(m) {}
    Matrix(vector<vector<int>> vec) : data(vec), n(data.size()), m(data.front().size()) {}
    vector<int> &operator[](int idx) { return data[idx]; }
    Matrix operator*(Matrix &x) {
        if (n != x.m)
            throw "Matrix size error.";
        Matrix res(x.n, m);
        for (int i = 0; i < x.n; i++)
            for (int j = 0; j < m; j++)
                for (int k = 0; k < n; k++)
                    res[i][j] = (res[i][j] + data[k][j] * x[i][k]) % MOD;
        return res;
    }
    Matrix operator*=(Matrix &x) { return *this = *this * x; }
};

Matrix matrix_pow(Matrix base, int exponent) {
    Matrix res = base;
    exponent--;
    while (exponent) {
        if (exponent & 1)
            res *= base;
        base *= base;
        exponent >>= 1;
    }
    return res;
}

欧拉降幂（扩展欧拉定理）

这部分与快速幂思想无关。幂在模意义上具有循环节，而扩展欧拉定理证明了这个循环节的位置，这使得指数很大时快速幂依然具有优秀的时间复杂度。

对于任意数 $a$ 和 $p$ ，如果 $gcd(a,p)=1$ ，满足欧拉定理： $a^{\phi(p)} \equiv 1 \pmod{p}$ 。

如对任意数 $a$ 和 $p$ ，可得扩展欧拉定理，又称欧拉降幂：

$a^x \equiv \left\{ \begin{array}{ll} a^{x\%\phi(p)} & ,gcd(a,p)=1 \\ a^x & ,gcd(a,p)\neq1 & ,x<\phi(p) \\ a^{x\%\phi(p)+\phi(p)} & ,gcd(a,p)\neq1 & ,x\ge\phi(p) \\ \end{array} \right. \pmod{p}$

我们可以归纳出一个粗糙且更实用的公式：

$a^x \equiv a^{(x-\phi(p))\%\phi(p)+\phi(p)} \pmod{p}$

其中 $\phi(x)$ 表示欧拉函数。

链接：洛谷 P5091 【模板】扩展欧拉定理

const int MOD = 1000000007;

// 快速幂. 复杂度: O(log(exponent))
int quick_pow(int base, int exponent) {
    int res = 1 % MOD;
    base %= MOD;
    while (exponent) {
        if (exponent & 1)
            res = res * base % MOD;
        base = base * base % MOD;
        exponent >>= 1;
    }
    return res;
}

// 欧拉函数. 求区间 [1, x] 中与 x 互质的数的个数. 复杂度: O(logx) -> O(x^0.5).
int _euler(int x) {
    if (x < 2)
        return 0;
    int res = x;
    for (int i = 2; i <= x / i; i++)
        if (x % i == 0) {
            res = res / i * (i - 1);
            while (x % i == 0)
                x /= i;
        }
    if (x > 1)
        res = res / x * (x - 1);
    return res;
}
inline int euler(int x) {
    static int old = -1, res = -1;
    if (x == old) return res;
    old = x, res = _euler(x);
    return res;
}

// 扩展欧拉定理. 求 base ^ exponent = x (mod MOD). 复杂度: o(log(min(exponent, MOD)))
// 用于处理 exponent 很大的幂数, 如 base ^ (a ^ b) = x (mod MOD)
int exeuler(int base, int exponent) {
    if (__gcd(base, MOD) == 1)
        return quick_pow(base, exponent % euler(MOD));
    else if (exponent < euler(MOD))
        return quick_pow(base, exponent);
    else
        return quick_pow(base, exponent % euler(MOD) + euler(MOD));
}

// 粗糙版扩展欧拉定理. 区别是没有降幂至最低, 但是写得快.
#define exmod(x) ((x) < euler(MOD) ? (x) : (x) % euler(MOD) + euler(MOD))
int exeuler(int base, int exponent) { return quick_pow(base, exmod(exponent)); }

光速幂

光速幂实际也与快速幂无关，但用于解决相同问题。

光速幂在 $O(\sqrt{n})$ 预处理后可以 $O(1)$ 询问幂，局限性是每次询问底数和模数不变。

光速幂基于分块，设定 $siz$ ，只需预处理所有 $[0,siz)$ 和 $siz$ 的倍数即可，询问时 $a^b=a^{b/siz} \times a^{b\%siz}$ 。结合扩展欧拉定理，当 $siz \approx \sqrt{2\phi(n)}$ 时最优。

链接：没找到相关模板题。

const int MOD = 1000000007;
const int SQR = 65536;  // 严格大小 sqrt(euler(MOD) * 2)

int euler_modulo, gcd_base_modulo, size_block;
int size_pow[SQR];
int base_pow[SQR];

// 欧拉函数. 复杂度: O(logx) -> O(sqrt(x)).
int euler(int x) {
    if (x < 2)
        return 0;
    int res = x;
    for (int i = 2; i <= x / i; i++)
        if (x % i == 0) {
            res = res / i * (i - 1);
            while (x % i == 0)
                x /= i;
        }
    if (x > 1)
        res = res / x * (x - 1);
    return res;
}

// 光速幂预处理. 基于分块, 时间复杂度: O(sqrt(euler(base)))
void load_supper_pow(int base) {
    base %= MOD;
    ::euler_modulo = euler(MOD);
    ::gcd_base_modulo = __gcd(base, MOD);
    ::size_block = sqrt(euler_modulo * 2);
    base_pow[0] = 1;
    for (int i = 1; i <= size_block; i++)
        base_pow[i] = base_pow[i - 1] * base % MOD;
    size_pow[0] = 1;
    for (int i = 1, lim = euler_modulo * 2 / size_block; i <= lim; i++)
        size_pow[i] = size_pow[i - 1] * base_pow[size_block] % MOD;
}

// 光速幂 + 扩展欧拉定理. 时间复杂度: O(1)
int supper_pow(int exponent) {
    if (exponent < euler_modulo)
        ;
    else if (gcd_base_modulo == 1)
        exponent = exponent % euler_modulo;
    else
        exponent = exponent % euler_modulo + euler_modulo;
    return size_pow[exponent / size_block] * base_pow[exponent % size_block] % MOD;
}