Question

众所周知，np.sum(arr) 比 arr.sum() 慢很多。例如：

import numpy as np
np.random.seed(7)
A = np.random.random(1000)
%timeit np.sum(A)
2.94 µs ± 13.8 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
%timeit A.sum()
1.8 µs ± 40.8 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)

有人能给出一个详细的基于代码的解释，说明 np.sum(arr) 执行的操作以及 arr.sum() 不执行的操作吗？

对于更长的数组来说，这种差异并不明显。但是，对于长度为 1000 或更短的数组来说，这种差异就比较明显了。

在我的代码中我做了数百万个数组求和，因此差异特别显著。

在这里，无论输入的大小如何，差异似乎都保持相当稳定。这似乎是通用函数中的一些额外调度开销，而不是直接调用 NDArray 上的方法？ — 
上次我检查时，我认为__array_function__调度机器是造成大部分问题的原因。不知道现在是否仍然如此。 — 
考虑到大小以及已知的 CPU 缓存层次结构的工作机制，明确地将所有“生成”阶段的数据从缓存命中中逐出是公平的，这样才能真正“测量”出下一个具有代表性的数据（错误地享受仍然在当代大型 L1/L2-L3 缓存中的数据的缓存命中（除非明确使用 LRU 策略，由某些“安全”的较大 B=np.random.random( 1E9 ) 手动“逐出”）。 — 
您能否重现相同（顺序独立和缓存独立）的测量结果，如在 — 
1 µs差异是我在比较 numpy 函数和方法时看到的典型差异。（不仅仅是sum）。 —

Answer 1

当我在电脑上以调试模式运行 np.sum(a) 时，它会进入以下代码。

以下是相关代码的部分。

import numpy as np
import types


def _wrapreduction(obj, ufunc, method, axis, dtype, out, **kwargs):
    passkwargs = {k: v for k, v in kwargs.items()
                  if v is not np._NoValue}

    if type(obj) is not np.ndarray:
        raise NotImplementedError

    return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


def copied_np_sum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue, initial=np._NoValue, where=np._NoValue):
    if isinstance(a, types.GeneratorType):
        raise NotImplementedError

    return _wrapreduction(
        a, np.add, 'sum', axis, dtype, out, keepdims=keepdims,
        initial=initial, where=where
    )

请注意，这最终会调用np.add.reduce(a)。

基准：

import timeit


def benchmark(setup, stmt, repeat, number):
    print(f"{stmt:16}: {min(timeit.repeat(setup=setup, stmt=stmt, globals=globals(), repeat=repeat, number=number)) / number}")


n_item = 10 ** 3
n_loop = 1000
n_set = 1000

data_setup = f"""\
import numpy as np
rng = np.random.default_rng(0)
a = rng.random({n_item})
"""

benchmark(setup=data_setup, stmt="np.sum(a)", repeat=n_set, number=n_loop)
benchmark(setup=data_setup, stmt="a.sum()", repeat=n_set, number=n_loop)
benchmark(setup=data_setup, stmt="copied_np_sum(a)", repeat=n_set, number=n_loop)
benchmark(setup=data_setup, stmt="np.add.reduce(a)", repeat=n_set, number=n_loop)

np.sum(a)       : 2.6407251134514808e-06
a.sum()         : 1.3474803417921066e-06
copied_np_sum(a): 2.50667380169034e-06
np.add.reduce(a): 1.195137854665518e-06

如您所见，的copied_np_sum性能与类似np.sum，并且np.add.reduce与类似。因此，和a.sum之间的大部分差异可能是由于调用之前执行的操作。换句话说，这是字典理解和附加函数调用造成的开销。np.suma.sumcopied_np_sumnp.add.reduce

但是，尽管上述基准测试与 OP 的基准测试存在显著差异，但正如评论中指出的那样这可能被夸大了。由于 timeit 重复执行代码并使用（最佳）平均值，因此对于像本基准测试中的小数组，在测量时该数组可能已经在 CPU 缓存中。这不一定是不公平的情况。在实际使用中可能会发生同样的事情。相反，只要有可能，就应该如此。话虽如此，为了得到规范的答案，我们应该对其进行测量。

根据@user3666197的建议，我们可以在创建后立即创建一个大数组以从缓存中a逐出。请注意，我决定在这里使用它，我确认它具有相同的效果但运行速度更快。anp.arange

import timeit


def benchmark(setup, stmt, repeat, number):
    print(f"{stmt:16}: {min(timeit.repeat(setup=setup, stmt=stmt, globals=globals(), repeat=repeat, number=number)) / number}")


n_item = 10 ** 3
n_loop = 1
n_set = 100

data_setup = f"""\
import numpy as np
rng = np.random.default_rng(0)
a = rng.random({n_item})
_ = np.arange(10 ** 9, dtype=np.uint8)  # To evict `a` from the CPU cache.
"""

benchmark(setup=data_setup, stmt="np.sum(a)", repeat=n_set, number=n_loop)
benchmark(setup=data_setup, stmt="a.sum()", repeat=n_set, number=n_loop)
benchmark(setup=data_setup, stmt="copied_np_sum(a)", repeat=n_set, number=n_loop)
benchmark(setup=data_setup, stmt="np.add.reduce(a)", repeat=n_set, number=n_loop)

无驱逐（有缓存）：

np.sum(a)       : 2.6407251134514808e-06
a.sum()         : 1.3474803417921066e-06
copied_np_sum(a): 2.50667380169034e-06
np.add.reduce(a): 1.195137854665518e-06

带驱逐（无缓存）：

np.sum(a)       : 4.916824400424957e-05
a.sum()         : 3.245798870921135e-05
copied_np_sum(a): 4.7205016016960144e-05
np.add.reduce(a): 3.0195806175470352e-05

当然，缓存的有无对性能的影响是巨大的。不过，虽然差异变小了，但还是可以说有很大的差异。另外，由于这四个关系与以前一样，所以结论也是一样的。

我应该补充几点。

注1

关于方法加载的说法是不正确的。

benchmark(setup=f"{data_setup}f = np.sum", stmt="f(a)", repeat=n_set, number=n_loop)
benchmark(setup=f"{data_setup}f = a.sum", stmt="f()", repeat=n_set, number=n_loop)

np.sum(a)       : 4.916824400424957e-05
a.sum()         : 3.245798870921135e-05
f(a)            : 4.6479981392621994e-05  <-- Same as np.sum.
f()             : 3.27317975461483e-05    <-- Same as a.sum.
np.add.reduce(a): 3.0195806175470352e-05  <-- Also, note that this one is fast.

笔记2

正如所有基准测试所显示的那样，np.add.reduce是最快的（开销最小）。如果您的实际应用程序也只处理一维数组，并且如此小的差异对您来说很重要，那么您应该考虑使用np.add.reduce。

注3

实际上，在这种情况下，numba 可能是最快的。

from numba import njit
import numpy as np
import math


@njit(cache=True)
def nb_numpy_sum(a):
    # This will be a reduce sum.
    return np.sum(a)


@njit(cache=True)
def nb_pairwise_sum(a):
    # https://en.wikipedia.org/wiki/Pairwise_summation
    N = 2
    if len(a) <= N:
        return sum(a)  # reduce sum
    else:
        m = len(a) // 2
        return nb_pairwise_sum(a[:m]) + nb_pairwise_sum(a[m:])


@njit(cache=True)
def nb_kahan_sum(a):
    # https://en.wikipedia.org/wiki/Kahan_summation_algorithm
    total = np.zeros(shape=1, dtype=a.dtype)[0]
    c = np.zeros(shape=1, dtype=a.dtype)[0]
    for i in range(len(a)):
        y = a[i] - c
        t = total + y
        c = (t - total) - y
        total = t
    return total


def test():
    candidates = [
        ("np.sum", np.sum),
        ("math.fsum", math.fsum),
        ("nb_numpy_sum", nb_numpy_sum),
        ("nb_pairwise_sum", nb_pairwise_sum),
        ("nb_kahan_sum", nb_kahan_sum),
    ]

    n = 10 ** 7 + 1
    a = np.full(n, 0.1, dtype=np.float64)
    for name, f in candidates:
        print(f"{name:16}: {f(a)}")


test()

准确性：

np.sum          : 1000000.0999999782
math.fsum       : 1000000.1000000001
nb_numpy_sum    : 1000000.0998389754
nb_pairwise_sum : 1000000.1
nb_kahan_sum    : 1000000.1000000001

定时：

np.sum(a)         : 4.769209772348404e-05
a.sum()           : 3.242073580622673e-05
np.add.reduce(a)  : 2.933526411652565e-05
nb_numpy_sum(a)   : 1.1243857443332672e-05
nb_pairwise_sum(a): 1.6139820218086243e-05
nb_kahan_sum(a)   : 1.3509299606084824e-05

请注意，虽然nb_pairwise_sum和nb_kahan_sum的数学精度与 NumPy 相当，但它们都不是 NumPy 实现的精确复制品。因此，无法保证结果与 NumPy 完全相同。

还应该澄清的是，这种差异是由于开销的数量造成的，而对于大型数组（例如> 10000），NumPy 的速度明显更快。

不，永远不会，那样的话，您将失去结果“种群”的所有可塑性。尝试如下操作：(a) 在设置字符串中的“a = …;”和“f = …”命令之间放入“_ = np.random.random( int( 1E9 ) )”，或者甚至 (b) 将“_ = np.random.default_rng( 0 ).random( int( 1E9 ) ); …”放入 stmt 字符串中（+稍后，在收集此类测试多次重复的持续时间数据后，减去此类 1E9 大“缓存清除”数组生成（仅通过测试清除器数据的生成获得）的类似时间（设置为可重现）的持续时间），(c) 子对 time.monotonic_ns() — 
@user3666197 再次感谢您的建议。我使用选项 (a) 重新运行了测试。结果已添加到我的帖子中。为了节省时间，我决定使用 1GB 的数组大小并使用 arange 而不是随机。正如预期的那样，性能发生了显着变化，但问题仍然重现，我的结论仍然相同 🙂 — 
@Simd numba 使用 Reduce 求和，而 numpy 使用成对求和。因此结果不会相同。 — 
（+1）很高兴你发现它相当有帮助 – 你可能喜欢实际延迟成本的更全局图景（因为这些对于低延迟和 HPC 设计都很重要），如果不加以精心策划，可能会损害并行编程（或矢量化语法的使用）由于增加了阿姆达尔定律预期加速的成本…… — 
@Simd 我在 numba 示例中添加了成对求和和 kahan 求和（因为我很好奇）。发现对于小数组来说，它仍然比 numpy 快。 —

Answer 2

这种差异仅发生在足够小的数组中，并且实际上可以忽略不计（但很重要）。

如果您查看字节码，np.sum(A)必须从模块中获取方法，这可能会对小的延迟产生一点点影响。

# np.sum(A)
  6           0 LOAD_GLOBAL              0 (np)
              2 LOAD_METHOD              1 (sum)
              4 LOAD_FAST                0 (A)
              6 CALL_METHOD              1
              8 RETURN_VALUE

# A.sum()
  9           0 LOAD_FAST                0 (A)
              2 LOAD_METHOD              0 (sum)
              4 CALL_METHOD              0
              6 RETURN_VALUE

此外，还有一条稍微不同的路径可以到达实际执行计算的代码（例如np.sum检查输入不是生成器等），这会增加不同的开销，但总的来说，使用的算法/代码是相同的。实际上在文档中提到。

如果对这两个代码运行分析器，您会发现区别在于fromnumeric.py（请参阅，其中也讨论了这一点）：

import cProfile

A = np.array([1])

cProfile.run('np.sum(A)')

#          10 function calls in 0.000 seconds

#    Ordered by: standard name

#    ncalls  tottime  percall  cumtime  percall filename:lineno(function)
#         1    0.000    0.000    0.000    0.000 <string>:1(<module>)
#         1    0.000    0.000    0.000    0.000 fromnumeric.py:2172(_sum_dispatcher)
#         1    0.000    0.000    0.000    0.000 fromnumeric.py:2177(sum)
#         1    0.000    0.000    0.000    0.000 fromnumeric.py:71(_wrapreduction)
#         1    0.000    0.000    0.000    0.000 fromnumeric.py:72(<dictcomp>)
#         1    0.000    0.000    0.000    0.000 {built-in method builtins.exec}
#         1    0.000    0.000    0.000    0.000 {built-in method builtins.isinstance}
#         1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
#         1    0.000    0.000    0.000    0.000 {method 'items' of 'dict' objects}
#         1    0.000    0.000    0.000    0.000 {method 'reduce' of 'numpy.ufunc' objects}


cProfile.run('A.sum()')

#          6 function calls in 0.000 seconds

#    Ordered by: standard name

#    ncalls  tottime  percall  cumtime  percall filename:lineno(function)
#         1    0.000    0.000    0.000    0.000 <string>:1(<module>)
#         1    0.000    0.000    0.000    0.000 _methods.py:47(_sum)
#         1    0.000    0.000    0.000    0.000 {built-in method builtins.exec}
#         1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
#         1    0.000    0.000    0.000    0.000 {method 'reduce' of 'numpy.ufunc' objects}
#         1    0.000    0.000    0.000    0.000 {method 'sum' of 'numpy.ndarray' objects}

有趣的是，从列表开始时也有同样的差异：

您说得对，这只适用于不太长的数组。例如，对于长度为 1000 或更短的数组，差异就很大。 — 
@Simd 是的，这很重要（总是有差异），但可以忽略不计（时间很短） — 
在实际执行操作之前可能还会进行一些额外的检查np.sum。我的观点是，这只是开销，而不是算法的根本区别。实际上，的文档指np.narray.sum的是np.sum。 — 
我的代码进行了数百万次这样的求和，因此差别很大。 — 
@user3666197 好问题，但我还没有足够的知识来测试这一点；） —

python – 是什么让np.sum变慢了？ – 堆栈内存溢出

最佳答案
2

注1

笔记2

注3

最佳答案 2

注1

笔记2

注3

最佳答案
2