Use Intel SVML
I have no working numexpr
with Intel SVML, but numexpr
with working SVML should perform as well as Numba. The Numba
benchmarks show much the same behaviour without SVML, but perform much better with SVML.
Code
import numpy as np
import numba as nb
# Benchmark input: 1100 x 1100 array of samples drawn uniformly from [-1, 1).
myarr = np.random.uniform(-1,1,[1100,1100])
@nb.njit(error_model="numpy", parallel=True)
def func(arr, div):
    """Evaluate the Gaussian-shaped kernel ``exp(-0.5 * (arr / div)**2)``.

    Parameters
    ----------
    arr : array
        Input values; the expression is applied element-wise.
    div : scalar
        Divisor applied to ``arr`` before squaring.

    Returns
    -------
    array
        Result of ``np.exp(-0.5 * (arr / div) ** 2)``.

    Notes
    -----
    ``error_model="numpy"`` asks Numba to follow NumPy's division
    semantics instead of raising on division by zero, and
    ``parallel=True`` enables Numba's automatic parallelization of the
    array expression.
    """
    # Operate on the ``arr`` argument rather than the module-level
    # ``myarr``: Numba treats globals as compile-time constants, so
    # referencing the global would ignore whatever the caller passes in.
    return np.exp(-0.5 * (arr / div) ** 2)
Timings
#Core i7 4771
#Windows 7 x64
#Anaconda Python 3.5.5
#Numba 0.41 (compilation overhead excluded)
func(myarr,0.1) -> 3.6ms
func(myarr,0.001) -> 3.8ms
#Numba (set NUMBA_DISABLE_INTEL_SVML=1), parallel=True
func(myarr,0.1) -> 5.19ms
func(myarr,0.001) -> 12.0ms
#Numba (set NUMBA_DISABLE_INTEL_SVML=1), parallel=False
func(myarr,0.1) -> 16.7ms
func(myarr,0.001) -> 63.2ms
#Numpy (1.13.3), set OMP_NUM_THREADS=4
np.exp( - 0.5 * (myarr / 0.001)**2 ) -> 70.82ms
np.exp( - 0.5 * (myarr / 0.1)**2 ) -> 12.58ms
#Numpy (1.13.3), set OMP_NUM_THREADS=1
np.exp( - 0.5 * (myarr / 0.001)**2 ) -> 189.4ms
np.exp( - 0.5 * (myarr / 0.1)**2 ) -> 17.4ms
#Numexpr (2.6.8), no SVML, parallel
ne.evaluate("exp( - 0.5 * (myarr / 0.001)**2 )") ->17.2ms
ne.evaluate("exp( - 0.5 * (myarr / 0.1)**2 )") ->4.38ms
#Numexpr (2.6.8), no SVML, single threaded
ne.evaluate("exp( - 0.5 * (myarr / 0.001)**2 )") ->50.85ms
ne.evaluate("exp( - 0.5 * (myarr / 0.1)**2 )") ->13.9ms