From 1be165936bbc7cf839cd9f56246919e0bb66203b Mon Sep 17 00:00:00 2001 From: y Date: Fri, 28 Jul 2023 15:15:32 -0500 Subject: [PATCH 01/38] Transition from numpy.distutils to scikit-build In preparation to switch from using deprecated ICC to its successor ICX, changed build system from deprecated numpy.distutils to scikit-build. Renamed files: loops_intel -> mkl_umath_loops Bumped up the version of the package from 0.1.1 to 0.1.2 Co-authored-by: Oleksandr Pavlyk Co-authored-by: Andres Guzman-Ballen --- .gitignore | 96 +++++ CMakeLists.txt | 124 +++++++ build.sh | 18 + icpx_for_conda.cfg | 1 + mkl_umath/_version.py | 2 +- mkl_umath/generate_umath.py | 16 +- mkl_umath/src/{patch.pyx => _patch.pyx} | 0 mkl_umath/src/fast_loop_macros.h | 4 + mkl_umath/src/loops_intel.h.src | 306 ---------------- ...oops_intel.c.src => mkl_umath_loops.c.src} | 327 ++++++++---------- mkl_umath/src/mkl_umath_loops.h.src | 306 ++++++++++++++++ mkl_umath/tests/test_basic.py | 6 +- setup.py | 149 +++++--- mkl_umath/setup.py => template | 0 14 files changed, 812 insertions(+), 543 deletions(-) create mode 100644 .gitignore create mode 100644 CMakeLists.txt create mode 100644 build.sh create mode 100644 icpx_for_conda.cfg rename mkl_umath/src/{patch.pyx => _patch.pyx} (100%) delete mode 100644 mkl_umath/src/loops_intel.h.src rename mkl_umath/src/{loops_intel.c.src => mkl_umath_loops.c.src} (88%) create mode 100644 mkl_umath/src/mkl_umath_loops.h.src rename mkl_umath/setup.py => template (100%) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7cc71d7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,96 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions and binary files +*.o +*.so +*.so.* +*.exe +*.lib +*.dll + +# CMake build and local install directory +build +_skbuild +build_cmake +install + +# Code project files +.vscode + +# Eclipse project files +.project +.pydevproject + +# Emacs temp files +*~ + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg +dpctl_conda_pkg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.hypothesis/ +junit.xml + +# Translations +*.mo +*.pot + +# Django stuff: +*.log + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# pyenv python configuration file +.python-version + +_cmake_test_compile + +# generated numpy files +mkl_umath/src/__umath_generated.c +mkl_umath/src/mkl_umath_loops.c +mkl_umath/src/mkl_umath_loops.h +mkl_umath/src/_patch.c + +# moved cmake scripts +dpctl/resources/cmake diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..87adc4c --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,124 @@ +cmake_minimum_required(VERSION 3.21...3.25 FATAL_ERROR) + +if (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.24") + cmake_policy(SET CMP0135 NEW) +endif() + +project(mkl_umath + LANGUAGES C + DESCRIPTION "mkl_umath module" +) + +find_package(Python COMPONENTS Interpreter Development REQUIRED) +find_package(NumPy REQUIRED) +find_package(PythonExtensions REQUIRED) + +set(CYTHON_FLAGS "-t -w \"${CMAKE_SOURCE_DIR}\"") +find_package(Cython REQUIRED) + +set(MKL_ARCH intel64) +set(MKL_LINK sdl) +set(MKL_THREADING intel_thread) +set(MKL_INTERFACE ilp64) +# MKL_ARCH: None, set to ` intel64` by default +# MKL_ROOT /localdisk/work/aguzmanb/Development/miniconda3.py310/envs/numpy_umath_prefix.v5 +# MKL_DPCPP_LINK: None, set to ` dynamic` by default +# MKL_LINK: None, set to ` dynamic` by default +# MKL_DPCPP_INTERFACE_FULL: None, set to ` intel_ilp64` by default +# MKL_INTERFACE_FULL: None, set to ` intel_ilp64` by default +# MKL_DPCPP_THREADING: None, set to ` tbb_thread` by default +# MKL_THREADING: None, set to ` intel_thread` by default +find_package(MKL REQUIRED) + +if(WIN32) + string(CONCAT WARNING_FLAGS + "-Wall " + "-Wextra " + "-Winit-self " + "-Wunused-function " + "-Wuninitialized " + "-Wmissing-declarations " + "-Wstrict-prototypes " + "-Wno-unused-parameter " + ) + string(CONCAT SDL_FLAGS + "/GS " + "/DynamicBase " + ) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Ox ${WARNING_FLAGS} ${SDL_FLAGS}") + set(CMAKE_C_FLAGS_DEBUG + "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG" + ) + set(MKL_UMATH_LDFLAGS "/NXCompat;/DynamicBase") +elseif(UNIX) + string(CONCAT WARNING_FLAGS + "-Wall " + "-Wextra " + "-Winit-self " + "-Wunused-function " + "-Wuninitialized " + "-Wmissing-declarations " + "-Wstrict-prototypes " + "-Wno-unused-parameter " + "-fdiagnostics-color=auto " + ) + string(CONCAT SDL_FLAGS + "-fstack-protector " + "-fstack-protector-all " + "-fpic " + "-fPIC " + "-D_FORTIFY_SOURCE=2 " + "-Wformat " + "-Wformat-security " +# "-fno-strict-overflow " # no-strict-overflow is implied by -fwrapv + "-fno-delete-null-pointer-checks " + "-fwrapv " + ) + string(CONCAT CFLAGS + "${WARNING_FLAGS}" + "${SDL_FLAGS}" + ) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 ${CFLAGS}") + set(CMAKE_C_FLAGS_DEBUG + "${CMAKE_C_FLAGS_DEBUG} ${CFLAGS} -O0 -g1 -DDEBUG" + ) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-incompatible-function-pointer-types ${CFLAGS}") + set(MKL_UMATH_LDFLAGS "-z,noexecstack,-z,relro,-z,now") +else() + message(FATAL_ERROR "Unsupported system.") +endif() + +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) +# set_property(GLOBAL PROPERTY GLOBAL_DEPENDS_DEBUG_MODE 1) +set(_linker_options "LINKER:${MKL_UMATH_LDFLAGS}") + +set(_trgt mkl_umath_loops) +add_library(${_trgt} SHARED 
"mkl_umath/src/mkl_umath_loops.c") +set_target_properties(${_trgt} PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON) +target_include_directories(${_trgt} PRIVATE "mkl_umath/src/" ${NumPy_INCLUDE_DIR} ${PYTHON_INCLUDE_DIR} ${MKL_INCLUDE_DIR}) +target_link_libraries(${_trgt} PRIVATE mkl_rt) +target_link_options(${_trgt} PRIVATE ${_linker_options}) +install(TARGETS ${_trgt} LIBRARY DESTINATION mkl_umath) + +add_library(_ufuncs MODULE "mkl_umath/src/ufuncsmodule.c" "mkl_umath/src/__umath_generated.c") +target_include_directories(_ufuncs PRIVATE "mkl_umath/src" ${NumPy_INCLUDE_DIR} ${MKL_INCLUDE_DIR}) +target_compile_definitions(_ufuncs PUBLIC NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION) +target_link_options(_ufuncs PRIVATE ${_linker_options}) +target_link_libraries(_ufuncs mkl_umath_loops) +python_extension_module(_ufuncs) +if (UNIX) + set_target_properties(_ufuncs PROPERTIES INSTALL_RPATH "$ORIGIN") +endif() +install(TARGETS _ufuncs LIBRARY DESTINATION mkl_umath) + +add_cython_target(_patch "mkl_umath/src/_patch.pyx" C OUTPUT_VAR _generated_src) +add_library(_patch MODULE ${_generated_src}) +target_include_directories(_patch PRIVATE "mkl_umath/src/" ${NumPy_INCLUDE_DIR} ${PYTHON_INCLUDE_DIR}) +target_compile_definitions(_patch PUBLIC NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION) +target_link_libraries(_patch mkl_umath_loops) +python_extension_module(_patch) +if (UNIX) + set_target_properties(_patch PROPERTIES INSTALL_RPATH "$ORIGIN") +endif() +install(TARGETS _patch LIBRARY DESTINATION mkl_umath) diff --git a/build.sh b/build.sh new file mode 100644 index 0000000..bd34337 --- /dev/null +++ b/build.sh @@ -0,0 +1,18 @@ +# This is necessary to help DPC++ find Intel libraries such as SVML, IRNG, etc in build prefix +export BUILD_PREFIX=$CONDA_PREFIX +export HOST=x86_64-conda-linux-gnu +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${BUILD_PREFIX}/lib" + +# Intel LLVM must cooperate with compiler and sysroot from conda +echo "--gcc-toolchain=${BUILD_PREFIX} --sysroot=${BUILD_PREFIX}/${HOST}/sysroot -target ${HOST}" > icpx_for_conda.cfg +export ICPXCFG="$(pwd)/icpx_for_conda.cfg" +export ICXCFG="$(pwd)/icpx_for_conda.cfg" + +# if [ -e "_skbuild" ]; then +# python setup.py clean --all +# fi + +export CMAKE_GENERATOR="Ninja" +SKBUILD_ARGS="-- -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON" +echo "python setup.py install ${SKBUILD_ARGS}" +python setup.py install ${SKBUILD_ARGS} diff --git a/icpx_for_conda.cfg b/icpx_for_conda.cfg new file mode 100644 index 0000000..d828bd2 --- /dev/null +++ b/icpx_for_conda.cfg @@ -0,0 +1 @@ +--gcc-toolchain=/localdisk/work/aguzmanb/Development/miniconda3.py310/envs/numpy_umath_prefix.v6 --sysroot=/localdisk/work/aguzmanb/Development/miniconda3.py310/envs/numpy_umath_prefix.v6/x86_64-conda-linux-gnu/sysroot -target x86_64-conda-linux-gnu diff --git a/mkl_umath/_version.py b/mkl_umath/_version.py index df9144c..10939f0 100644 --- a/mkl_umath/_version.py +++ b/mkl_umath/_version.py @@ -1 +1 @@ -__version__ = '0.1.1' +__version__ = '0.1.2' diff --git a/mkl_umath/generate_umath.py b/mkl_umath/generate_umath.py index 7ff39b2..cc2034f 100644 --- a/mkl_umath/generate_umath.py +++ b/mkl_umath/generate_umath.py @@ -343,12 +343,6 @@ def english_upper(s): None, TD(inexactvec + cmplxvec), ), -'floor_divide': - Ufunc(2, 1, None, - docstrings.get('numpy.core.umath.floor_divide'), - None, - TD(inexactvec + cmplxvec), - ), 'true_divide': Ufunc(2, 1, None, docstrings.get('numpy.core.umath.true_divide'), @@ -797,16 +791,16 @@ def make_arrays(funcdict): tname = 
english_upper(chartoname[t.type]) datalist.append('(void *)NULL') funclist.append( - '%s_%s_%s_%s' % (tname, t.in_, t.out, name)) + 'mkl_umath_%s_%s_%s_%s' % (tname, t.in_, t.out, name)) elif isinstance(t.func_data, FuncNameSuffix): datalist.append('(void *)NULL') tname = english_upper(chartoname[t.type]) funclist.append( - '%s_%s_%s' % (tname, name, t.func_data.suffix)) + 'mkl_umath_%s_%s_%s' % (tname, name, t.func_data.suffix)) elif t.func_data is None: datalist.append('(void *)NULL') tname = english_upper(chartoname[t.type]) - funclist.append('%s_%s' % (tname, name)) + funclist.append('mkl_umath_%s_%s' % (tname, name)) if t.simd is not None: for vt in t.simd: code2list.append(textwrap.dedent("""\ @@ -936,8 +930,10 @@ def make_code(funcdict, filename): Please make changes to the code generator program (%s) **/ #include "Python.h" + #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION + #include "numpy/arrayobject.h" #include "numpy/ufuncobject.h" - #include "loops_intel.h" + #include "mkl_umath_loops.h" %s static int diff --git a/mkl_umath/src/patch.pyx b/mkl_umath/src/_patch.pyx similarity index 100% rename from mkl_umath/src/patch.pyx rename to mkl_umath/src/_patch.pyx diff --git a/mkl_umath/src/fast_loop_macros.h b/mkl_umath/src/fast_loop_macros.h index 50f9d41..d26174c 100644 --- a/mkl_umath/src/fast_loop_macros.h +++ b/mkl_umath/src/fast_loop_macros.h @@ -41,6 +41,10 @@ #define NPY_PRAGMA_VECTOR _Pragma("vector") #define NPY_PRAGMA_NOVECTOR _Pragma("novector") #define NPY_ASSUME_ALIGNED(p, b) __assume_aligned((p), (b)); +#elif defined(__clang__) +#define NPY_PRAGMA_VECTOR _Pragma("clang loop vectorize(enable)") +#define NPY_PRAGMA_NOVECTOR _Pragma("clang loop vectorize(disable)") +#define NPY_ASSUME_ALIGNED(p, b) #else #define NPY_PRAGMA_VECTOR _Pragma("GCC ivdep") #define NPY_PRAGMA_NOVECTOR diff --git a/mkl_umath/src/loops_intel.h.src b/mkl_umath/src/loops_intel.h.src deleted file mode 100644 index c45bab4..0000000 --- a/mkl_umath/src/loops_intel.h.src +++ /dev/null @@ -1,306 +0,0 @@ -/* - * Copyright (c) 2019-2021, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _MKL_UMATH_LOOPS_H_ -#define _MKL_UMATH_LOOPS_H_ - -#include "numpy/ndarraytypes.h" - -#include - -/**begin repeat - * Float types - * #TYPE = FLOAT, DOUBLE# - */ - -NPY_NO_EXPORT void -@TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_invsqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_exp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_exp2(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_expm1(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_erf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_log(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_log2(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_log10(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_log1p(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_cos(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_sin(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_tan(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_arccos(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_arcsin(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_arctan(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_cosh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_sinh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_tanh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_arccosh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_arcsinh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_arctanh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_fabs(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_floor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_ceil(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void 
-@TYPE@_rint(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_trunc(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_cbrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -/**begin repeat1 - * Arithmetic - * # kind = add, subtract, multiply, divide# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -/**begin repeat1 - * Arithmetic - * # kind = equal, not_equal, less, less_equal, greater, greater_equal, - * logical_and, logical_or# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -@TYPE@_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -/**begin repeat1 - * #kind = isnan, isinf, isfinite, signbit# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -@TYPE@_spacing(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - - -NPY_NO_EXPORT void -@TYPE@_copysign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_nextafter(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -/**begin repeat1 - * #kind = maximum, minimum, fmax, fmin# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -@TYPE@_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_divmod(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@_conjugate(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_negative(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_positive(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_modf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_frexp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_ldexp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_ldexp_long(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -#define 
@TYPE@_true_divide @TYPE@_divide - -/**end repeat**/ - -/* - ***************************************************************************** - ** COMPLEX LOOPS ** - ***************************************************************************** - */ - -#define CGE(xr,xi,yr,yi) (xr > yr || (xr == yr && xi >= yi)); -#define CLE(xr,xi,yr,yi) (xr < yr || (xr == yr && xi <= yi)); -#define CGT(xr,xi,yr,yi) (xr > yr || (xr == yr && xi > yi)); -#define CLT(xr,xi,yr,yi) (xr < yr || (xr == yr && xi < yi)); -#define CEQ(xr,xi,yr,yi) (xr == yr && xi == yi); -#define CNE(xr,xi,yr,yi) (xr != yr || xi != yi); - -/**begin repeat - * complex types - * #TYPE = CFLOAT, CDOUBLE# - */ - -/**begin repeat1 - * arithmetic - * #kind = add, subtract# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -@TYPE@_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@TYPE@_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); - - -/**begin repeat1 - * arithmetic - * #kind = greater, greater_equal, less, less_equal, equal, - not_equal, logical_and, logical_or, logical_xor, logical_not, - isnan, isinf, isfinite# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -NPY_NO_EXPORT void -@TYPE@_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@_conjugate(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@__arg(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@TYPE@_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)); - -/**begin repeat1 - * arithmetic - * #kind = maximum, minimum, fmax, fmin# - */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)); -/**end repeat1**/ - -#define @TYPE@_true_divide @TYPE@_divide - -/**end repeat**/ - -#undef CGE -#undef CLE -#undef CGT -#undef CLT -#undef CEQ -#undef CNE - -#endif diff --git a/mkl_umath/src/loops_intel.c.src b/mkl_umath/src/mkl_umath_loops.c.src similarity index 88% rename from mkl_umath/src/loops_intel.c.src rename to mkl_umath/src/mkl_umath_loops.c.src index 0a199dc..b5cbbaf 100644 --- a/mkl_umath/src/loops_intel.c.src +++ b/mkl_umath/src/mkl_umath_loops.c.src @@ -29,7 +29,6 @@ #include "mkl.h" #include #include -#include "mathimf.h" #include "Python.h" #define NPY_NO_DEPRECATED_API NPY_API_VERSION @@ -40,7 +39,7 @@ #include "numpy/ufuncobject.h" #include "numpy/npy_math.h" #include "blocking_utils.h" -#include "loops_intel.h" +#include "mkl_umath_loops.h" /* Adapated from NumPy's source code. 
* https://github.com/numpy/numpy/blob/main/LICENSE.txt */ @@ -223,8 +222,8 @@ divmod@c@(@type@ a, @type@ b, @type@ *modulus) * #scalarf = sqrtf, sqrt# */ -NPY_NO_EXPORT void -@TYPE@_sqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_sqrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -251,8 +250,8 @@ NPY_NO_EXPORT void * #scalarf = (1.0f)/sqrtf, (1.0)/sqrt# */ -NPY_NO_EXPORT void -@TYPE@_invsqrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_invsqrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -280,8 +279,8 @@ NPY_NO_EXPORT void * #scalarf = expf, exp# */ -NPY_NO_EXPORT void -@TYPE@_exp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_exp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { int ignore_fpstatus = 0; @@ -318,8 +317,8 @@ NPY_NO_EXPORT void */ /* TODO: Use VML */ -NPY_NO_EXPORT void -@TYPE@_exp2(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_exp2(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP_DISPATCH( DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) @@ -339,8 +338,8 @@ NPY_NO_EXPORT void * #scalarf = expm1f, expm1# */ -NPY_NO_EXPORT void -@TYPE@_expm1(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_expm1(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -367,8 +366,8 @@ NPY_NO_EXPORT void * #scalarf = erff, erf# */ -NPY_NO_EXPORT void -@TYPE@_erf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_erf(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -395,8 +394,8 @@ NPY_NO_EXPORT void * #scalarf = logf, log# */ -NPY_NO_EXPORT void -@TYPE@_log(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_log(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -424,8 +423,8 @@ NPY_NO_EXPORT void */ /* TODO: Use VML */ -NPY_NO_EXPORT void -@TYPE@_log2(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_log2(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP_DISPATCH( DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) @@ -445,8 +444,8 @@ NPY_NO_EXPORT void * #scalarf = log10f, log10# */ -NPY_NO_EXPORT void -@TYPE@_log10(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_log10(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -473,8 +472,8 @@ NPY_NO_EXPORT void * #scalarf = log1pf, log1p# */ -NPY_NO_EXPORT void 
-@TYPE@_log1p(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_log1p(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -501,8 +500,8 @@ NPY_NO_EXPORT void * #scalarf = cosf, cos# */ -NPY_NO_EXPORT void -@TYPE@_cos(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_cos(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -529,8 +528,8 @@ NPY_NO_EXPORT void * #scalarf = sinf, sin# */ -NPY_NO_EXPORT void -@TYPE@_sin(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_sin(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -557,8 +556,8 @@ NPY_NO_EXPORT void * #scalarf = tanf, tan# */ -NPY_NO_EXPORT void -@TYPE@_tan(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_tan(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -585,8 +584,8 @@ NPY_NO_EXPORT void * #scalarf = acosf, acos# */ -NPY_NO_EXPORT void -@TYPE@_arccos(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_arccos(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -613,8 +612,8 @@ NPY_NO_EXPORT void * #scalarf = asinf, asin# */ -NPY_NO_EXPORT void -@TYPE@_arcsin(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_arcsin(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -641,8 +640,8 @@ NPY_NO_EXPORT void * #scalarf = atanf, atan# */ -NPY_NO_EXPORT void -@TYPE@_arctan(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_arctan(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -669,8 +668,8 @@ NPY_NO_EXPORT void * #scalarf = coshf, cosh# */ -NPY_NO_EXPORT void -@TYPE@_cosh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_cosh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -697,8 +696,8 @@ NPY_NO_EXPORT void * #scalarf = sinhf, sinh# */ -NPY_NO_EXPORT void -@TYPE@_sinh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_sinh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -725,8 +724,8 @@ NPY_NO_EXPORT void * #scalarf = tanhf, tanh# */ -NPY_NO_EXPORT void -@TYPE@_tanh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_tanh(char **args, const 
npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -753,8 +752,8 @@ NPY_NO_EXPORT void * #scalarf = acoshf, acosh# */ -NPY_NO_EXPORT void -@TYPE@_arccosh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_arccosh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -781,8 +780,8 @@ NPY_NO_EXPORT void * #scalarf = asinhf, asinh# */ -NPY_NO_EXPORT void -@TYPE@_arcsinh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_arcsinh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -809,8 +808,8 @@ NPY_NO_EXPORT void * #scalarf = atanhf, atanh# */ -NPY_NO_EXPORT void -@TYPE@_arctanh(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_arctanh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -837,8 +836,8 @@ NPY_NO_EXPORT void * #scalarf = fabsf, fabs# */ -NPY_NO_EXPORT void -@TYPE@_fabs(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_fabs(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP_DISPATCH( DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) @@ -858,8 +857,8 @@ NPY_NO_EXPORT void * #scalarf = floorf, floor# */ -NPY_NO_EXPORT void -@TYPE@_floor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_floor(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(steps[0] == sizeof(@type@) && steps[1] == sizeof(@type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -886,8 +885,8 @@ NPY_NO_EXPORT void * #scalarf = ceilf, ceil# */ -NPY_NO_EXPORT void -@TYPE@_ceil(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_ceil(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -914,8 +913,8 @@ NPY_NO_EXPORT void * #scalarf = rintf, rint# */ -NPY_NO_EXPORT void -@TYPE@_rint(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_rint(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(steps[0] == sizeof(@type@) && steps[1] == sizeof(@type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -942,8 +941,8 @@ NPY_NO_EXPORT void * #scalarf = truncf, trunc# */ -NPY_NO_EXPORT void -@TYPE@_trunc(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_trunc(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -970,8 +969,8 @@ NPY_NO_EXPORT void * #scalarf = cbrtf, cbrt# */ -NPY_NO_EXPORT void -@TYPE@_cbrt(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_cbrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void 
*NPY_UNUSED(func)) { if(IS_UNARY_CONT(@type@, @type@) && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && @@ -1094,8 +1093,8 @@ pairwise_sum_@TYPE@(char *a, npy_intp n, npy_intp stride) * # PW = 1# * # VML = Add# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_BINARY_CONT(@type@, @type@)) { #if @SUPPORTED_BY_VML@ @@ -1262,8 +1261,8 @@ NPY_NO_EXPORT void * # PW = 0# * # VML = Sub# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_BINARY_CONT(@type@, @type@)) { #if @SUPPORTED_BY_VML@ @@ -1430,8 +1429,8 @@ NPY_NO_EXPORT void * # PW = 0# * # VML = Mul# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_BINARY_CONT(@type@, @type@)) { #if @SUPPORTED_BY_VML@ @@ -1598,8 +1597,8 @@ NPY_NO_EXPORT void * # PW = 0# * # VML = Div# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_BINARY_CONT(@type@, @type@)) { #if @SUPPORTED_BY_VML@ @@ -1750,8 +1749,8 @@ NPY_NO_EXPORT void * logical_and, logical_or# * #OP = ==, !=, <, <=, >, >=, &&, ||# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { { BINARY_LOOP { @@ -1763,8 +1762,8 @@ NPY_NO_EXPORT void } /**end repeat1**/ -NPY_NO_EXPORT void -@TYPE@_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_logical_xor(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const int t1 = !!*(@type@ *)ip1; @@ -1773,8 +1772,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_logical_not(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { const @type@ in1 = *(@type@ *)ip1; @@ -1786,8 +1785,8 @@ NPY_NO_EXPORT void * #kind = isnan, isinf, isfinite, signbit# * #func = isnan, isinf, isfinite, signbit# **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { { UNARY_LOOP { @@ -1799,8 +1798,8 @@ NPY_NO_EXPORT void } /**end repeat1**/ -NPY_NO_EXPORT void -@TYPE@_spacing(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_spacing(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { const @type@ in1 = *(@type@ *)ip1; @@ -1808,8 +1807,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_copysign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_copysign(char 
**args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @type@ in1 = *(@type@ *)ip1; @@ -1818,8 +1817,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_nextafter(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_nextafter(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @type@ in1 = *(@type@ *)ip1; @@ -1832,8 +1831,8 @@ NPY_NO_EXPORT void * #kind = maximum, minimum# * #OP = >=, <=# **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { /* */ if (IS_BINARY_REDUCE) { @@ -1863,8 +1862,8 @@ NPY_NO_EXPORT void * #kind = fmax, fmin# * #OP = >=, <=# **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { /* */ if (IS_BINARY_REDUCE) { @@ -1887,19 +1886,8 @@ NPY_NO_EXPORT void } /**end repeat1**/ -NPY_NO_EXPORT void -@TYPE@_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) -{ - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - @type@ mod; - *((@type@ *)op1) = divmod@c@(in1, in2, &mod); - } -} - -NPY_NO_EXPORT void -@TYPE@_remainder(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_remainder(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @type@ in1 = *(@type@ *)ip1; @@ -1908,8 +1896,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_divmod(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_divmod(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP_TWO_OUT { const @type@ in1 = *(@type@ *)ip1; @@ -1918,8 +1906,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +void +mkl_umath_@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)) { #if @SUPPORTED_BY_VML@ if(IS_UNARY_CONT(@type@, @type@) && @@ -1937,8 +1925,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +void +mkl_umath_@TYPE@_reciprocal(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)) { #if @SUPPORTED_BY_VML@ if(IS_UNARY_CONT(@type@, @type@) && @@ -1956,16 +1944,16 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +void +mkl_umath_@TYPE@__ones_like(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)) { OUTPUT_LOOP { *((@type@ *)op1) = 1; } } -NPY_NO_EXPORT void -@TYPE@_conjugate(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { const @type@ in1 = *(@type@ *)ip1; @@ -1973,8 +1961,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, 
void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { #if @SUPPORTED_BY_VML@ if(IS_UNARY_CONT(@type@, @type@) && @@ -1995,8 +1983,8 @@ NPY_NO_EXPORT void feclearexcept(FE_ALL_EXCEPT); /* clear floatstatus */ } -NPY_NO_EXPORT void -@TYPE@_negative(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_negative(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { { UNARY_LOOP { @@ -2006,8 +1994,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_positive(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_positive(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { const @type@ in1 = *(@type@ *)ip1; @@ -2015,8 +2003,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_sign(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { /* Sign of nan is nan */ UNARY_LOOP { @@ -2025,8 +2013,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_modf(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_modf(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP_TWO_OUT { const @type@ in1 = *(@type@ *)ip1; @@ -2034,8 +2022,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_frexp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_frexp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP_TWO_OUT { const @type@ in1 = *(@type@ *)ip1; @@ -2043,8 +2031,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_ldexp(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_ldexp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @type@ in1 = *(@type@ *)ip1; @@ -2053,8 +2041,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_ldexp_long(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_ldexp_long(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { /* * Additional loop to handle npy_long integer inputs (cf. #866, #1633). 
@@ -2083,7 +2071,7 @@ NPY_NO_EXPORT void } } -#define @TYPE@_true_divide @TYPE@_divide +#define mkl_umath_@TYPE@_true_divide mkl_umath_@TYPE@_divide /**end repeat**/ @@ -2200,8 +2188,8 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n, * #OP = +, -# * #PW = 1, 0# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { if (IS_BINARY_REDUCE && @PW@) { npy_intp n = dimensions[0]; @@ -2227,8 +2215,8 @@ NPY_NO_EXPORT void } /**end repeat1**/ -NPY_NO_EXPORT void -@TYPE@_multiply(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2240,8 +2228,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_divide(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2272,33 +2260,12 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_floor_divide(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) -{ - BINARY_LOOP { - const @ftype@ in1r = ((@ftype@ *)ip1)[0]; - const @ftype@ in1i = ((@ftype@ *)ip1)[1]; - const @ftype@ in2r = ((@ftype@ *)ip2)[0]; - const @ftype@ in2i = ((@ftype@ *)ip2)[1]; - if (fabs@c@(in2r) >= fabs@c@(in2i)) { - const @ftype@ rat = in2i/in2r; - ((@ftype@ *)op1)[0] = floor@c@((in1r + in1i*rat)/(in2r + in2i*rat)); - ((@ftype@ *)op1)[1] = 0; - } - else { - const @ftype@ rat = in2r/in2i; - ((@ftype@ *)op1)[0] = floor@c@((in1r*rat + in1i)/(in2i + in2r*rat)); - ((@ftype@ *)op1)[1] = 0; - } - } -} - /**begin repeat1 * #kind= greater, greater_equal, less, less_equal, equal, not_equal# * #OP = CGT, CGE, CLT, CLE, CEQ, CNE# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2315,8 +2282,8 @@ NPY_NO_EXPORT void #OP1 = ||, ||# #OP2 = &&, ||# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2328,8 +2295,8 @@ NPY_NO_EXPORT void } /**end repeat1**/ -NPY_NO_EXPORT void -@TYPE@_logical_xor(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_logical_xor(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2342,8 +2309,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_logical_not(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_logical_not(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2357,8 +2324,8 @@ NPY_NO_EXPORT void * #func = isnan, isinf, isfinite# * #OP = ||, ||, &&# **/ -NPY_NO_EXPORT void 
-@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2369,8 +2336,8 @@ NPY_NO_EXPORT void } /**end repeat1**/ -NPY_NO_EXPORT void -@TYPE@_square(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +void +mkl_umath_@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)) { UNARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2380,8 +2347,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_reciprocal(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +void +mkl_umath_@TYPE@_reciprocal(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)) { UNARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2400,8 +2367,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@__ones_like(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(data)) +void +mkl_umath_@TYPE@__ones_like(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)) { OUTPUT_LOOP { ((@ftype@ *)op1)[0] = 1; @@ -2409,8 +2376,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_conjugate(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { +void +mkl_umath_@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; const @ftype@ in1i = ((@ftype@ *)ip1)[1]; @@ -2419,8 +2386,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_absolute(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { int ignore_fpstatus = 0; @@ -2449,8 +2416,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@__arg(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@__arg(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2459,8 +2426,8 @@ NPY_NO_EXPORT void } } -NPY_NO_EXPORT void -@TYPE@_sign(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_sign(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { /* fixme: sign of nan is currently 0 */ UNARY_LOOP { @@ -2478,8 +2445,8 @@ NPY_NO_EXPORT void * #kind = maximum, minimum# * #OP = CGE, CLE# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2501,8 +2468,8 @@ NPY_NO_EXPORT void * #kind = fmax, fmin# * #OP = CGE, CLE# */ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) +void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { BINARY_LOOP { const @ftype@ in1r = ((@ftype@ *)ip1)[0]; @@ -2522,7 +2489,7 @@ NPY_NO_EXPORT void } /**end repeat1**/ -#define @TYPE@_true_divide @TYPE@_divide +#define mkl_umath_@TYPE@_true_divide mkl_umath_@TYPE@_divide /**end repeat**/ diff 
--git a/mkl_umath/src/mkl_umath_loops.h.src b/mkl_umath/src/mkl_umath_loops.h.src new file mode 100644 index 0000000..70a7e94 --- /dev/null +++ b/mkl_umath/src/mkl_umath_loops.h.src @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2019-2021, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _MKL_UMATH_LOOPS_H_ +#define _MKL_UMATH_LOOPS_H_ + +#include "numpy/ndarraytypes.h" + +#include + +/**begin repeat + * Float types + * #TYPE = FLOAT, DOUBLE# + */ + +extern void +mkl_umath_@TYPE@_sqrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_invsqrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_exp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_exp2(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_expm1(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_erf(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_log(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_log2(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_log10(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_log1p(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_cos(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_sin(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_tan(char **args, const npy_intp *dimensions, const 
npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_arccos(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_arcsin(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_arctan(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_cosh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_sinh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_tanh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_arccosh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_arcsinh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_arctanh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_fabs(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_floor(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_ceil(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_rint(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_trunc(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_cbrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + */ +extern void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * Arithmetic + * # kind = equal, not_equal, less, less_equal, greater, greater_equal, + * logical_and, logical_or# + */ +extern void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +extern void +mkl_umath_@TYPE@_logical_xor(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_logical_not(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind = isnan, isinf, isfinite, signbit# + **/ +extern void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +extern void +mkl_umath_@TYPE@_spacing(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + + +extern void +mkl_umath_@TYPE@_copysign(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_nextafter(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind = maximum, minimum, fmax, fmin# + **/ +extern void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void 
*NPY_UNUSED(func)); +/**end repeat1**/ + +extern void +mkl_umath_@TYPE@_floor_divide(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_remainder(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_divmod(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_reciprocal(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); + +extern void +mkl_umath_@TYPE@__ones_like(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); + +extern void +mkl_umath_@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_negative(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_positive(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_sign(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_modf(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_frexp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_ldexp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_ldexp_long(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +#define mkl_umath_@TYPE@_true_divide mkl_umath_@TYPE@_divide + +/**end repeat**/ + +/* + ***************************************************************************** + ** COMPLEX LOOPS ** + ***************************************************************************** + */ + +#define CGE(xr,xi,yr,yi) (xr > yr || (xr == yr && xi >= yi)); +#define CLE(xr,xi,yr,yi) (xr < yr || (xr == yr && xi <= yi)); +#define CGT(xr,xi,yr,yi) (xr > yr || (xr == yr && xi > yi)); +#define CLT(xr,xi,yr,yi) (xr < yr || (xr == yr && xi < yi)); +#define CEQ(xr,xi,yr,yi) (xr == yr && xi == yi); +#define CNE(xr,xi,yr,yi) (xr != yr || xi != yi); + +/**begin repeat + * complex types + * #TYPE = CFLOAT, CDOUBLE# + */ + +/**begin repeat1 + * arithmetic + * #kind = add, subtract# + */ +extern void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +extern void +mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_divide(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +extern void +mkl_umath_@TYPE@_floor_divide(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + + +/**begin repeat1 + * arithmetic + * #kind = greater, greater_equal, less, less_equal, equal, + not_equal, logical_and, logical_or, logical_xor, logical_not, + isnan, isinf, isfinite# + */ +extern void +mkl_umath_@TYPE@_@kind@(char **args, 
const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +extern void +mkl_umath_@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); + +extern void +mkl_umath_@TYPE@_reciprocal(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); + +extern void +mkl_umath_@TYPE@__ones_like(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); + +extern void +mkl_umath_@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); + +extern void +mkl_umath_@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); + +extern void +mkl_umath_@TYPE@__arg(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); + +extern void +mkl_umath_@TYPE@_sign(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); + +/**begin repeat1 + * arithmetic + * #kind = maximum, minimum, fmax, fmin# + */ +extern void +mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +#define mkl_umath_@TYPE@_true_divide mkl_umath_@TYPE@_divide + +/**end repeat**/ + +#undef CGE +#undef CLE +#undef CGT +#undef CLT +#undef CEQ +#undef CNE + +#endif diff --git a/mkl_umath/tests/test_basic.py b/mkl_umath/tests/test_basic.py index 14e5ded..664d4c8 100644 --- a/mkl_umath/tests/test_basic.py +++ b/mkl_umath/tests/test_basic.py @@ -41,9 +41,9 @@ def get_args(args_str): elif s == 'D': args.append(np.double(np.random.random_sample()) + np.double(np.random.random_sample()) * 1j) elif s == 'i': - args.append(np.int(np.random.randint(low=1, high=10))) + args.append(np.int_(np.random.randint(low=1, high=10))) elif s == 'l': - args.append(np.long(np.random.randint(low=1, high=10))) + args.append(np.longlong(np.random.randint(low=1, high=10))) else: raise ValueError("Unexpected type specified!") return tuple(args) @@ -86,7 +86,7 @@ def get_args(args_str): print("mkl res", mkl_res) print("npy res", np_res) - assert np.array_equal(mkl_res, np_res) + assert np.allclose(mkl_res, np_res) print("Test cases count:", len(test_cases)) print("All looks good!") diff --git a/setup.py b/setup.py index fb9b500..cd1d3b0 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# Copyright (c) 2019-2021, Intel Corporation +# Copyright (c) 2019-2023, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: @@ -24,8 +24,24 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
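# (Editor's overview, not part of the patch itself: the rewritten setup.py
#  below (1) loads mkl_umath/generate_umath.py and writes
#  mkl_umath/src/__umath_generated.c, (2) expands mkl_umath_loops.h.src and
#  mkl_umath_loops.c.src with NumPy's conv_template, and (3) hands the
#  pre-generated C sources to skbuild.setup(), which drives the new
#  top-level CMakeLists.txt.)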
+import importlib.machinery import io +import os import re +from distutils.dep_util import newer +from numpy.distutils.conv_template import process_file as process_c_file +from os import (getcwd, environ, makedirs) +from os import (getcwd, environ, makedirs) +from os.path import join, exists, abspath, dirname +from setuptools import Extension + +import skbuild +import skbuild.setuptools_wrap +import skbuild.utils +from skbuild.command.build_py import build_py as _skbuild_build_py +from skbuild.command.install import install as _skbuild_install + +# import versioneer with io.open('mkl_umath/_version.py', 'rt', encoding='utf8') as f: version = re.search(r'__version__ = \'(.*?)\'', f.read()).group(1) @@ -51,45 +67,92 @@ Operating System :: MacOS """ -def configuration(parent_package='',top_path=None): - from numpy.distutils.misc_util import Configuration - - config = Configuration(None, parent_package, top_path) - config.set_options(ignore_setup_xxx_py=True, - assume_default_configuration=True, - delegate_options_to_subpackages=True, - quiet=True) - - config.add_subpackage('mkl_umath') - - config.version = VERSION - - return config - - -def setup_package(): - from setuptools import setup - from numpy.distutils.core import setup - metadata = dict( - name = 'mkl_umath', - maintainer = "Intel Corp.", - maintainer_email = "scripting@intel.com", - description = "MKL-based universal functions for NumPy arrays", - long_description = """Universal functions for real and complex floating point arrays powered by Intel(R) Math Kernel Library Vector (Intel(R) MKL) and Intel(R) Short Vector Math Library (Intel(R) SVML)""", - url = "http://github.com/IntelPython/mkl_umath", - author = "Intel Corporation", - download_url = "http://github.com/IntelPython/mkl_umath", - license = 'BSD', - classifiers = [_f for _f in CLASSIFIERS.split('\n') if _f], - platforms = ["Windows", "Linux", "Mac OS-X"], - test_suite = 'nose.collector', - python_requires = '>=3.6', - install_requires = ['numpy'], - configuration = configuration - ) - setup(**metadata) - - return None - -if __name__ == '__main__': - setup_package() + +def load_module(name, fn): + """ + Credit: numpy.compat.npy_load_module + """ + return importlib.machinery.SourceFileLoader(name, fn).load_module() + +def separator_join(sep, strs): + """ + Joins non-empty arguments strings with dot. 
+ + Credit: numpy.distutils.misc_util.dot_join + """ + assert isinstance(strs, (list, tuple)) + assert isinstance(sep, str) + return sep.join([si for si in strs if si]) + +pdir = join(dirname(__file__), 'mkl_umath') +wdir = join(pdir, 'src') + +generate_umath_py = join(pdir, 'generate_umath.py') +n = separator_join('_', ('mkl_umath', 'generate_umath')) +generate_umath = load_module(n, generate_umath_py) +del n + +def generate_umath_c(build_dir): + target_dir = join(build_dir, 'src') + target = join(target_dir, '__umath_generated.c') + if not exists(target_dir): + print("Folder {} was expected to exist, but creating".format(target_dir)) + makedirs(target_dir) + script = generate_umath_py + if newer(script, target): + with open(target, 'w') as f: + f.write(generate_umath.make_code(generate_umath.defdict, + generate_umath.__file__)) + return [] + +generate_umath_c(pdir) + +loops_header_templ = join(wdir, "mkl_umath_loops.h.src") +processed_loops_h_fn = join(wdir, "mkl_umath_loops.h") +loops_header_processed = process_c_file(loops_header_templ) + +with open(processed_loops_h_fn, 'w') as fid: + fid.write(loops_header_processed) + +loops_src_templ = join(wdir, "mkl_umath_loops.c.src") +processed_loops_src_fn = join(wdir, "mkl_umath_loops.c") +loops_src_processed = process_c_file(loops_src_templ) + +with open(processed_loops_src_fn, 'w') as fid: + fid.write(loops_src_processed) + +skbuild.setup( + name="mkl_umath", + version=VERSION, + ## cmdclass=_get_cmdclass(), + description = "MKL-based universal functions for NumPy arrays", + long_description = """Universal functions for real and complex floating point arrays powered by Intel(R) Math Kernel Library Vector (Intel(R) MKL) and Intel(R) Short Vector Math Library (Intel(R) SVML)""", + long_description_content_type="text/markdown", + license = 'BSD', + author="Intel Corporation", + url="http://github.com/IntelPython/mkl_umath", + packages=[ + "mkl_umath", + ], + package_data={"mkl_umath": ["tests/*.*", "tests/helper/*.py"]}, + include_package_data=True, + zip_safe=False, + setup_requires=["Cython"], + install_requires=[ + "numpy", + ], + extras_require={ + "docs": [ + "Cython", + "sphinx", + "sphinx_rtd_theme", + "pydot", + "graphviz", + "sphinxcontrib-programoutput", + ], + "coverage": ["Cython", "pytest", "pytest-cov", "coverage", "tomli"], + }, + keywords="mkl_umath", + classifiers=[_f for _f in CLASSIFIERS.split("\n") if _f], + platforms=["Linux", "Windows"] +) diff --git a/mkl_umath/setup.py b/template similarity index 100% rename from mkl_umath/setup.py rename to template From ad035905945667b762fd2b82e452b5e26ead4de7 Mon Sep 17 00:00:00 2001 From: Guzman-ballen Date: Wed, 16 Aug 2023 13:53:16 -0500 Subject: [PATCH 02/38] Add scikit-build support for Windows platform --- CMakeLists.txt | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 87adc4c..9f0b16a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,6 +40,7 @@ if(WIN32) "-Wmissing-declarations " "-Wstrict-prototypes " "-Wno-unused-parameter " + "-Wno-implicit-function-declaration " ) string(CONCAT SDL_FLAGS "/GS " @@ -88,6 +89,10 @@ else() message(FATAL_ERROR "Unsupported system.") endif() +if (WIN32) + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) +endif() + set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) # set_property(GLOBAL PROPERTY GLOBAL_DEPENDS_DEBUG_MODE 1) @@ -96,8 +101,11 @@ set(_linker_options "LINKER:${MKL_UMATH_LDFLAGS}") set(_trgt mkl_umath_loops) add_library(${_trgt} SHARED 
"mkl_umath/src/mkl_umath_loops.c") set_target_properties(${_trgt} PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON) -target_include_directories(${_trgt} PRIVATE "mkl_umath/src/" ${NumPy_INCLUDE_DIR} ${PYTHON_INCLUDE_DIR} ${MKL_INCLUDE_DIR}) +target_include_directories(${_trgt} PRIVATE "mkl_umath/src/" ${NumPy_INCLUDE_DIR} ${PYTHON_INCLUDE_DIR} "C:/Users/aguzmanb/Development/mambaforge/envs/mkl_umath_prefix/Library/include") target_link_libraries(${_trgt} PRIVATE mkl_rt) +if (WIN32) + target_link_directories(${_trgt} PRIVATE "C:/Users/aguzmanb/Development/mambaforge/envs/mkl_umath_prefix/Libs") +endif() target_link_options(${_trgt} PRIVATE ${_linker_options}) install(TARGETS ${_trgt} LIBRARY DESTINATION mkl_umath) From c62b5d7ef8fd5cfec0ec58969ef5a9feb3988c9b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sat, 2 Sep 2023 12:58:19 -0500 Subject: [PATCH 03/38] Remove stray leftover file --- template | 185 ------------------------------------------------------- 1 file changed, 185 deletions(-) delete mode 100644 template diff --git a/template b/template deleted file mode 100644 index 81a77bf..0000000 --- a/template +++ /dev/null @@ -1,185 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2019-2021, Intel Corporation -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of Intel Corporation nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import sys -from os import (getcwd, environ, makedirs) -from os.path import join, exists, abspath, dirname -import importlib.machinery # requires Python >= 3.4 -from distutils.dep_util import newer - -from numpy.distutils.ccompiler import new_compiler -from distutils.sysconfig import customize_compiler -import platform -from numpy import get_include as get_numpy_include -from distutils.sysconfig import get_python_inc as get_python_include - -def ensure_Intel_compiler(): - ccompiler = new_compiler() - customize_compiler(ccompiler) - if hasattr(ccompiler, 'compiler'): - compiler_name = ccompiler.compiler[0] - else: - compiler_name = ccompiler.__class__.__name__ - - assert ('icl' in compiler_name or 'icc' in compiler_name), \ - "Intel(R) C Compiler is required to build mkl_umath, found {}".format(compiler_name) - - -def load_module(name, fn): - """ - Credit: numpy.compat.npy_load_module - """ - return importlib.machinery.SourceFileLoader(name, fn).load_module() - - -def separator_join(sep, strs): - """ - Joins non-empty arguments strings with dot. - - Credit: numpy.distutils.misc_util.dot_join - """ - assert isinstance(strs, (list, tuple)) - assert isinstance(sep, str) - return sep.join([si for si in strs if si]) - - -def configuration(parent_package='',top_path=None): - from numpy.distutils.misc_util import Configuration - from numpy.distutils.system_info import get_info - config = Configuration('mkl_umath', parent_package, top_path) - - mkl_root = environ.get('MKLROOT', None) - if mkl_root: - mkl_info = { - 'include_dirs': [join(mkl_root, 'include')], - 'library_dirs': [join(mkl_root, 'lib'), join(mkl_root, 'lib', 'intel64')], - 'libraries': ['mkl_rt'] - } - else: - mkl_info = get_info('mkl') - - print(mkl_info) - mkl_include_dirs = mkl_info.get('include_dirs', []) - mkl_library_dirs = mkl_info.get('library_dirs', []) - mkl_libraries = mkl_info.get('libraries', ['mkl_rt']) - - pdir = dirname(__file__) - wdir = join(pdir, 'src') - mkl_info = get_info('mkl') - - generate_umath_py = join(pdir, 'generate_umath.py') - n = separator_join('_', (config.name, 'generate_umath')) - generate_umath = load_module(n, generate_umath_py) - del n - - def generate_umath_c(ext, build_dir): - target_dir = join(build_dir, 'src') - target = join(target_dir, '__umath_generated.c') - if not exists(target_dir): - print("Folder {} was expected to exist, but creating".format(target_dir)) - makedirs(target_dir) - script = generate_umath_py - if newer(script, target): - with open(target, 'w') as f: - f.write(generate_umath.make_code(generate_umath.defdict, - generate_umath.__file__)) - config.add_include_dirs(target_dir) - return [] - - sources = [generate_umath_c] - - # ensure_Intel_compiler() - - if platform.system() == "Windows": - eca = ['/fp:fast=2', '/Qimf-precision=high', '/Qprec-sqrt', '/Qstd=c99', '/Qprotect-parens'] - else: - eca = ['-fp-model', 'fast=2', '-fimf-precision=high', '-prec-sqrt', '-fprotect-parens'] - - numpy_include_dir = get_numpy_include() - python_include_dir = get_python_include() - config.add_library( - 'loops_intel', - sources = [ - join(wdir, 'loops_intel.h.src'), - join(wdir, 'loops_intel.c.src'), - ], - include_dirs = [wdir] + mkl_include_dirs + [numpy_include_dir, python_include_dir], - depends = [ - join(wdir, 'blocking_utils.h'), - join(wdir, 'fast_loop_macros.h'), - join(numpy_include_dir, 'numpy', '*object.h'), - join(python_include_dir, "Python.h") - ], - libraries=mkl_libraries, - extra_compiler_args=eca, - macros=getattr(config, 'define_macros', 
getattr(config.get_distribution(), 'define_macros', [])) - ) - - config.add_extension( - name = '_ufuncs', - sources = [ - join(wdir, 'ufuncsmodule.c'), - ] + sources, - depends = [ - join(wdir, 'loops_intel.c.src'), - join(wdir, 'loops_intel.h.src'), - ], - include_dirs = [wdir] + mkl_include_dirs, - libraries = mkl_libraries + ['loops_intel'], - library_dirs = mkl_library_dirs, - extra_compile_args = [ - '-DNDEBUG', - # '-ggdb', '-O0', '-Wall', '-Wextra', '-DDEBUG', - ] - ) - - from Cython.Build import cythonize - from setuptools import Extension - cythonize(Extension('_patch', sources=[join(wdir, 'patch.pyx'),])) - - config.add_extension( - name = '_patch', - sources = [ - join(wdir, 'patch.c'), - ], - libraries = mkl_libraries + ['loops_intel'], - library_dirs = mkl_library_dirs, - extra_compile_args = [ - '-DNDEBUG', - #'-ggdb', '-O0', '-Wall', '-Wextra', '-DDEBUG', - ] - ) - - config.add_data_dir('tests') - -# if have_cython: -# config.ext_modules = cythonize(config.ext_modules, include_path=[pdir, wdir]) - - return config - -if __name__ == '__main__': - from numpy.distutils.core import setup - setup(configuration=configuration) From 0319b439f6b7417acef6a7e64f5472032273f035 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 5 Sep 2023 06:04:05 -0500 Subject: [PATCH 04/38] Add conda-recipe, a GH action workflow --- .github/workflows/conda-package.yml | 251 ++++++++++++++++++++++++++++ build.sh | 18 -- conda-recipe/bld.bat | 25 +++ conda-recipe/build.sh | 23 +++ conda-recipe/meta.yaml | 53 ++++++ conda-recipe/run_tests.bat | 1 + conda-recipe/run_tests.sh | 1 + icpx_for_conda.cfg | 1 - 8 files changed, 354 insertions(+), 19 deletions(-) create mode 100644 .github/workflows/conda-package.yml delete mode 100644 build.sh create mode 100644 conda-recipe/bld.bat create mode 100644 conda-recipe/build.sh create mode 100644 conda-recipe/meta.yaml create mode 100644 conda-recipe/run_tests.bat create mode 100644 conda-recipe/run_tests.sh delete mode 100644 icpx_for_conda.cfg diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml new file mode 100644 index 0000000..b890920 --- /dev/null +++ b/.github/workflows/conda-package.yml @@ -0,0 +1,251 @@ +name: Conda package + +on: push + +env: + PACKAGE_NAME: mkl_umath + MODULE_NAME: mkl_umath + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python: ['3.10'] + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set pkgs_dirs + run: | + echo "pkgs_dirs: [~/.conda/pkgs]" >> ~/.condarc + - name: Cache conda packages + uses: actions/cache@v3 + env: + CACHE_NUMBER: 0 # Increase to reset cache + with: + path: ~/.conda/pkgs + key: + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-python-${{ matrix.python }}-${{hashFiles('**/meta.yaml') }} + restore-keys: | + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-python-${{ matrix.python }}- + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}- + + - name: Add conda to system path + run: echo $CONDA/bin >> $GITHUB_PATH + - name: Install conda-build + run: conda install conda-build + - name: Build conda package + run: | + CHANNELS="-c conda-forge -c intel --override-channels" + VERSIONS="--python ${{ matrix.python }}" + TEST="--no-test" + + conda build \ + $TEST \ + $VERSIONS \ + $CHANNELS \ + conda-recipe + - name: Upload artifact + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }} + path: /usr/share/miniconda/conda-bld/linux-64/${{ env.PACKAGE_NAME }}-*.tar.bz2 + + test: + needs: 
build + runs-on: ${{ matrix.runner }} + + strategy: + matrix: + python: ['3.10'] + experimental: [false] + runner: [ubuntu-latest] + continue-on-error: ${{ matrix.experimental }} + env: + CHANNELS: -c intel -c main --override-channels + + steps: + - name: Download artifact + uses: actions/download-artifact@v3 + with: + name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }} + - name: Add conda to system path + run: echo $CONDA/bin >> $GITHUB_PATH + - name: Install conda-build + run: conda install conda-build + - name: Create conda channel + run: | + mkdir -p $GITHUB_WORKSPACE/channel/linux-64 + mv ${PACKAGE_NAME}-*.tar.bz2 $GITHUB_WORKSPACE/channel/linux-64 + conda index $GITHUB_WORKSPACE/channel + # Test channel + conda search $PACKAGE_NAME -c $GITHUB_WORKSPACE/channel --override-channels + + - name: Collect dependencies + run: | + CHANNELS="-c $GITHUB_WORKSPACE/channel ${{ env.CHANNELS }}" + conda create -n test_mkl_umath $PACKAGE_NAME python=${{ matrix.python }} $CHANNELS --only-deps --dry-run > lockfile + - name: Display lockfile + run: cat lockfile + - name: Set pkgs_dirs + run: | + echo "pkgs_dirs: [~/.conda/pkgs]" >> ~/.condarc + - name: Cache conda packages + uses: actions/cache@v3 + env: + CACHE_NUMBER: 0 # Increase to reset cache + with: + path: ~/.conda/pkgs + key: + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-python-${{ matrix.python }}-${{hashFiles('lockfile') }} + restore-keys: | + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-python-${{ matrix.python }}- + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}- + + - name: Install mkl_umath + run: | + CHANNELS="-c $GITHUB_WORKSPACE/channel ${{ env.CHANNELS }}" + conda create -n test_mkl_umath python=${{ matrix.python }} $PACKAGE_NAME pytest $CHANNELS + # Test installed packages + conda list -n test_mkl_umath + - name: Run tests + run: | + source $CONDA/etc/profile.d/conda.sh + conda activate test_mkl_umath + python -c "import mkl_umath, numpy as np; mkl_umath.use_in_numpy(); np.sin(np.linspace(0, 1, num=10**6));" + + build_windows: + runs-on: windows-latest + + strategy: + matrix: + python: ['3.10'] + env: + conda-bld: C:\Miniconda\conda-bld\win-64\ + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + - uses: conda-incubator/setup-miniconda@v2 + with: + auto-activate-base: true + conda-build-version: "*" + activate-environment: true + python-version: ${{ matrix.python }} + + - name: Cache conda packages + uses: actions/cache@v3 + env: + CACHE_NUMBER: 3 # Increase to reset cache + with: + path: /home/runner/conda_pkgs_dir + key: + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-python-${{ matrix.python }}-${{hashFiles('**/meta.yaml') }} + restore-keys: | + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-python-${{ matrix.python }}- + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}- + - name: Build conda package + run: conda build --no-test --python ${{ matrix.python }} -c intel -c conda-forge --override-channels conda-recipe + - name: Upload artifact + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }} + path: ${{ env.conda-bld }}${{ env.PACKAGE_NAME }}-*.tar.bz2 + + test_windows: + needs: build_windows + runs-on: ${{ matrix.runner }} + defaults: + run: + shell: cmd /C CALL {0} + strategy: + matrix: + python: ['3.10'] + experimental: [false] + runner: [windows-latest] + continue-on-error: ${{ matrix.experimental }} + env: + workdir: '${{ github.workspace }}' + CHANNELS: -c intel -c conda-forge --override-channels + + steps: + - name: 
Download artifact + uses: actions/download-artifact@v3 + with: + name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }} + - uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + conda-build-version: '*' + miniconda-version: 'latest' + activate-environment: mkl_umath_test + python-version: ${{ matrix.python }} + - name: Create conda channel with the artifact bit + shell: cmd /C CALL {0} + run: | + echo ${{ env.workdir }} + mkdir ${{ env.workdir }}\channel\win-64 + move ${{ env.PACKAGE_NAME }}-*.tar.bz2 ${{ env.workdir }}\channel\win-64 + dir ${{ env.workdir }}\channel\win-64 + - name: Index the channel + shell: cmd /C CALL {0} + run: conda index ${{ env.workdir }}\channel + + - name: Dump mkl_umath version info from created channel into ver.json + shell: cmd /C CALL {0} + run: | + conda search ${{ env.PACKAGE_NAME }} -c ${{ env.workdir }}/channel --override-channels --info --json > ${{ env.workdir }}\ver.json + - name: Output content of produced ver.json + shell: pwsh + run: Get-Content -Path ${{ env.workdir }}\ver.json + - name: Collect dependencies + shell: cmd /C CALL {0} + run: | + IF NOT EXIST ver.json ( + copy /Y ${{ env.workdir }}\ver.json . + ) + SET "SCRIPT=%VER_SCRIPT1% %VER_SCRIPT2%" + FOR /F "tokens=* USEBACKQ" %%F IN (`python -c "%SCRIPT%"`) DO ( + SET PACKAGE_VERSION=%%F + ) + conda install -n mkl_umath_test ${{ env.PACKAGE_NAME }}=%PACKAGE_VERSION% python=${{ matrix.python }} -c ${{ env.workdir }}/channel ${{ env.CHANNELS }} --only-deps --dry-run > lockfile + - name: Display lockfile content + shell: pwsh + run: Get-Content -Path .\lockfile + - name: Cache conda packages + uses: actions/cache@v3 + env: + CACHE_NUMBER: 0 # Increase to reset cache + with: + path: /home/runner/conda_pkgs_dir + key: + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-python-${{ matrix.python }}-${{hashFiles('lockfile') }} + restore-keys: | + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-python-${{ matrix.python }}- + ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}- + - name: Install mkl_umath + shell: cmd /C CALL {0} + run: | + @ECHO ON + IF NOT EXIST ver.json ( + copy /Y ${{ env.workdir }}\ver.json . 
+ ) + set "SCRIPT=%VER_SCRIPT1% %VER_SCRIPT2%" + FOR /F "tokens=* USEBACKQ" %%F IN (`python -c "%SCRIPT%"`) DO ( + SET PACKAGE_VERSION=%%F + ) + SET "TEST_DEPENDENCIES=pytest pytest-cov" + conda install -n mkl_umath_test ${{ env.PACKAGE_NAME }}=%PACKAGE_VERSION% %TEST_DEPENDENCIES% python=${{ matrix.python }} -c ${{ env.workdir }}/channel ${{ env.CHANNELS }} + - name: Report content of test environment + shell: cmd /C CALL {0} + run: | + echo "Value of CONDA enviroment variable was: " %CONDA% + echo "Value of CONDA_PREFIX enviroment variable was: " %CONDA_PREFIX% + conda info && conda list -n mkl_umath_test + - name: Run tests + shell: cmd /C CALL {0} + run: >- + conda activate mkl_umath_test && python -c "import mkl_umath, numpy as np; mkl_umath.use_in_numpy(); np.sin(np.linspace(0, 1, num=10**6));" + diff --git a/build.sh b/build.sh deleted file mode 100644 index bd34337..0000000 --- a/build.sh +++ /dev/null @@ -1,18 +0,0 @@ -# This is necessary to help DPC++ find Intel libraries such as SVML, IRNG, etc in build prefix -export BUILD_PREFIX=$CONDA_PREFIX -export HOST=x86_64-conda-linux-gnu -export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${BUILD_PREFIX}/lib" - -# Intel LLVM must cooperate with compiler and sysroot from conda -echo "--gcc-toolchain=${BUILD_PREFIX} --sysroot=${BUILD_PREFIX}/${HOST}/sysroot -target ${HOST}" > icpx_for_conda.cfg -export ICPXCFG="$(pwd)/icpx_for_conda.cfg" -export ICXCFG="$(pwd)/icpx_for_conda.cfg" - -# if [ -e "_skbuild" ]; then -# python setup.py clean --all -# fi - -export CMAKE_GENERATOR="Ninja" -SKBUILD_ARGS="-- -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON" -echo "python setup.py install ${SKBUILD_ARGS}" -python setup.py install ${SKBUILD_ARGS} diff --git a/conda-recipe/bld.bat b/conda-recipe/bld.bat new file mode 100644 index 0000000..e27318d --- /dev/null +++ b/conda-recipe/bld.bat @@ -0,0 +1,25 @@ +REM A workaround for activate-dpcpp.bat issue to be addressed in 2021.4 +set "LIB=%BUILD_PREFIX%\Library\lib;%BUILD_PREFIX%\compiler\lib;%LIB%" +set "INCLUDE=%BUILD_PREFIX%\include;%INCLUDE%" + +"%PYTHON%" setup.py clean --all +set "SKBUILD_ARGS=-G Ninja -- -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON" + +FOR %%V IN (14.0.0 14 15.0.0 15 16.0.0 16 17.0.0 17) DO @( + REM set DIR_HINT if directory exists + IF EXIST "%BUILD_PREFIX%\Library\lib\clang\%%V\" ( + SET "SYCL_INCLUDE_DIR_HINT=%BUILD_PREFIX%\Library\lib\clang\%%V" + ) +) + +if NOT "%WHEELS_OUTPUT_FOLDER%"=="" ( + rem Install and assemble wheel package from the build bits + "%PYTHON%" setup.py install bdist_wheel %SKBUILD_ARGS% + if errorlevel 1 exit 1 + copy dist\mkl_umath*.whl %WHEELS_OUTPUT_FOLDER% + if errorlevel 1 exit 1 +) ELSE ( + rem Only install + "%PYTHON%" setup.py install %SKBUILD_ARGS% + if errorlevel 1 exit 1 +) diff --git a/conda-recipe/build.sh b/conda-recipe/build.sh new file mode 100644 index 0000000..fc4459c --- /dev/null +++ b/conda-recipe/build.sh @@ -0,0 +1,23 @@ +# This is necessary to help DPC++ find Intel libraries such as SVML, IRNG, etc in build prefix +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${BUILD_PREFIX}/lib" + +# Intel LLVM must cooperate with compiler and sysroot from conda +echo "--gcc-toolchain=${BUILD_PREFIX} --sysroot=${BUILD_PREFIX}/${HOST}/sysroot -target ${HOST}" > icx_for_conda.cfg +export ICXCFG="$(pwd)/icx_for_conda.cfg" + +export CMAKE_GENERATOR="Ninja" +SKBUILD_ARGS="-- -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON" + +if [ -n "${WHEELS_OUTPUT_FOLDER}" ]; then + # Install packages and assemble wheel package from built 
bits + if [ "$CONDA_PY" == "36" ]; then + WHEELS_BUILD_ARGS="-p manylinux1_x86_64" + else + WHEELS_BUILD_ARGS="-p manylinux2014_x86_64" + fi + ${PYTHON} setup.py install bdist_wheel ${WHEELS_BUILD_ARGS} ${SKBUILD_ARGS} + cp dist/mkl_umath*.whl ${WHEELS_OUTPUT_FOLDER} +else + # Perform regular install + ${PYTHON} setup.py install ${SKBUILD_ARGS} +fi diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml new file mode 100644 index 0000000..9f98829 --- /dev/null +++ b/conda-recipe/meta.yaml @@ -0,0 +1,53 @@ +{% set version = "0.1.2" %} +{% set buildnumber = 0 %} + +package: + name: mkl_umath + version: {{ version }} + +source: + path: ../ + +build: + number: {{ buildnumber }} + ignore_run_exports: + - blas + +requirements: + build: + - {{ compiler('c') }} + - {{ compiler('cxx') }} + - {{ compiler('dpcpp') }} >=2023.2 # [not osx] + - sysroot_linux-64 >=2.28 # [linux] + host: + - setuptools + - cmake + - ninja + - git + - cython + - scikit-build + - python + - mkl-devel + - numpy-base + run: + - python + - mkl + - mkl-service + - {{ pin_compatible('intel-cmplr-lib-rt') }} + - {{ pin_compatible('numpy') }} + +test: + source_files: + - mkl_umath/tests/test_basic.py + commands: + - python mkl_umath/tests/test_basic.py + imports: + - mkl_umath + - mkl_umath._ufuncs + - mkl_umath._patch + +about: + home: http://github.com/IntelPython/mkl_umath + license: BSD-3 + license_file: LICENSE.txt + summary: Universal functions for real and complex floating point arrays powered by Intel(R) Math Kernel Library Vector (Intel(R) MKL) and Intel(R) Short Vector Math Library (Intel(R) SVML) diff --git a/conda-recipe/run_tests.bat b/conda-recipe/run_tests.bat new file mode 100644 index 0000000..590db89 --- /dev/null +++ b/conda-recipe/run_tests.bat @@ -0,0 +1 @@ +%PYTHON% tests\test_basic.py \ No newline at end of file diff --git a/conda-recipe/run_tests.sh b/conda-recipe/run_tests.sh new file mode 100644 index 0000000..7bfca5d --- /dev/null +++ b/conda-recipe/run_tests.sh @@ -0,0 +1 @@ +$PYTHON tests/test_basic.py diff --git a/icpx_for_conda.cfg b/icpx_for_conda.cfg deleted file mode 100644 index d828bd2..0000000 --- a/icpx_for_conda.cfg +++ /dev/null @@ -1 +0,0 @@ ---gcc-toolchain=/localdisk/work/aguzmanb/Development/miniconda3.py310/envs/numpy_umath_prefix.v6 --sysroot=/localdisk/work/aguzmanb/Development/miniconda3.py310/envs/numpy_umath_prefix.v6/x86_64-conda-linux-gnu/sysroot -target x86_64-conda-linux-gnu From 0350a29aee9674993111f8fd1c9410f3d7fe578b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 5 Sep 2023 06:20:32 -0500 Subject: [PATCH 05/38] Updated instructions to build from source using ICX --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a9f571c..006aa88 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ Where `` should be the latest version from https://anaconda.org/i Intel(R) C compiler and Intel(R) Math Kernel Library are required to build `mkl_umath` from source: ```sh -# ensure that MKL is installed, icc is activated +# ensure that MKL is installed into Python prefix, Intel LLVM compiler is activated export MKLROOT=$CONDA_PREFIX -python setup.py config_cc --compiler=intelem build_ext --inplace +CC=icx pip install --no-build-isolation --no-deps -e . 
``` From c2bbcd30c7137c635fe8a9023712aefab54c7f25 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 5 Sep 2023 07:22:06 -0500 Subject: [PATCH 06/38] Try using /FORCE:UNRESOLVED for MSVC linker --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9f0b16a..2314b6f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,7 +50,7 @@ if(WIN32) set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG" ) - set(MKL_UMATH_LDFLAGS "/NXCompat;/DynamicBase") + set(MKL_UMATH_LDFLAGS "/NXCompat;/DynamicBase;/FORCE:UNRESOLVED") elseif(UNIX) string(CONCAT WARNING_FLAGS "-Wall " From ef16268d64b669688f5a3ca17e6c5bfaf80c4af2 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 5 Sep 2023 08:18:29 -0500 Subject: [PATCH 07/38] Use multiple linker options --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2314b6f..7ad25d2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,7 +50,7 @@ if(WIN32) set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG" ) - set(MKL_UMATH_LDFLAGS "/NXCompat;/DynamicBase;/FORCE:UNRESOLVED") + set(MKL_UMATH_LINKER_OPTIONS "LINKER:/NXCompat;LINKER:/DynamicBase;LINKER:/FORCE:UNRESOLVED") elseif(UNIX) string(CONCAT WARNING_FLAGS "-Wall " @@ -84,7 +84,7 @@ elseif(UNIX) "${CMAKE_C_FLAGS_DEBUG} ${CFLAGS} -O0 -g1 -DDEBUG" ) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-incompatible-function-pointer-types ${CFLAGS}") - set(MKL_UMATH_LDFLAGS "-z,noexecstack,-z,relro,-z,now") + set(MKL_UMATH_LINKER_OPTIONS "LINKER:-z,noexecstack,-z,relro,-z,now") else() message(FATAL_ERROR "Unsupported system.") endif() @@ -96,7 +96,7 @@ endif() set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) # set_property(GLOBAL PROPERTY GLOBAL_DEPENDS_DEBUG_MODE 1) -set(_linker_options "LINKER:${MKL_UMATH_LDFLAGS}") +set(_linker_options ${MKL_UMATH_LINKER_OPTIONS}) set(_trgt mkl_umath_loops) add_library(${_trgt} SHARED "mkl_umath/src/mkl_umath_loops.c") From fd833e5c563a2325ab138af4c6b4df30549fab63 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 5 Sep 2023 08:29:01 -0500 Subject: [PATCH 08/38] Updated copyright year to 2023 --- mkl_umath/__init__.py | 2 +- mkl_umath/generate_umath.py | 2 +- mkl_umath/src/mkl_umath_loops.c.src | 2 +- mkl_umath/src/mkl_umath_loops.h.src | 2 +- mkl_umath/tests/test_basic.py | 2 +- mkl_umath/ufunc_docstrings.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mkl_umath/__init__.py b/mkl_umath/__init__.py index 92960ad..a6e2927 100644 --- a/mkl_umath/__init__.py +++ b/mkl_umath/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, Intel Corporation +# Copyright (c) 2019-2023, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: diff --git a/mkl_umath/generate_umath.py b/mkl_umath/generate_umath.py index cc2034f..e6609ab 100644 --- a/mkl_umath/generate_umath.py +++ b/mkl_umath/generate_umath.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, Intel Corporation +# Copyright (c) 2019-2023, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: diff --git a/mkl_umath/src/mkl_umath_loops.c.src b/mkl_umath/src/mkl_umath_loops.c.src index b5cbbaf..be3e8d3 100644 --- 
a/mkl_umath/src/mkl_umath_loops.c.src +++ b/mkl_umath/src/mkl_umath_loops.c.src @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, Intel Corporation + * Copyright (c) 2019-2023, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/mkl_umath/src/mkl_umath_loops.h.src b/mkl_umath/src/mkl_umath_loops.h.src index 70a7e94..7dccf0a 100644 --- a/mkl_umath/src/mkl_umath_loops.h.src +++ b/mkl_umath/src/mkl_umath_loops.h.src @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, Intel Corporation + * Copyright (c) 2019-2023, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: diff --git a/mkl_umath/tests/test_basic.py b/mkl_umath/tests/test_basic.py index 664d4c8..1a9fc53 100644 --- a/mkl_umath/tests/test_basic.py +++ b/mkl_umath/tests/test_basic.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, Intel Corporation +# Copyright (c) 2019-2023, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: diff --git a/mkl_umath/ufunc_docstrings.py b/mkl_umath/ufunc_docstrings.py index 5abc3af..79877e2 100644 --- a/mkl_umath/ufunc_docstrings.py +++ b/mkl_umath/ufunc_docstrings.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, Intel Corporation +# Copyright (c) 2019-2023, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: From 589046defc55af641d1259f388ab5db7509c616f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 5 Sep 2023 08:59:32 -0500 Subject: [PATCH 09/38] Link mkl_umath_loops to Python lib --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ad25d2..e375544 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,7 +50,7 @@ if(WIN32) set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG" ) - set(MKL_UMATH_LINKER_OPTIONS "LINKER:/NXCompat;LINKER:/DynamicBase;LINKER:/FORCE:UNRESOLVED") + set(MKL_UMATH_LINKER_OPTIONS "LINKER:/NXCompat;LINKER:/DynamicBase") elseif(UNIX) string(CONCAT WARNING_FLAGS "-Wall " @@ -102,7 +102,7 @@ set(_trgt mkl_umath_loops) add_library(${_trgt} SHARED "mkl_umath/src/mkl_umath_loops.c") set_target_properties(${_trgt} PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON) target_include_directories(${_trgt} PRIVATE "mkl_umath/src/" ${NumPy_INCLUDE_DIR} ${PYTHON_INCLUDE_DIR} "C:/Users/aguzmanb/Development/mambaforge/envs/mkl_umath_prefix/Library/include") -target_link_libraries(${_trgt} PRIVATE mkl_rt) +target_link_libraries(${_trgt} PRIVATE mkl_rt ${Python_LIBRARIES}) if (WIN32) target_link_directories(${_trgt} PRIVATE "C:/Users/aguzmanb/Development/mambaforge/envs/mkl_umath_prefix/Libs") endif() From e80e7108d4d424ec1e13f092ac68b2a482f257b7 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 5 Sep 2023 09:29:43 -0500 Subject: [PATCH 10/38] No need to export all symbols --- CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e375544..f2cb2e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,10 +89,6 @@ else() message(FATAL_ERROR "Unsupported system.") endif() -if (WIN32) - set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) -endif() - 
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) # set_property(GLOBAL PROPERTY GLOBAL_DEPENDS_DEBUG_MODE 1) From effe815b51db631d531731a1652386543487cfdb Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 5 Sep 2023 10:07:23 -0500 Subject: [PATCH 11/38] Removed stray hard path --- CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f2cb2e0..7c2813a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,9 +99,6 @@ add_library(${_trgt} SHARED "mkl_umath/src/mkl_umath_loops.c") set_target_properties(${_trgt} PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON) target_include_directories(${_trgt} PRIVATE "mkl_umath/src/" ${NumPy_INCLUDE_DIR} ${PYTHON_INCLUDE_DIR} "C:/Users/aguzmanb/Development/mambaforge/envs/mkl_umath_prefix/Library/include") target_link_libraries(${_trgt} PRIVATE mkl_rt ${Python_LIBRARIES}) -if (WIN32) - target_link_directories(${_trgt} PRIVATE "C:/Users/aguzmanb/Development/mambaforge/envs/mkl_umath_prefix/Libs") -endif() target_link_options(${_trgt} PRIVATE ${_linker_options}) install(TARGETS ${_trgt} LIBRARY DESTINATION mkl_umath) From bb87659942e8acf77c263f91503a9f9c5be42993 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 5 Sep 2023 10:07:30 -0500 Subject: [PATCH 12/38] Ensure symbols are property annotated for export --- mkl_umath/src/mkl_umath_loops.h.src | 205 +++++++++++++++++++--------- 1 file changed, 140 insertions(+), 65 deletions(-) diff --git a/mkl_umath/src/mkl_umath_loops.h.src b/mkl_umath/src/mkl_umath_loops.h.src index 7dccf0a..c643c20 100644 --- a/mkl_umath/src/mkl_umath_loops.h.src +++ b/mkl_umath/src/mkl_umath_loops.h.src @@ -32,100 +32,139 @@ #include +#ifdef _WIN32 +#ifdef mkl_umath_loops_EXPORTS +#define MKL_UMATH_API __declspec(dllexport) +#else +#define MKL_UMATH_API __declspec(dllimport) +#endif +#else +#define MKL_UMATH_API +#endif + /**begin repeat * Float types * #TYPE = FLOAT, DOUBLE# */ -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_sqrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_invsqrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_exp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_exp2(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_expm1(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_erf(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_log(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_log2(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_log10(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_log1p(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_cos(char **args, const npy_intp *dimensions, const npy_intp *steps, void 
*NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_sin(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_tan(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_arccos(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_arcsin(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_arctan(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_cosh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_sinh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_tanh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_arccosh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_arcsinh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_arctanh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_fabs(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_floor(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_ceil(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_rint(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_trunc(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_cbrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); /**begin repeat1 * Arithmetic * # kind = add, subtract, multiply, divide# */ -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); /**end repeat1**/ @@ -134,83 +173,106 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp * # kind = equal, not_equal, less, less_equal, greater, greater_equal, * logical_and, logical_or# */ -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); /**end repeat1**/ -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_logical_xor(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_logical_not(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); /**begin repeat1 * #kind = isnan, isinf, isfinite, signbit# **/ -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_@kind@(char **args, 
const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); /**end repeat1**/ -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_spacing(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_copysign(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_nextafter(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); /**begin repeat1 * #kind = maximum, minimum, fmax, fmin# **/ -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); /**end repeat1**/ -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_floor_divide(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_remainder(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_divmod(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_reciprocal(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@__ones_like(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_negative(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_positive(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_sign(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_modf(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_frexp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_ldexp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_ldexp_long(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); #define mkl_umath_@TYPE@_true_divide mkl_umath_@TYPE@_divide @@ -239,17 +301,21 @@ mkl_umath_@TYPE@_ldexp_long(char **args, const npy_intp *dimensions, const npy_i * arithmetic * #kind = add, subtract# */ -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); /**end repeat1**/ -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_multiply(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void 
mkl_umath_@TYPE@_divide(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_floor_divide(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); @@ -259,36 +325,45 @@ mkl_umath_@TYPE@_floor_divide(char **args, const npy_intp *dimensions, const npy not_equal, logical_and, logical_or, logical_xor, logical_not, isnan, isinf, isfinite# */ -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); /**end repeat1**/ -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_reciprocal(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@__ones_like(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@__arg(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_sign(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); /**begin repeat1 * arithmetic * #kind = maximum, minimum, fmax, fmin# */ -extern void +MKL_UMATH_API +void mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); /**end repeat1**/ From e3db203b4718081a3ebf9324d65f30e46fed5eb1 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 5 Sep 2023 12:53:15 -0500 Subject: [PATCH 13/38] Specify ARCHIVE/RUNTIME/LIBRARY destinations for mkl_umath on Windows --- CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c2813a..edef098 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,7 +100,11 @@ set_target_properties(${_trgt} PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON) target_include_directories(${_trgt} PRIVATE "mkl_umath/src/" ${NumPy_INCLUDE_DIR} ${PYTHON_INCLUDE_DIR} "C:/Users/aguzmanb/Development/mambaforge/envs/mkl_umath_prefix/Library/include") target_link_libraries(${_trgt} PRIVATE mkl_rt ${Python_LIBRARIES}) target_link_options(${_trgt} PRIVATE ${_linker_options}) -install(TARGETS ${_trgt} LIBRARY DESTINATION mkl_umath) +install(TARGETS ${_trgt} + LIBRARY DESTINATION mkl_umath + ARCHIVE DESTINATION mkl_umath + RUNTIME DESTINATION mkl_umath +) add_library(_ufuncs MODULE "mkl_umath/src/ufuncsmodule.c" "mkl_umath/src/__umath_generated.c") target_include_directories(_ufuncs PRIVATE "mkl_umath/src" ${NumPy_INCLUDE_DIR} ${MKL_INCLUDE_DIR}) From fe29ae613e1c9bc9078634b85dd2178e402b163a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 11 Sep 2023 04:37:45 -0500 Subject: [PATCH 14/38] Use vendored copy of conv_template script --- _vendored/README.md | 5 + _vendored/__init__.py | 1 + _vendored/conv_template.py | 329 +++++++++++++++++++++++++++++++++++++ setup.py | 2 +- 4 files changed, 336 insertions(+), 1 deletion(-) create mode 100644 _vendored/README.md create mode 100644 
_vendored/__init__.py create mode 100644 _vendored/conv_template.py diff --git a/_vendored/README.md b/_vendored/README.md new file mode 100644 index 0000000..0ebafcb --- /dev/null +++ b/_vendored/README.md @@ -0,0 +1,5 @@ +## Vendored files + +File `conv_template.py` is copied from NumPy's numpy/distutils folder, since +`numpy.distutils` is absent from the installation layout starting with +Python 3.12 \ No newline at end of file diff --git a/_vendored/__init__.py b/_vendored/__init__.py new file mode 100644 index 0000000..fa81ada --- /dev/null +++ b/_vendored/__init__.py @@ -0,0 +1 @@ +# empty file diff --git a/_vendored/conv_template.py b/_vendored/conv_template.py new file mode 100644 index 0000000..c8933d1 --- /dev/null +++ b/_vendored/conv_template.py @@ -0,0 +1,329 @@ +#!/usr/bin/env python3 +""" +takes templated file .xxx.src and produces .xxx file where .xxx is +.i or .c or .h, using the following template rules + +/**begin repeat -- on a line by itself marks the start of a repeated code + segment +/**end repeat**/ -- on a line by itself marks it's end + +After the /**begin repeat and before the */, all the named templates are placed +these should all have the same number of replacements + +Repeat blocks can be nested, with each nested block labeled with its depth, +i.e. +/**begin repeat1 + *.... + */ +/**end repeat1**/ + +When using nested loops, you can optionally exclude particular +combinations of the variables using (inside the comment portion of the inner loop): + + :exclude: var1=value1, var2=value2, ... + +This will exclude the pattern where var1 is value1 and var2 is value2 when +the result is being generated. + + +In the main body each replace will use one entry from the list of named replacements + + Note that all #..# forms in a block must have the same number of + comma-separated entries. + +Example: + + An input file containing + + /**begin repeat + * #a = 1,2,3# + * #b = 1,2,3# + */ + + /**begin repeat1 + * #c = ted, jim# + */ + @a@, @b@, @c@ + /**end repeat1**/ + + /**end repeat**/ + + produces + + line 1 "template.c.src" + + /* + ********************************************************************* + ** This file was autogenerated from a template DO NOT EDIT!!** + ** Changes should be made to the original source (.src) file ** + ********************************************************************* + */ + + #line 9 + 1, 1, ted + + #line 9 + 1, 1, jim + + #line 9 + 2, 2, ted + + #line 9 + 2, 2, jim + + #line 9 + 3, 3, ted + + #line 9 + 3, 3, jim + +""" + +__all__ = ['process_str', 'process_file'] + +import os +import sys +import re + +# names for replacement that are already global. +global_names = {} + +# header placed at the front of head processed file +header =\ +""" +/* + ***************************************************************************** + ** This file was autogenerated from a template DO NOT EDIT!!!! ** + ** Changes should be made to the original source (.src) file ** + ***************************************************************************** + */ + +""" +# Parse string for repeat loops +def parse_structure(astr, level): + """ + The returned line number is from the beginning of the string, starting + at zero. Returns an empty list if no loops found. 
+ + """ + if level == 0 : + loopbeg = "/**begin repeat" + loopend = "/**end repeat**/" + else : + loopbeg = "/**begin repeat%d" % level + loopend = "/**end repeat%d**/" % level + + ind = 0 + line = 0 + spanlist = [] + while True: + start = astr.find(loopbeg, ind) + if start == -1: + break + start2 = astr.find("*/", start) + start2 = astr.find("\n", start2) + fini1 = astr.find(loopend, start2) + fini2 = astr.find("\n", fini1) + line += astr.count("\n", ind, start2+1) + spanlist.append((start, start2+1, fini1, fini2+1, line)) + line += astr.count("\n", start2+1, fini2) + ind = fini2 + spanlist.sort() + return spanlist + + +def paren_repl(obj): + torep = obj.group(1) + numrep = obj.group(2) + return ','.join([torep]*int(numrep)) + +parenrep = re.compile(r"\(([^)]*)\)\*(\d+)") +plainrep = re.compile(r"([^*]+)\*(\d+)") +def parse_values(astr): + # replaces all occurrences of '(a,b,c)*4' in astr + # with 'a,b,c,a,b,c,a,b,c,a,b,c'. Empty braces generate + # empty values, i.e., ()*4 yields ',,,'. The result is + # split at ',' and a list of values returned. + astr = parenrep.sub(paren_repl, astr) + # replaces occurrences of xxx*3 with xxx, xxx, xxx + astr = ','.join([plainrep.sub(paren_repl, x.strip()) + for x in astr.split(',')]) + return astr.split(',') + + +stripast = re.compile(r"\n\s*\*?") +named_re = re.compile(r"#\s*(\w*)\s*=([^#]*)#") +exclude_vars_re = re.compile(r"(\w*)=(\w*)") +exclude_re = re.compile(":exclude:") +def parse_loop_header(loophead) : + """Find all named replacements in the header + + Returns a list of dictionaries, one for each loop iteration, + where each key is a name to be substituted and the corresponding + value is the replacement string. + + Also return a list of exclusions. The exclusions are dictionaries + of key value pairs. There can be more than one exclusion. + [{'var1':'value1', 'var2', 'value2'[,...]}, ...] + + """ + # Strip out '\n' and leading '*', if any, in continuation lines. + # This should not effect code previous to this change as + # continuation lines were not allowed. 
+ loophead = stripast.sub("", loophead) + # parse out the names and lists of values + names = [] + reps = named_re.findall(loophead) + nsub = None + for rep in reps: + name = rep[0] + vals = parse_values(rep[1]) + size = len(vals) + if nsub is None : + nsub = size + elif nsub != size : + msg = "Mismatch in number of values, %d != %d\n%s = %s" + raise ValueError(msg % (nsub, size, name, vals)) + names.append((name, vals)) + + + # Find any exclude variables + excludes = [] + + for obj in exclude_re.finditer(loophead): + span = obj.span() + # find next newline + endline = loophead.find('\n', span[1]) + substr = loophead[span[1]:endline] + ex_names = exclude_vars_re.findall(substr) + excludes.append(dict(ex_names)) + + # generate list of dictionaries, one for each template iteration + dlist = [] + if nsub is None : + raise ValueError("No substitution variables found") + for i in range(nsub): + tmp = {name: vals[i] for name, vals in names} + dlist.append(tmp) + return dlist + +replace_re = re.compile(r"@(\w+)@") +def parse_string(astr, env, level, line) : + lineno = "#line %d\n" % line + + # local function for string replacement, uses env + def replace(match): + name = match.group(1) + try : + val = env[name] + except KeyError: + msg = 'line %d: no definition of key "%s"'%(line, name) + raise ValueError(msg) from None + return val + + code = [lineno] + struct = parse_structure(astr, level) + if struct : + # recurse over inner loops + oldend = 0 + newlevel = level + 1 + for sub in struct: + pref = astr[oldend:sub[0]] + head = astr[sub[0]:sub[1]] + text = astr[sub[1]:sub[2]] + oldend = sub[3] + newline = line + sub[4] + code.append(replace_re.sub(replace, pref)) + try : + envlist = parse_loop_header(head) + except ValueError as e: + msg = "line %d: %s" % (newline, e) + raise ValueError(msg) + for newenv in envlist : + newenv.update(env) + newcode = parse_string(text, newenv, newlevel, newline) + code.extend(newcode) + suff = astr[oldend:] + code.append(replace_re.sub(replace, suff)) + else : + # replace keys + code.append(replace_re.sub(replace, astr)) + code.append('\n') + return ''.join(code) + +def process_str(astr): + code = [header] + code.extend(parse_string(astr, global_names, 0, 1)) + return ''.join(code) + + +include_src_re = re.compile(r"(\n|\A)#include\s*['\"]" + r"(?P[\w\d./\\]+[.]src)['\"]", re.I) + +def resolve_includes(source): + d = os.path.dirname(source) + with open(source) as fid: + lines = [] + for line in fid: + m = include_src_re.match(line) + if m: + fn = m.group('name') + if not os.path.isabs(fn): + fn = os.path.join(d, fn) + if os.path.isfile(fn): + lines.extend(resolve_includes(fn)) + else: + lines.append(line) + else: + lines.append(line) + return lines + +def process_file(source): + lines = resolve_includes(source) + sourcefile = os.path.normcase(source).replace("\\", "\\\\") + try: + code = process_str(''.join(lines)) + except ValueError as e: + raise ValueError('In "%s" loop at %s' % (sourcefile, e)) from None + return '#line 1 "%s"\n%s' % (sourcefile, code) + + +def unique_key(adict): + # this obtains a unique key given a dictionary + # currently it works by appending together n of the letters of the + # current keys and increasing n until a unique key is found + # -- not particularly quick + allkeys = list(adict.keys()) + done = False + n = 1 + while not done: + newkey = "".join([x[:n] for x in allkeys]) + if newkey in allkeys: + n += 1 + else: + done = True + return newkey + + +def main(): + try: + file = sys.argv[1] + except IndexError: + fid = sys.stdin + 
outfile = sys.stdout + else: + fid = open(file, 'r') + (base, ext) = os.path.splitext(file) + newname = base + outfile = open(newname, 'w') + + allstr = fid.read() + try: + writestr = process_str(allstr) + except ValueError as e: + raise ValueError("In %s loop at %s" % (file, e)) from None + + outfile.write(writestr) + +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index f3cb490..6dd3ae5 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ import os import re from distutils.dep_util import newer -from numpy.distutils.conv_template import process_file as process_c_file +from _vendored.conv_template import process_file as process_c_file from os import (getcwd, environ, makedirs) from os import (getcwd, environ, makedirs) from os.path import join, exists, abspath, dirname From 2c87ff707ee2df0aafa6fedc7d26a6ada91daeda Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 8 Jan 2024 05:25:29 -0600 Subject: [PATCH 15/38] Removed duplicate import line --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 6dd3ae5..0ee7fa6 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,6 @@ from distutils.dep_util import newer from _vendored.conv_template import process_file as process_c_file from os import (getcwd, environ, makedirs) -from os import (getcwd, environ, makedirs) from os.path import join, exists, abspath, dirname from setuptools import Extension From de82deb4be79932c710bd96f60a7d942937105b0 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 8 Jan 2024 05:25:44 -0600 Subject: [PATCH 16/38] Removed hard-coded paths, updated to CMake 3.27 Used Python_add_library, instead of removed add_library followed by python_extension_module function from scikit-build. Removed superfluous comments --- CMakeLists.txt | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index edef098..c778fce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,6 @@ -cmake_minimum_required(VERSION 3.21...3.25 FATAL_ERROR) +cmake_minimum_required(VERSION 3.27...3.28 FATAL_ERROR) -if (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.24") - cmake_policy(SET CMP0135 NEW) -endif() +cmake_policy(SET CMP0135 NEW) project(mkl_umath LANGUAGES C @@ -11,7 +9,6 @@ project(mkl_umath find_package(Python COMPONENTS Interpreter Development REQUIRED) find_package(NumPy REQUIRED) -find_package(PythonExtensions REQUIRED) set(CYTHON_FLAGS "-t -w \"${CMAKE_SOURCE_DIR}\"") find_package(Cython REQUIRED) @@ -20,14 +17,6 @@ set(MKL_ARCH intel64) set(MKL_LINK sdl) set(MKL_THREADING intel_thread) set(MKL_INTERFACE ilp64) -# MKL_ARCH: None, set to ` intel64` by default -# MKL_ROOT /localdisk/work/aguzmanb/Development/miniconda3.py310/envs/numpy_umath_prefix.v5 -# MKL_DPCPP_LINK: None, set to ` dynamic` by default -# MKL_LINK: None, set to ` dynamic` by default -# MKL_DPCPP_INTERFACE_FULL: None, set to ` intel_ilp64` by default -# MKL_INTERFACE_FULL: None, set to ` intel_ilp64` by default -# MKL_DPCPP_THREADING: None, set to ` tbb_thread` by default -# MKL_THREADING: None, set to ` intel_thread` by default find_package(MKL REQUIRED) if(WIN32) @@ -97,32 +86,32 @@ set(_linker_options ${MKL_UMATH_LINKER_OPTIONS}) set(_trgt mkl_umath_loops) add_library(${_trgt} SHARED "mkl_umath/src/mkl_umath_loops.c") set_target_properties(${_trgt} PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON) -target_include_directories(${_trgt} PRIVATE "mkl_umath/src/" ${NumPy_INCLUDE_DIR} ${PYTHON_INCLUDE_DIR} 
"C:/Users/aguzmanb/Development/mambaforge/envs/mkl_umath_prefix/Library/include") +target_include_directories(${_trgt} PRIVATE "mkl_umath/src/" ${NumPy_INCLUDE_DIR} ${PYTHON_INCLUDE_DIR}) target_link_libraries(${_trgt} PRIVATE mkl_rt ${Python_LIBRARIES}) target_link_options(${_trgt} PRIVATE ${_linker_options}) +target_compile_options(${_trgt} PRIVATE -fveclib=SVML) +target_compile_options(${_trgt} PRIVATE -fvectorize) install(TARGETS ${_trgt} LIBRARY DESTINATION mkl_umath ARCHIVE DESTINATION mkl_umath RUNTIME DESTINATION mkl_umath ) -add_library(_ufuncs MODULE "mkl_umath/src/ufuncsmodule.c" "mkl_umath/src/__umath_generated.c") +Python_add_library(_ufuncs MODULE WITH_SOABI "mkl_umath/src/ufuncsmodule.c" "mkl_umath/src/__umath_generated.c") target_include_directories(_ufuncs PRIVATE "mkl_umath/src" ${NumPy_INCLUDE_DIR} ${MKL_INCLUDE_DIR}) target_compile_definitions(_ufuncs PUBLIC NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION) target_link_options(_ufuncs PRIVATE ${_linker_options}) -target_link_libraries(_ufuncs mkl_umath_loops) -python_extension_module(_ufuncs) +target_link_libraries(_ufuncs PRIVATE mkl_umath_loops) if (UNIX) set_target_properties(_ufuncs PROPERTIES INSTALL_RPATH "$ORIGIN") endif() install(TARGETS _ufuncs LIBRARY DESTINATION mkl_umath) add_cython_target(_patch "mkl_umath/src/_patch.pyx" C OUTPUT_VAR _generated_src) -add_library(_patch MODULE ${_generated_src}) +Python_add_library(_patch MODULE WITH_SOABI ${_generated_src}) target_include_directories(_patch PRIVATE "mkl_umath/src/" ${NumPy_INCLUDE_DIR} ${PYTHON_INCLUDE_DIR}) target_compile_definitions(_patch PUBLIC NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION) -target_link_libraries(_patch mkl_umath_loops) -python_extension_module(_patch) +target_link_libraries(_patch PRIVATE mkl_umath_loops) if (UNIX) set_target_properties(_patch PROPERTIES INSTALL_RPATH "$ORIGIN") endif() From ea90d0f6387a91064db6f20715141752fafac303 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 10 Jan 2024 12:47:20 -0600 Subject: [PATCH 17/38] Changes to permit vectorization of most loops by ICX Some loops are not vectorized due to compiler's cost model analysis. Added CMake option OPTIMIZATION_REPORT (OFF by default). It would instruct compiler to generate optimization report for mkl_umath library. 
--- CMakeLists.txt | 8 + mkl_umath/src/fast_loop_macros.h | 22 +- mkl_umath/src/mkl_umath_loops.c.src | 579 +++++++++++++++++----------- 3 files changed, 378 insertions(+), 231 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c778fce..6fc047f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,11 @@ project(mkl_umath DESCRIPTION "mkl_umath module" ) +option(OPTIMIZATION_REPORT + "Whether to generate optimization vectorization report" + OFF +) + find_package(Python COMPONENTS Interpreter Development REQUIRED) find_package(NumPy REQUIRED) @@ -91,6 +96,9 @@ target_link_libraries(${_trgt} PRIVATE mkl_rt ${Python_LIBRARIES}) target_link_options(${_trgt} PRIVATE ${_linker_options}) target_compile_options(${_trgt} PRIVATE -fveclib=SVML) target_compile_options(${_trgt} PRIVATE -fvectorize) +if(OPTIMIZATION_REPORT) + target_compile_options(${_trgt} PRIVATE -qopt-report=3) +endif() install(TARGETS ${_trgt} LIBRARY DESTINATION mkl_umath ARCHIVE DESTINATION mkl_umath diff --git a/mkl_umath/src/fast_loop_macros.h b/mkl_umath/src/fast_loop_macros.h index d26174c..12ef2e1 100644 --- a/mkl_umath/src/fast_loop_macros.h +++ b/mkl_umath/src/fast_loop_macros.h @@ -74,19 +74,19 @@ npy_intp is1 = steps[0], os1 = steps[1];\ npy_intp n = dimensions[0];\ npy_intp i;\ - for(i = 0; i < n; i++, ip1 += is1, op1 += os1) + for(i = 0; i < n; ++i, ip1 += is1, op1 += os1) -#define UNARY_LOOP_VECTORIZED\ - char *ip1 = args[0], *op1 = args[1];\ - npy_intp is1 = steps[0], os1 = steps[1];\ +#define UNARY_LOOP_VECTORIZED(tin, tout)\ + tin *ip1 = (tin *) args[0];\ + tout *op1 = (tout *) args[1]; \ npy_intp n = dimensions[0];\ npy_intp i;\ NPY_PRAGMA_VECTOR\ - for(i = 0; i < n; i++, ip1 += is1, op1 += os1) + for(i = 0; i < n; ++i, ++ip1, ++op1) -#define UNARY_LOOP_DISPATCH(cond, body)\ +#define UNARY_LOOP_DISPATCH(tin, tout, cond, body)\ if (cond) {\ - UNARY_LOOP_VECTORIZED { body; }\ + UNARY_LOOP_VECTORIZED(tin, tout) { body; }\ } else {\ UNARY_LOOP { body; }\ } @@ -97,7 +97,7 @@ npy_intp is1 = steps[0], os1 = steps[1], os2 = steps[2];\ npy_intp n = dimensions[0];\ npy_intp i;\ - for(i = 0; i < n; i++, ip1 += is1, op1 += os1, op2 += os2) + for(i = 0; i < n; ++i, ip1 += is1, op1 += os1, op2 += os2) /** (ip1, ip2) -> (op1) */ #define BINARY_LOOP\ @@ -105,7 +105,7 @@ npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\ npy_intp n = dimensions[0];\ npy_intp i;\ - for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) + for(i = 0; i < n; ++i, ip1 += is1, ip2 += is2, op1 += os1) /** (ip1, ip2) -> (op1, op2) */ #define BINARY_LOOP_TWO_OUT\ @@ -113,7 +113,7 @@ npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], os2 = steps[3];\ npy_intp n = dimensions[0];\ npy_intp i;\ - for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1, op2 += os2) + for(i = 0; i < n; ++i, ip1 += is1, ip2 += is2, op1 += os1, op2 += os2) /** (ip1, ip2, ip3) -> (op1) */ #define TERNARY_LOOP\ @@ -121,7 +121,7 @@ npy_intp is1 = steps[0], is2 = steps[1], is3 = steps[2], os1 = steps[3];\ npy_intp n = dimensions[0];\ npy_intp i;\ - for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, ip3 += is3, op1 += os1) + for(i = 0; i < n; ++i, ip1 += is1, ip2 += is2, ip3 += is3, op1 += os1) /** @} */ diff --git a/mkl_umath/src/mkl_umath_loops.c.src b/mkl_umath/src/mkl_umath_loops.c.src index be3e8d3..50fc7ea 100644 --- a/mkl_umath/src/mkl_umath_loops.c.src +++ b/mkl_umath/src/mkl_umath_loops.c.src @@ -41,7 +41,7 @@ #include "blocking_utils.h" #include "mkl_umath_loops.h" -/* Adapated from NumPy's source code. +/* Adapated from NumPy's source code. 
* https://github.com/numpy/numpy/blob/main/LICENSE.txt */ /* @@ -142,13 +142,13 @@ static inline npy_double spacing(npy_double x) { if (isinf(x)) - return ((npy_double) NAN); + return ((npy_double) NAN); return copysign(nextafter(fabs(x), ((npy_double) INFINITY)), x) - x; } static inline npy_float spacingf(npy_float x) { if (isinff(x)) - return ((npy_float) NAN); + return ((npy_float) NAN); return copysignf(nextafterf(fabsf(x), INFINITY), x) - x; } @@ -225,18 +225,23 @@ divmod@c@(@type@ a, @type@ b, @type@ *modulus) void mkl_umath_@TYPE@_sqrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@))) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Sqrt, dimensions[0], @type@, args[0], args[1]); /* v@c@Sqrt(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) + @type@, @type@ + , + can_vectorize , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -253,18 +258,23 @@ mkl_umath_@TYPE@_sqrt(char **args, const npy_intp *dimensions, const npy_intp *s void mkl_umath_@TYPE@_invsqrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@InvSqrt, dimensions[0], @type@, args[0], args[1]); /* v@c@InvSqrt(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) + @type@, @type@ + , + can_vectorize , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -282,24 +292,26 @@ mkl_umath_@TYPE@_invsqrt(char **args, const npy_intp *dimensions, const npy_intp void mkl_umath_@TYPE@_exp(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; int ignore_fpstatus = 0; - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@))) { + if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { ignore_fpstatus = 1; CHUNKED_VML_CALL2(v@c@Exp, dimensions[0], @type@, args[0], args[1]); /* v@c@Exp(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) + @type@, @type@ + , + can_vectorize , const @type@ in1 = *(@type@ *)ip1; - if(in1 == -NPY_INFINITY@A@){ - ignore_fpstatus = 1; - } + ignore_fpstatus |= ((in1 == 
-NPY_INFINITY@A@) ? 1 : 0); *(@type@ *)op1 = @scalarf@(in1); - ) + ) } if(ignore_fpstatus) { feclearexcept(FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW | FE_INVALID); @@ -320,8 +332,14 @@ mkl_umath_@TYPE@_exp(char **args, const npy_intp *dimensions, const npy_intp *st void mkl_umath_@TYPE@_exp2(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) + @type@, @type@ + , + can_vectorize , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); @@ -341,18 +359,22 @@ mkl_umath_@TYPE@_exp2(char **args, const npy_intp *dimensions, const npy_intp *s void mkl_umath_@TYPE@_expm1(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if(can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) { CHUNKED_VML_CALL2(v@c@Expm1, dimensions[0], @type@, args[0], args[1]); /* v@c@Expm1(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -369,18 +391,23 @@ mkl_umath_@TYPE@_expm1(char **args, const npy_intp *dimensions, const npy_intp * void mkl_umath_@TYPE@_erf(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Erf, dimensions[0], @type@, args[0], args[1]); /* v@c@Erf(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -397,18 +424,23 @@ mkl_umath_@TYPE@_erf(char **args, const npy_intp *dimensions, const npy_intp *st void mkl_umath_@TYPE@_log(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Ln, dimensions[0], @type@, args[0], args[1]); /* 
v@c@Ln(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -426,8 +458,14 @@ mkl_umath_@TYPE@_log(char **args, const npy_intp *dimensions, const npy_intp *st void mkl_umath_@TYPE@_log2(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) + @type@, @type@ + , + can_vectorize , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); @@ -447,18 +485,23 @@ mkl_umath_@TYPE@_log2(char **args, const npy_intp *dimensions, const npy_intp *s void mkl_umath_@TYPE@_log10(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Log10, dimensions[0], @type@, args[0], args[1]); /* v@c@Log10(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -475,18 +518,23 @@ mkl_umath_@TYPE@_log10(char **args, const npy_intp *dimensions, const npy_intp * void mkl_umath_@TYPE@_log1p(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Log1p, dimensions[0], @type@, args[0], args[1]); /* v@c@Log1p(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -503,18 +551,23 @@ mkl_umath_@TYPE@_log1p(char **args, const npy_intp *dimensions, const npy_intp * void mkl_umath_@TYPE@_cos(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if( 
can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Cos, dimensions[0], @type@, args[0], args[1]); /* v@c@Cos(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -531,18 +584,23 @@ mkl_umath_@TYPE@_cos(char **args, const npy_intp *dimensions, const npy_intp *st void mkl_umath_@TYPE@_sin(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Sin, dimensions[0], @type@, args[0], args[1]); /* v@c@Sin(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -559,18 +617,23 @@ mkl_umath_@TYPE@_sin(char **args, const npy_intp *dimensions, const npy_intp *st void mkl_umath_@TYPE@_tan(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Tan, dimensions[0], @type@, args[0], args[1]); /* v@c@Tan(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -587,18 +650,23 @@ mkl_umath_@TYPE@_tan(char **args, const npy_intp *dimensions, const npy_intp *st void mkl_umath_@TYPE@_arccos(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Acos, dimensions[0], @type@, args[0], args[1]); /* v@c@Acos(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -615,18 +683,23 @@ mkl_umath_@TYPE@_arccos(char **args, const 
npy_intp *dimensions, const npy_intp void mkl_umath_@TYPE@_arcsin(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Asin, dimensions[0], @type@, args[0], args[1]); /* v@c@Asin(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -643,18 +716,23 @@ mkl_umath_@TYPE@_arcsin(char **args, const npy_intp *dimensions, const npy_intp void mkl_umath_@TYPE@_arctan(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Atan, dimensions[0], @type@, args[0], args[1]); /* v@c@Atan(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -671,18 +749,23 @@ mkl_umath_@TYPE@_arctan(char **args, const npy_intp *dimensions, const npy_intp void mkl_umath_@TYPE@_cosh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Cosh, dimensions[0], @type@, args[0], args[1]); /* v@c@Cosh(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -699,18 +782,23 @@ mkl_umath_@TYPE@_cosh(char **args, const npy_intp *dimensions, const npy_intp *s void mkl_umath_@TYPE@_sinh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize 
= contig && disjoint_or_same; + + if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Sinh, dimensions[0], @type@, args[0], args[1]); /* v@c@Sinh(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -727,18 +815,23 @@ mkl_umath_@TYPE@_sinh(char **args, const npy_intp *dimensions, const npy_intp *s void mkl_umath_@TYPE@_tanh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Tanh, dimensions[0], @type@, args[0], args[1]); /* v@c@Tanh(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -755,18 +848,23 @@ mkl_umath_@TYPE@_tanh(char **args, const npy_intp *dimensions, const npy_intp *s void mkl_umath_@TYPE@_arccosh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Acosh, dimensions[0], @type@, args[0], args[1]); /* v@c@Acosh(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -783,18 +881,23 @@ mkl_umath_@TYPE@_arccosh(char **args, const npy_intp *dimensions, const npy_intp void mkl_umath_@TYPE@_arcsinh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Asinh, dimensions[0], @type@, args[0], args[1]); /* v@c@Asinh(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -811,18 
+914,23 @@ mkl_umath_@TYPE@_arcsinh(char **args, const npy_intp *dimensions, const npy_intp void mkl_umath_@TYPE@_arctanh(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Atanh, dimensions[0], @type@, args[0], args[1]); /* v@c@Atanh(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -839,8 +947,14 @@ mkl_umath_@TYPE@_arctanh(char **args, const npy_intp *dimensions, const npy_intp void mkl_umath_@TYPE@_fabs(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) + @type@, @type@ + , + can_vectorize , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); @@ -860,18 +974,23 @@ mkl_umath_@TYPE@_fabs(char **args, const npy_intp *dimensions, const npy_intp *s void mkl_umath_@TYPE@_floor(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(steps[0] == sizeof(@type@) && steps[1] == sizeof(@type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Floor, dimensions[0], @type@, args[0], args[1]); /* v@c@Floor(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -888,18 +1007,23 @@ mkl_umath_@TYPE@_floor(char **args, const npy_intp *dimensions, const npy_intp * void mkl_umath_@TYPE@_ceil(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Ceil, dimensions[0], @type@, args[0], args[1]); /* v@c@Ceil(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], 
dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -916,18 +1040,23 @@ mkl_umath_@TYPE@_ceil(char **args, const npy_intp *dimensions, const npy_intp *s void mkl_umath_@TYPE@_rint(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(steps[0] == sizeof(@type@) && steps[1] == sizeof(@type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Rint, dimensions[0], @type@, args[0], args[1]); /* v@c@Rint(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -944,18 +1073,23 @@ mkl_umath_@TYPE@_rint(char **args, const npy_intp *dimensions, const npy_intp *s void mkl_umath_@TYPE@_trunc(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if( can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Trunc, dimensions[0], @type@, args[0], args[1]); /* v@c@Trunc(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -972,18 +1106,23 @@ mkl_umath_@TYPE@_trunc(char **args, const npy_intp *dimensions, const npy_intp * void mkl_umath_@TYPE@_cbrt(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)) { - if(IS_UNARY_CONT(@type@, @type@) && - dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD && - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) ) { + const int contig = IS_UNARY_CONT(@type@, @type@); + const int disjoint_or_same = DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)); + const int can_vectorize = contig && disjoint_or_same; + + if (can_vectorize && dimensions[0] > VML_TRANSCEDENTAL_THRESHOLD) + { CHUNKED_VML_CALL2(v@c@Cbrt, dimensions[0], @type@, args[0], args[1]); /* v@c@Cbrt(dimensions[0], (@type@*) args[0], (@type@*) args[1]); */ } else { UNARY_LOOP_DISPATCH( - DISJOINT_OR_SAME(args[0], args[1], dimensions[0], sizeof(@type@)) - , + @type@, @type@ + , + can_vectorize + , const @type@ in1 = *(@type@ *)ip1; *(@type@ *)op1 = @scalarf@(in1); - ) + ) } } @@ -1126,19 +1265,19 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp @type@ *op1_shifted = op1 + peel; @type@ *ip2_shifted = ip2 + peel; - if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && - DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { - 
NPY_ASSUME_ALIGNED(ip1_aligned, 64) - NPY_PRAGMA_VECTOR - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } else { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } + if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && + DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + NPY_PRAGMA_VECTOR + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; + } + } else { + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; + } + } i = blocked_end; } @@ -1294,19 +1433,19 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp @type@ *ip2_shifted = ip2 + peel; @type@ *op1_shifted = op1 + peel; - if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && - DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - NPY_PRAGMA_VECTOR - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } else { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } + if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && + DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + NPY_PRAGMA_VECTOR + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; + } + } else { + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; + } + } i = blocked_end; } @@ -1462,19 +1601,19 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp @type@ *ip2_shifted = ip2 + peel; @type@ *op1_shifted = op1 + peel; - if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && - DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - NPY_PRAGMA_VECTOR - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } else { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } + if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && + DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + NPY_PRAGMA_VECTOR + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; + } + } else { + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; + } + } i = blocked_end; } @@ -1619,37 +1758,37 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp const npy_intp blocked_end = npy_blocked_end(peel, sizeof(@type@), vsize, n); npy_intp i; - NPY_PRAGMA_NOVECTOR + NPY_PRAGMA_NOVECTOR for(i = 0; i < peel; i++) { op1[i] = ip1[i] @OP@ ip2[i]; } { npy_intp j, j_max = blocked_end - peel; - j_max &= (~0xf); - const npy_intp blocked_end = j_max + peel; + j_max &= (~0xf); + const npy_intp blocked_end = j_max + peel; if (j_max > 0) { @type@ *ip1_aligned = ip1 + peel, *op1_shifted = op1 + peel, *ip2_shifted = ip2 + peel; - if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && - DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - NPY_PRAGMA_VECTOR - for(j = 0; j < j_max; j++) { - op1_shifted[j] = 
ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } else { - NPY_ASSUME_ALIGNED(ip1_aligned, 64) - for(j = 0; j < j_max; j++) { - op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; - } - } + if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && + DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + NPY_PRAGMA_VECTOR + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; + } + } else { + NPY_ASSUME_ALIGNED(ip1_aligned, 64) + for(j = 0; j < j_max; j++) { + op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; + } + } i = blocked_end; } } - NPY_PRAGMA_NOVECTOR + NPY_PRAGMA_NOVECTOR for(; i < n; i++) { op1[i] = ip1[i] @OP@ ip2[i]; } @@ -1665,7 +1804,7 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp npy_intp i; const @type@ ip1c = ip1[0]; - NPY_PRAGMA_NOVECTOR + NPY_PRAGMA_NOVECTOR for(i = 0; i < peel; i++) { op1[i] = ip1c @OP@ ip2[i]; } @@ -1684,7 +1823,7 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp } } - NPY_PRAGMA_NOVECTOR + NPY_PRAGMA_NOVECTOR for(; i < n; i++) { op1[i] = ip1c @OP@ ip2[i]; } @@ -1699,7 +1838,7 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp npy_intp i; const @type@ ip2c = ip2[0]; - NPY_PRAGMA_NOVECTOR + NPY_PRAGMA_NOVECTOR for(i = 0; i < peel; i++) { op1[i] = ip1[i] @OP@ ip2c; } @@ -1718,7 +1857,7 @@ mkl_umath_@TYPE@_@kind@(char **args, const npy_intp *dimensions, const npy_intp } } - NPY_PRAGMA_NOVECTOR + NPY_PRAGMA_NOVECTOR for(; i < n; i++) { op1[i] = ip1[i] @OP@ ip2c; } @@ -2147,13 +2286,13 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n, for (i = 8; i < n - (n % 8); i += 8) { /* small blocksizes seems to mess with hardware prefetch */ NPY_PREFETCH(a + (i + 512 /(npy_intp)sizeof(@ftype@))*stride, 0, 3); - r[0] += *((@ftype@ *)(a + (i + 0) * stride)); + r[0] += *((@ftype@ *)(a + (i + 0) * stride)); r[1] += *((@ftype@ *)(a + (i + 0) * stride + sizeof(@ftype@))); - r[2] += *((@ftype@ *)(a + (i + 2) * stride)); + r[2] += *((@ftype@ *)(a + (i + 2) * stride)); r[3] += *((@ftype@ *)(a + (i + 2) * stride + sizeof(@ftype@))); - r[4] += *((@ftype@ *)(a + (i + 4) * stride)); + r[4] += *((@ftype@ *)(a + (i + 4) * stride)); r[5] += *((@ftype@ *)(a + (i + 4) * stride + sizeof(@ftype@))); - r[6] += *((@ftype@ *)(a + (i + 6) * stride)); + r[6] += *((@ftype@ *)(a + (i + 6) * stride)); r[7] += *((@ftype@ *)(a + (i + 6) * stride + sizeof(@ftype@))); } From 9d1e3a3877961a4fd9685ff852909077131beb77 Mon Sep 17 00:00:00 2001 From: "Komarova, Evseniia" Date: Tue, 11 Jun 2024 20:09:11 +0200 Subject: [PATCH 18/38] add c99 standard --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6fc047f..4f5990c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,6 +91,7 @@ set(_linker_options ${MKL_UMATH_LINKER_OPTIONS}) set(_trgt mkl_umath_loops) add_library(${_trgt} SHARED "mkl_umath/src/mkl_umath_loops.c") set_target_properties(${_trgt} PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON) +set_target_properties(${_trgt} PROPERTIES C_STANDARD 99) target_include_directories(${_trgt} PRIVATE "mkl_umath/src/" ${NumPy_INCLUDE_DIR} ${PYTHON_INCLUDE_DIR}) target_link_libraries(${_trgt} PRIVATE mkl_rt ${Python_LIBRARIES}) target_link_options(${_trgt} PRIVATE ${_linker_options}) From e26ba4d7a08a3f5c15419665d11342ecebb9b4f5 Mon Sep 17 00:00:00 2001 From: "Komarova, Evseniia" Date: Wed, 12 Jun 2024 15:53:08 +0200 Subject: [PATCH 19/38] add high precision flags 
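
Illustrative summary of the flags being added (see the diff below for the
authoritative list): on Linux the icx compile line for mkl_umath_loops gains
roughly

    -prec-sqrt -fprotect-parens -fimf-precision=high -fp-model fast=2

with the MSVC-style equivalents (/Qprec-sqrt, /Qprotect-parens,
/Qimf-precision=high, /fp:fast=2) used on Windows; the exact flag set is
assembled into CMAKE_C_FLAGS.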
--- CMakeLists.txt | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4f5990c..8654308 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,7 +40,13 @@ if(WIN32) "/GS " "/DynamicBase " ) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Ox ${WARNING_FLAGS} ${SDL_FLAGS}") + string(CONCAT PRECISION_FLAGS + "/fp:fast=2 " + "/Qimf-precision=high " + "/Qprec-sqrt " + "/Qprotect-parens " + ) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Ox ${WARNING_FLAGS} ${SDL_FLAGS} ${PRECISION_FLAGS}") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG" ) @@ -73,7 +79,13 @@ elseif(UNIX) "${WARNING_FLAGS}" "${SDL_FLAGS}" ) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 ${CFLAGS}") + string(CONCAT PRECISION_FLAGS + "-prec-sqrt " + "-fprotect-parens " + "-fimf-precision=high " + "-fp-model fast=2 " + ) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 ${CFLAGS} ${PRECISION_FLAGS}") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} ${CFLAGS} -O0 -g1 -DDEBUG" ) @@ -90,8 +102,10 @@ set(_linker_options ${MKL_UMATH_LINKER_OPTIONS}) set(_trgt mkl_umath_loops) add_library(${_trgt} SHARED "mkl_umath/src/mkl_umath_loops.c") -set_target_properties(${_trgt} PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON) -set_target_properties(${_trgt} PROPERTIES C_STANDARD 99) +set_target_properties(${_trgt} PROPERTIES + CMAKE_POSITION_INDEPENDENT_CODE ON + C_STANDARD 99 +) target_include_directories(${_trgt} PRIVATE "mkl_umath/src/" ${NumPy_INCLUDE_DIR} ${PYTHON_INCLUDE_DIR}) target_link_libraries(${_trgt} PRIVATE mkl_rt ${Python_LIBRARIES}) target_link_options(${_trgt} PRIVATE ${_linker_options}) From 2d2448099999285735f059126066a0d10c5395a8 Mon Sep 17 00:00:00 2001 From: "Komarova, Evseniia" Date: Mon, 17 Jun 2024 12:41:25 +0200 Subject: [PATCH 20/38] replace test_basic with pytest --- conda-recipe/meta.yaml | 4 +++- mkl_umath/tests/test_basic.py | 37 +++++++++++++++++------------------ 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 9f98829..ec41497 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -37,10 +37,12 @@ requirements: - {{ pin_compatible('numpy') }} test: + requires: + - pytest source_files: - mkl_umath/tests/test_basic.py commands: - - python mkl_umath/tests/test_basic.py + - pytest mkl_umath/tests/test_basic.py imports: - mkl_umath - mkl_umath._ufuncs diff --git a/mkl_umath/tests/test_basic.py b/mkl_umath/tests/test_basic.py index 1a9fc53..f0b4ae8 100644 --- a/mkl_umath/tests/test_basic.py +++ b/mkl_umath/tests/test_basic.py @@ -23,6 +23,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import pytest import numpy as np import mkl_umath._ufuncs as mu import numpy.core.umath as nu @@ -49,11 +50,8 @@ def get_args(args_str): return tuple(args) umaths = [i for i in dir(mu) if isinstance(getattr(mu, i), np.ufunc)] - umaths.remove('arccosh') # expects input greater than 1 -# dictionary with test cases -# (umath, types) : args generated_cases = {} for umath in umaths: mkl_umath = getattr(mu, umath) @@ -64,29 +62,30 @@ def get_args(args_str): generated_cases[(umath, type)] = args additional_cases = { -('arccosh', 'f->f') : (np.single(np.random.random_sample() + 1),), -('arccosh', 'd->d') : (np.double(np.random.random_sample() + 1),), + ('arccosh', 'f->f'): (np.single(np.random.random_sample() + 1),), + ('arccosh', 'd->d'): (np.double(np.random.random_sample() + 1),), } -test_cases = {} -for d in (generated_cases, additional_cases): - test_cases.update(d) +test_cases = {**generated_cases, **additional_cases} -for case in test_cases: - umath = case[0] - type = case[1] +@pytest.mark.parametrize("case", list(test_cases.keys())) +def test_umath(case): + umath, type = case args = test_cases[case] mkl_umath = getattr(mu, umath) np_umath = getattr(nu, umath) print('*'*80) - print(umath, type) - print("args", args) + print(f"Testing {umath} with type {type}") + print("args:", args) + mkl_res = mkl_umath(*args) np_res = np_umath(*args) - print("mkl res", mkl_res) - print("npy res", np_res) - - assert np.allclose(mkl_res, np_res) + + print("mkl res:", mkl_res) + print("npy res:", np_res) + + assert np.allclose(mkl_res, np_res), f"Results for {umath} do not match" -print("Test cases count:", len(test_cases)) -print("All looks good!") +def test_cases_count(): + print("Test cases count:", len(test_cases)) + assert len(test_cases) > 0, "No test cases found" From 372bf68e998d0394c2b27f82a2110fdeba52fa8e Mon Sep 17 00:00:00 2001 From: "Komarova, Evseniia" Date: Tue, 18 Jun 2024 17:22:19 +0200 Subject: [PATCH 21/38] convert the generated integer to numpy.int64 using type --- mkl_umath/tests/test_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkl_umath/tests/test_basic.py b/mkl_umath/tests/test_basic.py index f0b4ae8..88770a9 100644 --- a/mkl_umath/tests/test_basic.py +++ b/mkl_umath/tests/test_basic.py @@ -44,7 +44,7 @@ def get_args(args_str): elif s == 'i': args.append(np.int_(np.random.randint(low=1, high=10))) elif s == 'l': - args.append(np.longlong(np.random.randint(low=1, high=10))) + args.append(np.dtype('long').type(np.random.randint(low=1, high=10))) else: raise ValueError("Unexpected type specified!") return tuple(args) From 2cc4dd62d5fc3bac3233db3bd9d750684ca3637d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 11 Sep 2024 13:07:02 -0700 Subject: [PATCH 22/38] Changes to enable compilation with NumPy 2 --- CMakeLists.txt | 18 +++++++++--------- mkl_umath/src/mkl_umath_loops.c.src | 1 + mkl_umath/src/ufuncsmodule.h | 1 + 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8654308..676daae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,8 +20,6 @@ find_package(Cython REQUIRED) set(MKL_ARCH intel64) set(MKL_LINK sdl) -set(MKL_THREADING intel_thread) -set(MKL_INTERFACE ilp64) find_package(MKL REQUIRED) if(WIN32) @@ -101,16 +99,16 @@ set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) set(_linker_options ${MKL_UMATH_LINKER_OPTIONS}) set(_trgt mkl_umath_loops) -add_library(${_trgt} SHARED "mkl_umath/src/mkl_umath_loops.c") +add_library(${_trgt} SHARED mkl_umath/src/mkl_umath_loops.c) 
set_target_properties(${_trgt} PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON C_STANDARD 99 ) -target_include_directories(${_trgt} PRIVATE "mkl_umath/src/" ${NumPy_INCLUDE_DIR} ${PYTHON_INCLUDE_DIR}) -target_link_libraries(${_trgt} PRIVATE mkl_rt ${Python_LIBRARIES}) -target_link_options(${_trgt} PRIVATE ${_linker_options}) -target_compile_options(${_trgt} PRIVATE -fveclib=SVML) -target_compile_options(${_trgt} PRIVATE -fvectorize) +target_include_directories(${_trgt} PUBLIC mkl_umath/src/ ${NumPy_INCLUDE_DIR} ${PYTHON_INCLUDE_DIR}) +target_link_libraries(${_trgt} PUBLIC MKL::MKL ${Python_LIBRARIES}) +target_link_options(${_trgt} PUBLIC ${_linker_options}) +target_compile_options(${_trgt} PUBLIC -fveclib=SVML) +target_compile_options(${_trgt} PUBLIC -fvectorize) if(OPTIMIZATION_REPORT) target_compile_options(${_trgt} PRIVATE -qopt-report=3) endif() @@ -120,11 +118,12 @@ install(TARGETS ${_trgt} RUNTIME DESTINATION mkl_umath ) -Python_add_library(_ufuncs MODULE WITH_SOABI "mkl_umath/src/ufuncsmodule.c" "mkl_umath/src/__umath_generated.c") +python_add_library(_ufuncs MODULE WITH_SOABI "mkl_umath/src/ufuncsmodule.c" "mkl_umath/src/__umath_generated.c") target_include_directories(_ufuncs PRIVATE "mkl_umath/src" ${NumPy_INCLUDE_DIR} ${MKL_INCLUDE_DIR}) target_compile_definitions(_ufuncs PUBLIC NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION) target_link_options(_ufuncs PRIVATE ${_linker_options}) target_link_libraries(_ufuncs PRIVATE mkl_umath_loops) +set_target_properties(_ufuncs PROPERTIES C_STANDARD 99) if (UNIX) set_target_properties(_ufuncs PROPERTIES INSTALL_RPATH "$ORIGIN") endif() @@ -135,6 +134,7 @@ Python_add_library(_patch MODULE WITH_SOABI ${_generated_src}) target_include_directories(_patch PRIVATE "mkl_umath/src/" ${NumPy_INCLUDE_DIR} ${PYTHON_INCLUDE_DIR}) target_compile_definitions(_patch PUBLIC NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION) target_link_libraries(_patch PRIVATE mkl_umath_loops) +set_target_properties(_patch PROPERTIES C_STANDARD 99) if (UNIX) set_target_properties(_patch PROPERTIES INSTALL_RPATH "$ORIGIN") endif() diff --git a/mkl_umath/src/mkl_umath_loops.c.src b/mkl_umath/src/mkl_umath_loops.c.src index 50fc7ea..92d62c6 100644 --- a/mkl_umath/src/mkl_umath_loops.c.src +++ b/mkl_umath/src/mkl_umath_loops.c.src @@ -32,6 +32,7 @@ #include "Python.h" #define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define NP_IMPORT_ARRAY #include "numpy/npy_common.h" #include "numpy/ndarraytypes.h" diff --git a/mkl_umath/src/ufuncsmodule.h b/mkl_umath/src/ufuncsmodule.h index 2526763..acb6bbd 100644 --- a/mkl_umath/src/ufuncsmodule.h +++ b/mkl_umath/src/ufuncsmodule.h @@ -25,6 +25,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "Python.h" +#define PY_ARRAY_UNIQUE_SYMBOL mkl_umath_ufunc_ext #include "numpy/arrayobject.h" #include "numpy/ndarraytypes.h" #include "numpy/ufuncobject.h" From 3377d38bf6a1a1f103959190f96f0437325e00d6 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 11 Sep 2024 17:52:38 -0700 Subject: [PATCH 23/38] Provide w/a for ICC and recent libmmd library --- mkl_umath/src/mkl_umath_loops.c.src | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mkl_umath/src/mkl_umath_loops.c.src b/mkl_umath/src/mkl_umath_loops.c.src index 92d62c6..86a62c4 100644 --- a/mkl_umath/src/mkl_umath_loops.c.src +++ b/mkl_umath/src/mkl_umath_loops.c.src @@ -154,6 +154,11 @@ static inline npy_float spacingf(npy_float x) { return copysignf(nextafterf(fabsf(x), INFINITY), x) - x; } +#if defined(_MSC_VER) && defined(__INTEL_COMPILER) +extern __inline float __cdecl ldexpf( float _X, int _Y) { + return (float)ldexp(_X, _Y); +} +#endif /**begin repeat * Float types From ea9ef2246481d8eb039d3bea6c5af2bc7df8c57e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 17 Sep 2024 08:59:03 -0500 Subject: [PATCH 24/38] Find NumPy as Python component Adjust variables after move to use NumPy as Python component --- CMakeLists.txt | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 676daae..a4e3533 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,13 +12,17 @@ option(OPTIMIZATION_REPORT OFF ) -find_package(Python COMPONENTS Interpreter Development REQUIRED) -find_package(NumPy REQUIRED) +find_package(Python COMPONENTS Interpreter Development NumPy REQUIRED) + +# Print out the discovered paths +include(CMakePrintHelpers) +cmake_print_variables(Python_INCLUDE_DIRS) +cmake_print_variables(Python_LIBRARIES) +cmake_print_variables(Python_NumPy_INCLUDE_DIRS) set(CYTHON_FLAGS "-t -w \"${CMAKE_SOURCE_DIR}\"") find_package(Cython REQUIRED) -set(MKL_ARCH intel64) set(MKL_LINK sdl) find_package(MKL REQUIRED) @@ -104,7 +108,7 @@ set_target_properties(${_trgt} PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON C_STANDARD 99 ) -target_include_directories(${_trgt} PUBLIC mkl_umath/src/ ${NumPy_INCLUDE_DIR} ${PYTHON_INCLUDE_DIR}) +target_include_directories(${_trgt} PUBLIC mkl_umath/src/ ${Python_NumPy_INCLUDE_DIRS} ${Python_INCLUDE_DIRS}) target_link_libraries(${_trgt} PUBLIC MKL::MKL ${Python_LIBRARIES}) target_link_options(${_trgt} PUBLIC ${_linker_options}) target_compile_options(${_trgt} PUBLIC -fveclib=SVML) @@ -119,7 +123,7 @@ install(TARGETS ${_trgt} ) python_add_library(_ufuncs MODULE WITH_SOABI "mkl_umath/src/ufuncsmodule.c" "mkl_umath/src/__umath_generated.c") -target_include_directories(_ufuncs PRIVATE "mkl_umath/src" ${NumPy_INCLUDE_DIR} ${MKL_INCLUDE_DIR}) +target_include_directories(_ufuncs PRIVATE "mkl_umath/src" ${Python_NumPy_INCLUDE_DIRS} ${MKL_INCLUDE_DIR}) target_compile_definitions(_ufuncs PUBLIC NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION) target_link_options(_ufuncs PRIVATE ${_linker_options}) target_link_libraries(_ufuncs PRIVATE mkl_umath_loops) @@ -131,7 +135,7 @@ install(TARGETS _ufuncs LIBRARY DESTINATION mkl_umath) add_cython_target(_patch "mkl_umath/src/_patch.pyx" C OUTPUT_VAR _generated_src) Python_add_library(_patch MODULE WITH_SOABI ${_generated_src}) -target_include_directories(_patch PRIVATE "mkl_umath/src/" ${NumPy_INCLUDE_DIR} ${PYTHON_INCLUDE_DIR}) +target_include_directories(_patch PRIVATE "mkl_umath/src/" ${Python_NumPy_INCLUDE_DIRS} ${Python_INCLUDE_DIRS}) target_compile_definitions(_patch PUBLIC 
NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION) target_link_libraries(_patch PRIVATE mkl_umath_loops) set_target_properties(_patch PROPERTIES C_STANDARD 99) From 8fcf5e4ae4318ce333a2bf84a2b5c99a0af6bfac Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 17 Sep 2024 09:53:18 -0500 Subject: [PATCH 25/38] _patch is to use language_level=3 --- mkl_umath/src/_patch.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkl_umath/src/_patch.pyx b/mkl_umath/src/_patch.pyx index 5814d54..fd78f8d 100644 --- a/mkl_umath/src/_patch.pyx +++ b/mkl_umath/src/_patch.pyx @@ -24,7 +24,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # distutils: language = c -# cython: language_level=2 +# cython: language_level=3 import mkl_umath._ufuncs as mu import numpy.core.umath as nu From 826a3308eec9ad2dd65f6dc14ae7c9549b92acd2 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 18 Sep 2024 08:05:19 -0500 Subject: [PATCH 26/38] Replace use of -c intel channel, replace use of -c main Introduce conda-recipe-cf which does not depend on numpy-base. Use it in conda-packages workflow to enable building for wider range of Python versions than what is included in IDP. --- .github/workflows/conda-package.yml | 18 +++++----- conda-recipe-cf/bld.bat | 25 +++++++++++++ conda-recipe-cf/build.sh | 23 ++++++++++++ conda-recipe-cf/meta.yaml | 54 +++++++++++++++++++++++++++++ conda-recipe-cf/run_tests.bat | 1 + conda-recipe-cf/run_tests.sh | 1 + conda-recipe/meta.yaml | 2 +- 7 files changed, 114 insertions(+), 10 deletions(-) create mode 100644 conda-recipe-cf/bld.bat create mode 100644 conda-recipe-cf/build.sh create mode 100644 conda-recipe-cf/meta.yaml create mode 100644 conda-recipe-cf/run_tests.bat create mode 100644 conda-recipe-cf/run_tests.sh diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index b890920..0eefa64 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python: ['3.10'] + python: ['3.10', '3.11', '3.12'] steps: - uses: actions/checkout@v3 with: @@ -38,7 +38,7 @@ jobs: run: conda install conda-build - name: Build conda package run: | - CHANNELS="-c conda-forge -c intel --override-channels" + CHANNELS="-c conda-forge -c https://software.repos.intel.com/python/conda --override-channels" VERSIONS="--python ${{ matrix.python }}" TEST="--no-test" @@ -46,7 +46,7 @@ jobs: $TEST \ $VERSIONS \ $CHANNELS \ - conda-recipe + conda-recipe-cf - name: Upload artifact uses: actions/upload-artifact@v3 with: @@ -59,12 +59,12 @@ jobs: strategy: matrix: - python: ['3.10'] + python: ['3.10', '3.11', '3.12'] experimental: [false] runner: [ubuntu-latest] continue-on-error: ${{ matrix.experimental }} env: - CHANNELS: -c intel -c main --override-channels + CHANNELS: -c conda-forge -c https://software.repos.intel.com/python/conda --override-channels steps: - name: Download artifact @@ -121,7 +121,7 @@ jobs: strategy: matrix: - python: ['3.10'] + python: ['3.10', '3.11', '3.12'] env: conda-bld: C:\Miniconda\conda-bld\win-64\ steps: @@ -147,7 +147,7 @@ jobs: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-python-${{ matrix.python }}- ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}- - name: Build conda package - run: conda build --no-test --python ${{ matrix.python }} -c intel -c conda-forge --override-channels conda-recipe + run: conda build --no-test --python ${{ matrix.python }} -c conda-forge -c https://software.repos.intel.com/python/conda 
--override-channels conda-recipe-cf - name: Upload artifact uses: actions/upload-artifact@v3 with: @@ -162,13 +162,13 @@ jobs: shell: cmd /C CALL {0} strategy: matrix: - python: ['3.10'] + python: ['3.10', '3.11', '3.12'] experimental: [false] runner: [windows-latest] continue-on-error: ${{ matrix.experimental }} env: workdir: '${{ github.workspace }}' - CHANNELS: -c intel -c conda-forge --override-channels + CHANNELS: -c conda-forge -c https://software.repos.intel.com/python/conda --override-channels steps: - name: Download artifact diff --git a/conda-recipe-cf/bld.bat b/conda-recipe-cf/bld.bat new file mode 100644 index 0000000..e27318d --- /dev/null +++ b/conda-recipe-cf/bld.bat @@ -0,0 +1,25 @@ +REM A workaround for activate-dpcpp.bat issue to be addressed in 2021.4 +set "LIB=%BUILD_PREFIX%\Library\lib;%BUILD_PREFIX%\compiler\lib;%LIB%" +set "INCLUDE=%BUILD_PREFIX%\include;%INCLUDE%" + +"%PYTHON%" setup.py clean --all +set "SKBUILD_ARGS=-G Ninja -- -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON" + +FOR %%V IN (14.0.0 14 15.0.0 15 16.0.0 16 17.0.0 17) DO @( + REM set DIR_HINT if directory exists + IF EXIST "%BUILD_PREFIX%\Library\lib\clang\%%V\" ( + SET "SYCL_INCLUDE_DIR_HINT=%BUILD_PREFIX%\Library\lib\clang\%%V" + ) +) + +if NOT "%WHEELS_OUTPUT_FOLDER%"=="" ( + rem Install and assemble wheel package from the build bits + "%PYTHON%" setup.py install bdist_wheel %SKBUILD_ARGS% + if errorlevel 1 exit 1 + copy dist\mkl_umath*.whl %WHEELS_OUTPUT_FOLDER% + if errorlevel 1 exit 1 +) ELSE ( + rem Only install + "%PYTHON%" setup.py install %SKBUILD_ARGS% + if errorlevel 1 exit 1 +) diff --git a/conda-recipe-cf/build.sh b/conda-recipe-cf/build.sh new file mode 100644 index 0000000..fc4459c --- /dev/null +++ b/conda-recipe-cf/build.sh @@ -0,0 +1,23 @@ +# This is necessary to help DPC++ find Intel libraries such as SVML, IRNG, etc in build prefix +export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${BUILD_PREFIX}/lib" + +# Intel LLVM must cooperate with compiler and sysroot from conda +echo "--gcc-toolchain=${BUILD_PREFIX} --sysroot=${BUILD_PREFIX}/${HOST}/sysroot -target ${HOST}" > icx_for_conda.cfg +export ICXCFG="$(pwd)/icx_for_conda.cfg" + +export CMAKE_GENERATOR="Ninja" +SKBUILD_ARGS="-- -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON" + +if [ -n "${WHEELS_OUTPUT_FOLDER}" ]; then + # Install packages and assemble wheel package from built bits + if [ "$CONDA_PY" == "36" ]; then + WHEELS_BUILD_ARGS="-p manylinux1_x86_64" + else + WHEELS_BUILD_ARGS="-p manylinux2014_x86_64" + fi + ${PYTHON} setup.py install bdist_wheel ${WHEELS_BUILD_ARGS} ${SKBUILD_ARGS} + cp dist/mkl_umath*.whl ${WHEELS_OUTPUT_FOLDER} +else + # Perform regular install + ${PYTHON} setup.py install ${SKBUILD_ARGS} +fi diff --git a/conda-recipe-cf/meta.yaml b/conda-recipe-cf/meta.yaml new file mode 100644 index 0000000..4ecf657 --- /dev/null +++ b/conda-recipe-cf/meta.yaml @@ -0,0 +1,54 @@ +{% set version = "0.1.2" %} +{% set buildnumber = 0 %} + +package: + name: mkl_umath + version: {{ version }} + +source: + path: ../ + +build: + number: {{ buildnumber }} + ignore_run_exports: + - blas + +requirements: + build: + - {{ compiler('c') }} + - {{ compiler('cxx') }} + - {{ compiler('dpcpp') }} >=2024.2 # [not osx] + - sysroot_linux-64 >=2.28 # [linux] + host: + - setuptools + - cmake + - ninja + - git + - cython + - scikit-build + - python + - mkl-devel + - numpy + run: + - python + - mkl + - mkl-service + - {{ pin_compatible('intel-cmplr-lib-rt') }} + +test: + requires: + - pytest + source_files: + - 
mkl_umath/tests/test_basic.py + commands: + - pytest mkl_umath/tests/test_basic.py + imports: + - mkl_umath + - mkl_umath._ufuncs + - mkl_umath._patch + +about: + home: http://github.com/IntelPython/mkl_umath + license: BSD-3 + license_file: LICENSE.txt + summary: Universal functions for real and complex floating point arrays powered by Intel(R) Math Kernel Library Vector (Intel(R) MKL) and Intel(R) Short Vector Math Library (Intel(R) SVML) diff --git a/conda-recipe-cf/run_tests.bat b/conda-recipe-cf/run_tests.bat new file mode 100644 index 0000000..590db89 --- /dev/null +++ b/conda-recipe-cf/run_tests.bat @@ -0,0 +1 @@ +%PYTHON% tests\test_basic.py \ No newline at end of file diff --git a/conda-recipe-cf/run_tests.sh b/conda-recipe-cf/run_tests.sh new file mode 100644 index 0000000..7bfca5d --- /dev/null +++ b/conda-recipe-cf/run_tests.sh @@ -0,0 +1 @@ +$PYTHON tests/test_basic.py diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index ec41497..dcafd45 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -17,7 +17,7 @@ requirements: build: - {{ compiler('c') }} - {{ compiler('cxx') }} - - {{ compiler('dpcpp') }} >=2023.2 # [not osx] + - {{ compiler('dpcpp') }} >=2024.2 # [not osx] - sysroot_linux-64 >=2.28 # [linux] host: - setuptools From aa3876fc4616e0534d3e406edb5484e031e02737 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 18 Sep 2024 09:08:49 -0500 Subject: [PATCH 27/38] Fix issue with Windows build/test steps --- .github/workflows/conda-package.yml | 56 ++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index 0eefa64..4d81eb2 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -20,6 +20,7 @@ jobs: - name: Set pkgs_dirs run: | echo "pkgs_dirs: [~/.conda/pkgs]" >> ~/.condarc + - name: Cache conda packages uses: actions/cache@v3 env: @@ -34,24 +35,28 @@ jobs: - name: Add conda to system path run: echo $CONDA/bin >> $GITHUB_PATH + - name: Install conda-build run: conda install conda-build + - name: Build conda package run: | CHANNELS="-c conda-forge -c https://software.repos.intel.com/python/conda --override-channels" VERSIONS="--python ${{ matrix.python }}" TEST="--no-test" + echo "CONDA_BLD=${CONDA}/conda-bld/linux-64" >> $GITHUB_ENV conda build \ $TEST \ $VERSIONS \ $CHANNELS \ conda-recipe-cf + - name: Upload artifact uses: actions/upload-artifact@v3 with: name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }} - path: /usr/share/miniconda/conda-bld/linux-64/${{ env.PACKAGE_NAME }}-*.tar.bz2 + path: ${{ env.CONDA_BLD }}/${{ env.PACKAGE_NAME }}-*.tar.bz2 test: needs: build @@ -89,9 +94,11 @@ jobs: conda create -n test_mkl_umath $PACKAGE_NAME python=${{ matrix.python }} $CHANNELS --only-deps --dry-run > lockfile - name: Display lockfile run: cat lockfile + - name: Set pkgs_dirs run: | echo "pkgs_dirs: [~/.conda/pkgs]" >> ~/.condarc + - name: Cache conda packages uses: actions/cache@v3 env: @@ -110,6 +117,7 @@ jobs: conda create -n test_mkl_umath python=${{ matrix.python }} $PACKAGE_NAME pytest $CHANNELS # Test installed packages conda list -n test_mkl_umath + - name: Run tests run: | source $CONDA/etc/profile.d/conda.sh @@ -128,11 +136,13 @@ jobs: - uses: actions/checkout@v3 with: fetch-depth: 0 - - uses: conda-incubator/setup-miniconda@v2 + + - uses: conda-incubator/setup-miniconda@v3 with: - auto-activate-base: true - conda-build-version: "*" - activate-environment: true + 
miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: build + channels: conda-forge python-version: ${{ matrix.python }} - name: Cache conda packages @@ -146,8 +156,23 @@ jobs: restore-keys: | ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-python-${{ matrix.python }}- ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}- + + - name: Store conda paths as envs + shell: bash -l {0} + run: | + echo "CONDA_BLD=$CONDA/conda-bld/win-64/" | tr "\\\\" '/' >> $GITHUB_ENV + + - name: Install conda build + run: | + conda activate + conda install -y conda-build + conda list -n base + - name: Build conda package - run: conda build --no-test --python ${{ matrix.python }} -c conda-forge -c https://software.repos.intel.com/python/conda --override-channels conda-recipe-cf + run: | + conda activate + conda build --no-test --python ${{ matrix.python }} -c conda-forge -c https://software.repos.intel.com/python/conda --override-channels conda-recipe-cf + - name: Upload artifact uses: actions/upload-artifact@v3 with: @@ -175,13 +200,14 @@ jobs: uses: actions/download-artifact@v3 with: name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }} - - uses: conda-incubator/setup-miniconda@v2 + - uses: conda-incubator/setup-miniconda@v3 with: - auto-update-conda: true - conda-build-version: '*' - miniconda-version: 'latest' - activate-environment: mkl_umath_test + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: build + channels: conda-forge python-version: ${{ matrix.python }} + - name: Create conda channel with the artifact bit shell: cmd /C CALL {0} run: | @@ -189,13 +215,17 @@ jobs: mkdir ${{ env.workdir }}\channel\win-64 move ${{ env.PACKAGE_NAME }}-*.tar.bz2 ${{ env.workdir }}\channel\win-64 dir ${{ env.workdir }}\channel\win-64 + - name: Index the channel shell: cmd /C CALL {0} - run: conda index ${{ env.workdir }}\channel + run: | + conda activate + conda index ${{ env.workdir }}\channel - name: Dump mkl_umath version info from created channel into ver.json shell: cmd /C CALL {0} run: | + conda activate conda search ${{ env.PACKAGE_NAME }} -c ${{ env.workdir }}/channel --override-channels --info --json > ${{ env.workdir }}\ver.json - name: Output content of produced ver.json shell: pwsh @@ -210,6 +240,7 @@ jobs: FOR /F "tokens=* USEBACKQ" %%F IN (`python -c "%SCRIPT%"`) DO ( SET PACKAGE_VERSION=%%F ) + conda activate conda install -n mkl_umath_test ${{ env.PACKAGE_NAME }}=%PACKAGE_VERSION% python=${{ matrix.python }} -c ${{ env.workdir }}/channel ${{ env.CHANNELS }} --only-deps --dry-run > lockfile - name: Display lockfile content shell: pwsh @@ -241,6 +272,7 @@ jobs: - name: Report content of test environment shell: cmd /C CALL {0} run: | + conda activate echo "Value of CONDA enviroment variable was: " %CONDA% echo "Value of CONDA_PREFIX enviroment variable was: " %CONDA_PREFIX% conda info && conda list -n mkl_umath_test From 3025b1bad9094e2142c3d06fa014a5295dabb4f5 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 18 Sep 2024 15:15:02 -0500 Subject: [PATCH 28/38] Use Windows-2019 container over windows-latest --- .github/workflows/conda-package.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index 4d81eb2..a4ac110 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -125,7 +125,7 @@ jobs: python -c "import mkl_umath, numpy as np; mkl_umath.use_in_numpy(); np.sin(np.linspace(0, 1, 
num=10**6));" build_windows: - runs-on: windows-latest + runs-on: windows-2019 strategy: matrix: @@ -189,7 +189,7 @@ jobs: matrix: python: ['3.10', '3.11', '3.12'] experimental: [false] - runner: [windows-latest] + runner: [windows-2019] continue-on-error: ${{ matrix.experimental }} env: workdir: '${{ github.workspace }}' From a98eae4c72c42911f87d8530f1a2818eca196461 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 23 Sep 2024 15:20:22 -0500 Subject: [PATCH 29/38] Add CODEOWNERS file --- .github/CODEOWNERS | 1 + 1 file changed, 1 insertion(+) create mode 100644 .github/CODEOWNERS diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..05c1669 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +* @oleksandr-pavlyk @xaleryb @ekomarova From 2b584cf29114d7d704911a37fd8c642c4e2b3e15 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 23 Sep 2024 15:20:32 -0500 Subject: [PATCH 30/38] Add dependabot file --- .github/dependabot.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..5ace460 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" From 8ff6665d087c77e72c9c012cc7ded482357e8689 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 23 Sep 2024 15:20:47 -0500 Subject: [PATCH 31/38] Add OpenSSF scorecard workflow --- .github/workflows/openssf-scorecard.yml | 74 +++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 .github/workflows/openssf-scorecard.yml diff --git a/.github/workflows/openssf-scorecard.yml b/.github/workflows/openssf-scorecard.yml new file mode 100644 index 0000000..586f7bc --- /dev/null +++ b/.github/workflows/openssf-scorecard.yml @@ -0,0 +1,74 @@ +# This workflow uses actions that are not certified by GitHub. They are provided +# by a third-party and are governed by separate terms of service, privacy +# policy, and support documentation. + +name: Scorecard supply-chain security +on: + # For Branch-Protection check. Only the default branch is supported. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection + branch_protection_rule: + # To guarantee Maintained check is occasionally updated. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained + schedule: + - cron: '28 2 * * 1' + - cron: '28 2 * * 4' + push: + branches: [ "master" ] + +# Declare default permissions as read only. +permissions: read-all + +jobs: + analysis: + name: Scorecard analysis + runs-on: ubuntu-latest + timeout-minutes: 30 + permissions: + # Needed to upload the results to code-scanning dashboard. + security-events: write + # Needed to publish results and get a badge (see publish_results below). + id-token: write + # Uncomment the permissions below if installing in a private repository. + # contents: read + # actions: read + + steps: + - name: "Checkout code" + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + persist-credentials: false + + - name: "Run analysis" + uses: ossf/scorecard-action@62b2cac7ed8198b15735ed49ab1e5cf35480ba46 # v2.4.0 + with: + results_file: results.sarif + results_format: sarif + # (Optional) "write" PAT token. 
Uncomment the `repo_token` line below if: + # - you want to enable the Branch-Protection check on a *public* repository, or + # - you are installing Scorecard on a *private* repository + # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat. + # repo_token: ${{ secrets.SCORECARD_TOKEN }} + + # Public repositories: + # - Publish results to OpenSSF REST API for easy access by consumers + # - Allows the repository to include the Scorecard badge. + # - See https://github.com/ossf/scorecard-action#publishing-results. + # For private repositories: + # - `publish_results` will always be set to `false`, regardless + # of the value entered here. + publish_results: true + + # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF + # format to the repository Actions tab. + - name: "Upload artifact" + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + with: + name: SARIF file + path: results.sarif + retention-days: 14 + + # Upload the results to GitHub's code scanning dashboard. + - name: "Upload to code-scanning" + uses: github/codeql-action/upload-sarif@294a9d92911152fe08befb9ec03e240add280cb3 # v3.26.8 + with: + sarif_file: results.sarif From a2d618299e1afdd3276d5fec2dff8fecc8a409cb Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 23 Sep 2024 16:14:01 -0500 Subject: [PATCH 32/38] Fixed upload of Windows build artifact --- .github/workflows/conda-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index a4ac110..6f96bfa 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -177,7 +177,7 @@ jobs: uses: actions/upload-artifact@v3 with: name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }} - path: ${{ env.conda-bld }}${{ env.PACKAGE_NAME }}-*.tar.bz2 + path: ${{ env.CONDA_BLD }}${{ env.PACKAGE_NAME }}-*.tar.bz2 test_windows: needs: build_windows From 5270b7c9ad6258616688273a9e56081fb0b7d16d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 23 Sep 2024 16:46:05 -0500 Subject: [PATCH 33/38] Add a step to output content of workdir --- .github/workflows/conda-package.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index 6f96bfa..f4a7206 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -222,11 +222,19 @@ jobs: conda activate conda index ${{ env.workdir }}\channel + - name: Dump mkl_umath version info from created channel to STDOUT + shell: cmd /C CALL {0} + run: | + conda activate + conda search ${{ env.PACKAGE_NAME }} -c ${{ env.workdir }}/channel --override-channels --info --json - name: Dump mkl_umath version info from created channel into ver.json shell: cmd /C CALL {0} run: | conda activate conda search ${{ env.PACKAGE_NAME }} -c ${{ env.workdir }}/channel --override-channels --info --json > ${{ env.workdir }}\ver.json + - name: Output content of workdir + shell: pwsh + run: Get-ChildItem -Path ${{ env.workdir }} - name: Output content of produced ver.json shell: pwsh run: Get-Content -Path ${{ env.workdir }}\ver.json From ff3bc26a698b1b0f65bc4469c635d33b7a675d15 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 23 Sep 2024 17:24:58 -0500 Subject: [PATCH 34/38] Add SECURITY.md file --- SECURITY.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) create 
mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..556938b --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,12 @@ +# Security Policy + +## Report a Vulnerability + +Please report security issues or vulnerabilities to the [Intel® Security Center]. + +For more information on how Intel® works to resolve security issues, see +[Vulnerability Handling Guidelines]. + +[Intel® Security Center]:https://www.intel.com/content/www/us/en/security-center/default.html + +[Vulnerability Handling Guidelines]:https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html From 535e68a37ad4ba7d882ad7198d0f7dee324daf2f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 24 Sep 2024 07:24:36 -0500 Subject: [PATCH 35/38] Attempt to fix test step for Windows --- .github/workflows/conda-package.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index f4a7206..6a25b70 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -200,11 +200,14 @@ jobs: uses: actions/download-artifact@v3 with: name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }} + - uses: conda-incubator/setup-miniconda@v3 with: + auto-update-conda: true + conda-build-version: '*' miniforge-variant: Miniforge3 miniforge-version: latest - activate-environment: build + activate-environment: mkl_umath_test channels: conda-forge python-version: ${{ matrix.python }} @@ -219,18 +222,15 @@ jobs: - name: Index the channel shell: cmd /C CALL {0} run: | - conda activate conda index ${{ env.workdir }}\channel - name: Dump mkl_umath version info from created channel to STDOUT shell: cmd /C CALL {0} run: | - conda activate conda search ${{ env.PACKAGE_NAME }} -c ${{ env.workdir }}/channel --override-channels --info --json - name: Dump mkl_umath version info from created channel into ver.json shell: cmd /C CALL {0} run: | - conda activate conda search ${{ env.PACKAGE_NAME }} -c ${{ env.workdir }}/channel --override-channels --info --json > ${{ env.workdir }}\ver.json - name: Output content of workdir shell: pwsh @@ -248,7 +248,6 @@ jobs: FOR /F "tokens=* USEBACKQ" %%F IN (`python -c "%SCRIPT%"`) DO ( SET PACKAGE_VERSION=%%F ) - conda activate conda install -n mkl_umath_test ${{ env.PACKAGE_NAME }}=%PACKAGE_VERSION% python=${{ matrix.python }} -c ${{ env.workdir }}/channel ${{ env.CHANNELS }} --only-deps --dry-run > lockfile - name: Display lockfile content shell: pwsh From 030e1620b0d567502ce8ebc3be4d93f148b09644 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 24 Sep 2024 08:22:49 -0500 Subject: [PATCH 36/38] Replace use of "-c intel" in the README. --- README.md | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 006aa88..0e2bd0b 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Patches were factored out per community feedback ([NEP-36](https://numpy.org/nep as a stand-alone package. It can be installed into conda environment using ``` - conda install -c intel mkl_umath + conda install -c https://software.repos.intel.com/python/conda mkl_umath ``` --- @@ -18,17 +18,9 @@ as a stand-alone package. 
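Once installed (see the commands below), the MKL-backed loops can be swapped into NumPy at runtime via `mkl_umath.use_in_numpy()`. A minimal sketch, mirroring the smoke test used in this repository's CI workflow:

```python
import numpy as np
import mkl_umath

mkl_umath.use_in_numpy()              # route supported NumPy ufunc loops through MKL/SVML
np.sin(np.linspace(0, 1, num=10**6))  # now evaluated by the mkl_umath kernels
```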
It can be installed into conda environment using To install mkl_umath Pypi package please use following command: ``` - python -m pip install --i https://pypi.anaconda.org/intel/simple -extra-index-url https://pypi.org/simple mkl_umath + python -m pip install mkl_umath ``` -If command above installs NumPy package from the Pypi, please use following command to install Intel optimized NumPy wheel package from Anaconda Cloud: - -``` - python -m pip install --i https://pypi.anaconda.org/intel/simple -extra-index-url https://pypi.org/simple mkl_umath numpy== -``` - -Where `` should be the latest version from https://anaconda.org/intel/numpy - --- ## Building From 6c8c1dbda2cd77df3730639c7ef84c151f6bf5e7 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 24 Sep 2024 09:43:50 -0500 Subject: [PATCH 37/38] Bump up versions of actions per GH warnings --- .github/workflows/conda-package.yml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index 6a25b70..0c5bd0b 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -13,7 +13,7 @@ jobs: matrix: python: ['3.10', '3.11', '3.12'] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4.1.7 with: fetch-depth: 0 @@ -22,7 +22,7 @@ jobs: echo "pkgs_dirs: [~/.conda/pkgs]" >> ~/.condarc - name: Cache conda packages - uses: actions/cache@v3 + uses: actions/cache@v4 env: CACHE_NUMBER: 0 # Increase to reset cache with: @@ -53,7 +53,7 @@ jobs: conda-recipe-cf - name: Upload artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4.4.0 with: name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }} path: ${{ env.CONDA_BLD }}/${{ env.PACKAGE_NAME }}-*.tar.bz2 @@ -73,7 +73,7 @@ jobs: steps: - name: Download artifact - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }} - name: Add conda to system path @@ -100,7 +100,7 @@ jobs: echo "pkgs_dirs: [~/.conda/pkgs]" >> ~/.condarc - name: Cache conda packages - uses: actions/cache@v3 + uses: actions/cache@v4 env: CACHE_NUMBER: 0 # Increase to reset cache with: @@ -133,7 +133,7 @@ jobs: env: conda-bld: C:\Miniconda\conda-bld\win-64\ steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4.1.7 with: fetch-depth: 0 @@ -146,7 +146,7 @@ jobs: python-version: ${{ matrix.python }} - name: Cache conda packages - uses: actions/cache@v3 + uses: actions/cache@v4 env: CACHE_NUMBER: 3 # Increase to reset cache with: @@ -174,7 +174,7 @@ jobs: conda build --no-test --python ${{ matrix.python }} -c conda-forge -c https://software.repos.intel.com/python/conda --override-channels conda-recipe-cf - name: Upload artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4.4.0 with: name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }} path: ${{ env.CONDA_BLD }}${{ env.PACKAGE_NAME }}-*.tar.bz2 @@ -197,7 +197,7 @@ jobs: steps: - name: Download artifact - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }} @@ -253,7 +253,7 @@ jobs: shell: pwsh run: Get-Content -Path .\lockfile - name: Cache conda packages - uses: actions/cache@v3 + uses: actions/cache@v4 env: CACHE_NUMBER: 0 # Increase to reset cache with: From 6e73a4cee03ed70bbe9faf169d7c710899b6caca Mon Sep 17 00:00:00 2001 From: Oleksandr 
Pavlyk Date: Tue, 24 Sep 2024 09:46:02 -0500 Subject: [PATCH 38/38] Fix build.sh per review comment --- conda-recipe-cf/build.sh | 6 +----- conda-recipe/build.sh | 6 +----- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/conda-recipe-cf/build.sh b/conda-recipe-cf/build.sh index fc4459c..2792f27 100644 --- a/conda-recipe-cf/build.sh +++ b/conda-recipe-cf/build.sh @@ -10,11 +10,7 @@ SKBUILD_ARGS="-- -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON" if [ -n "${WHEELS_OUTPUT_FOLDER}" ]; then # Install packages and assemble wheel package from built bits - if [ "$CONDA_PY" == "36" ]; then - WHEELS_BUILD_ARGS="-p manylinux1_x86_64" - else - WHEELS_BUILD_ARGS="-p manylinux2014_x86_64" - fi + WHEELS_BUILD_ARGS="-p manylinux_${GLIBC_MAJOR}_${GLIBC_MINOR}_x86_64" ${PYTHON} setup.py install bdist_wheel ${WHEELS_BUILD_ARGS} ${SKBUILD_ARGS} cp dist/mkl_umath*.whl ${WHEELS_OUTPUT_FOLDER} else diff --git a/conda-recipe/build.sh b/conda-recipe/build.sh index fc4459c..2792f27 100644 --- a/conda-recipe/build.sh +++ b/conda-recipe/build.sh @@ -10,11 +10,7 @@ SKBUILD_ARGS="-- -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON" if [ -n "${WHEELS_OUTPUT_FOLDER}" ]; then # Install packages and assemble wheel package from built bits - if [ "$CONDA_PY" == "36" ]; then - WHEELS_BUILD_ARGS="-p manylinux1_x86_64" - else - WHEELS_BUILD_ARGS="-p manylinux2014_x86_64" - fi + WHEELS_BUILD_ARGS="-p manylinux_${GLIBC_MAJOR}_${GLIBC_MINOR}_x86_64" ${PYTHON} setup.py install bdist_wheel ${WHEELS_BUILD_ARGS} ${SKBUILD_ARGS} cp dist/mkl_umath*.whl ${WHEELS_OUTPUT_FOLDER} else
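# Note on the platform tag above: PEP 600 "perennial manylinux" wheels are named
# manylinux_<glibc major>_<glibc minor>_<arch> (e.g. manylinux_2_28_x86_64), which
# replaces the old CONDA_PY-based choice between manylinux1 and manylinux2014.
# GLIBC_MAJOR and GLIBC_MINOR are assumed to be exported by the surrounding
# conda-build environment.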