%PDF- %PDF-
Direktori : /backups/router/usr/local/lib/python3.11/site-packages/numexpr/ |
Current File : //backups/router/usr/local/lib/python3.11/site-packages/numexpr/interpreter.cpp |
/********************************************************************* Numexpr - Fast numerical array expression evaluator for NumPy. License: MIT Author: See AUTHORS.txt See LICENSE.txt for details about copyright and rights to use. **********************************************************************/ #include "module.hpp" #include <numpy/npy_cpu.h> #include <math.h> #include <string.h> #include <assert.h> #include <vector> #include "numexpr_config.hpp" #include "complex_functions.hpp" #include "interpreter.hpp" #include "numexpr_object.hpp" #ifdef _MSC_VER /* Some missing symbols and functions for Win */ #define fmax max #define fmin min #define NE_INFINITY (DBL_MAX+DBL_MAX) #define NE_NAN (INFINITY-INFINITY) #else #define NE_INFINITY INFINITY #define NE_NAN NAN #endif #ifndef SIZE_MAX #define SIZE_MAX ((size_t)-1) #endif #define RETURN_TYPE char* // AVAILABLE(Haystack, Haystack_Len, J, Needle_Len) // A macro that returns nonzero if there are at least Needle_Len // bytes left starting at Haystack[J]. // Haystack is 'unsigned char *', Haystack_Len, J, and Needle_Len // are 'size_t'; Haystack_Len is an lvalue. For NUL-terminated // searches, Haystack_Len can be modified each iteration to avoid // having to compute the end of Haystack up front. #define AVAILABLE(Haystack, Haystack_Len, J, Needle_Len) \ ((Haystack_Len) >= (J) + (Needle_Len)) // To allow building with NumPy<2 locally define the new NumPy macros: #if NPY_ABI_VERSION < 0x02000000 #define PyDataType_ELSIZE(descr) ((descr)->elsize) #define PyDataType_SET_ELSIZE(descr, size) (descr)->elsize = size #endif #include "str-two-way.hpp" #ifdef DEBUG #define DEBUG_TEST 1 #else #define DEBUG_TEST 0 #endif using namespace std; // Global state thread_data th_params; /* This file and interp_body should really be generated from a description of the opcodes -- there's too much repetition here for manually editing */ /* bit of a misnomer; includes the return value. */ #define NUMEXPR_MAX_ARGS 4 static char op_signature_table[][NUMEXPR_MAX_ARGS] = { #define Tb 'b' #define Ti 'i' #define Tl 'l' #define Tf 'f' #define Td 'd' #define Tc 'c' #define Ts 's' #define Tn 'n' #define T0 0 #define OPCODE(n, e, ex, rt, a1, a2, a3) {rt, a1, a2, a3}, #include "opcodes.hpp" #undef OPCODE #undef Tb #undef Ti #undef Tl #undef Tf #undef Td #undef Tc #undef Ts #undef Tn #undef T0 }; /* returns the sig of the nth op, '\0' if no more ops -1 on failure */ static int op_signature(int op, unsigned int n) { if (n >= NUMEXPR_MAX_ARGS) { return 0; } if (op < 0 || op > OP_END) { return -1; } return op_signature_table[op][n]; } /* To add a function to the lookup table, add to FUNC_CODES (first group is 1-arg functions, second is 2-arg functions), also to functions_f or functions_ff as appropriate. Finally, use add_func down below to add to funccodes. Functions with more arguments aren't implemented at present, but should be easy; just copy the 1- or 2-arg case. Some functions (for example, sqrt) are repeated in this table that are opcodes, but there's no problem with that as the compiler selects opcodes over functions, and this makes it easier to compare opcode vs. function speeds. */ typedef float (*FuncFFPtr)(float); #ifdef _WIN32 FuncFFPtr functions_ff[] = { #define FUNC_FF(fop, s, f, f_win32, ...) f_win32, #include "functions.hpp" #undef FUNC_FF }; #else FuncFFPtr functions_ff[] = { #define FUNC_FF(fop, s, f, ...) f, #include "functions.hpp" #undef FUNC_FF }; #endif #ifdef USE_VML /* Fake vsConj function just for casting purposes inside numexpr */ static void vsConj(MKL_INT n, const float* x1, float* dest) { MKL_INT j; for (j=0; j<n; j++) { dest[j] = x1[j]; }; }; #endif #ifdef USE_VML typedef void (*FuncFFPtr_vml)(MKL_INT, const float*, float*); FuncFFPtr_vml functions_ff_vml[] = { #define FUNC_FF(fop, s, f, f_win32, f_vml) f_vml, #include "functions.hpp" #undef FUNC_FF }; #endif typedef float (*FuncFFFPtr)(float, float); #ifdef _WIN32 FuncFFFPtr functions_fff[] = { #define FUNC_FFF(fop, s, f, f_win32, ...) f_win32, #include "functions.hpp" #undef FUNC_FFF }; #else FuncFFFPtr functions_fff[] = { #define FUNC_FFF(fop, s, f, ...) f, #include "functions.hpp" #undef FUNC_FFF }; #endif #ifdef USE_VML /* fmod not available in VML */ static void vsfmod(MKL_INT n, const float* x1, const float* x2, float* dest) { MKL_INT j; for(j=0; j < n; j++) { dest[j] = fmod(x1[j], x2[j]); }; }; typedef void (*FuncFFFPtr_vml)(MKL_INT, const float*, const float*, float*); FuncFFFPtr_vml functions_fff_vml[] = { #define FUNC_FFF(fop, s, f, f_win32, f_vml) f_vml, #include "functions.hpp" #undef FUNC_FFF }; #endif typedef double (*FuncDDPtr)(double); FuncDDPtr functions_dd[] = { #define FUNC_DD(fop, s, f, ...) f, #include "functions.hpp" #undef FUNC_DD }; #ifdef USE_VML /* Fake vdConj function just for casting purposes inside numexpr */ static void vdConj(MKL_INT n, const double* x1, double* dest) { MKL_INT j; for (j=0; j<n; j++) { dest[j] = x1[j]; }; }; #endif #ifdef USE_VML typedef void (*FuncDDPtr_vml)(MKL_INT, const double*, double*); FuncDDPtr_vml functions_dd_vml[] = { #define FUNC_DD(fop, s, f, f_vml) f_vml, #include "functions.hpp" #undef FUNC_DD }; #endif typedef double (*FuncDDDPtr)(double, double); FuncDDDPtr functions_ddd[] = { #define FUNC_DDD(fop, s, f, ...) f, #include "functions.hpp" #undef FUNC_DDD }; #ifdef USE_VML /* fmod not available in VML */ static void vdfmod(MKL_INT n, const double* x1, const double* x2, double* dest) { MKL_INT j; for(j=0; j < n; j++) { dest[j] = fmod(x1[j], x2[j]); }; }; typedef void (*FuncDDDPtr_vml)(MKL_INT, const double*, const double*, double*); FuncDDDPtr_vml functions_ddd_vml[] = { #define FUNC_DDD(fop, s, f, f_vml) f_vml, #include "functions.hpp" #undef FUNC_DDD }; #endif typedef void (*FuncCCPtr)(std::complex<double>*, std::complex<double>*); FuncCCPtr functions_cc[] = { #define FUNC_CC(fop, s, f, ...) f, #include "functions.hpp" #undef FUNC_CC }; #ifdef USE_VML /* complex expm1 not available in VML */ static void vzExpm1(MKL_INT n, const MKL_Complex16* x1, MKL_Complex16* dest) { MKL_INT j; vzExp(n, x1, dest); for (j=0; j<n; j++) { dest[j].real -= 1.0; }; }; static void vzLog1p(MKL_INT n, const MKL_Complex16* x1, MKL_Complex16* dest) { MKL_INT j; for (j=0; j<n; j++) { dest[j].real = x1[j].real + 1; dest[j].imag = x1[j].imag; }; vzLn(n, dest, dest); }; /* Use this instead of native vzAbs in VML as it seems to work badly */ static void vzAbs_(MKL_INT n, const MKL_Complex16* x1, MKL_Complex16* dest) { MKL_INT j; for (j=0; j<n; j++) { dest[j].real = sqrt(x1[j].real*x1[j].real + x1[j].imag*x1[j].imag); dest[j].imag = 0; }; }; typedef void (*FuncCCPtr_vml)(MKL_INT, const MKL_Complex16[], MKL_Complex16[]); FuncCCPtr_vml functions_cc_vml[] = { #define FUNC_CC(fop, s, f, f_vml) f_vml, #include "functions.hpp" #undef FUNC_CC }; #endif typedef void (*FuncCCCPtr)(std::complex<double>*, std::complex<double>*, std::complex<double>*); FuncCCCPtr functions_ccc[] = { #define FUNC_CCC(fop, s, f) f, #include "functions.hpp" #undef FUNC_CCC }; char get_return_sig(PyObject* program) { int sig; char last_opcode; Py_ssize_t end = PyBytes_Size(program); char *program_str = PyBytes_AS_STRING(program); do { end -= 4; if (end < 0) return 'X'; last_opcode = program_str[end]; } while (last_opcode == OP_NOOP); sig = op_signature(last_opcode, 0); if (sig <= 0) { return 'X'; } else { return (char)sig; } } static int typecode_from_char(char c) { switch (c) { case 'b': return NPY_BOOL; case 'i': return NPY_INT; case 'l': return NPY_LONGLONG; case 'f': return NPY_FLOAT; case 'd': return NPY_DOUBLE; case 'c': return NPY_CDOUBLE; case 's': return NPY_STRING; default: PyErr_SetString(PyExc_TypeError, "signature value not in 'bilfdcs'"); return -1; } } static int last_opcode(PyObject *program_object) { Py_ssize_t n; unsigned char *program; PyBytes_AsStringAndSize(program_object, (char **)&program, &n); return program[n-4]; } static int get_reduction_axis(PyObject* program) { Py_ssize_t end = PyBytes_Size(program); int axis = ((unsigned char *)PyBytes_AS_STRING(program))[end-1]; if (axis != 255 && axis >= NPY_MAXDIMS) axis = NPY_MAXDIMS - axis; return axis; } int check_program(NumExprObject *self) { unsigned char *program; Py_ssize_t prog_len, n_buffers, n_inputs; int pc, arg, argloc, argno, sig; char *fullsig, *signature; if (PyBytes_AsStringAndSize(self->program, (char **)&program, &prog_len) < 0) { PyErr_Format(PyExc_RuntimeError, "invalid program: can't read program"); return -1; } if (prog_len % 4 != 0) { PyErr_Format(PyExc_RuntimeError, "invalid program: prog_len mod 4 != 0"); return -1; } if (PyBytes_AsStringAndSize(self->fullsig, (char **)&fullsig, &n_buffers) < 0) { PyErr_Format(PyExc_RuntimeError, "invalid program: can't read fullsig"); return -1; } if (PyBytes_AsStringAndSize(self->signature, (char **)&signature, &n_inputs) < 0) { PyErr_Format(PyExc_RuntimeError, "invalid program: can't read signature"); return -1; } if (n_buffers > 255) { PyErr_Format(PyExc_RuntimeError, "invalid program: too many buffers"); return -1; } for (pc = 0; pc < prog_len; pc += 4) { unsigned int op = program[pc]; if (op == OP_NOOP) { continue; } if ((op >= OP_REDUCTION) && pc != prog_len-4) { PyErr_Format(PyExc_RuntimeError, "invalid program: reduction operations must occur last"); return -1; } for (argno = 0; ; argno++) { sig = op_signature(op, argno); if (sig == -1) { PyErr_Format(PyExc_RuntimeError, "invalid program: illegal opcode at %i (%d)", pc, op); return -1; } if (sig == 0) break; if (argno < 3) { argloc = pc+argno+1; } if (argno >= 3) { if (pc + 1 >= prog_len) { PyErr_Format(PyExc_RuntimeError, "invalid program: double opcode (%c) at end (%i)", pc, sig); return -1; } argloc = pc+argno+2; } arg = program[argloc]; if (sig != 'n' && ((arg >= n_buffers) || (arg < 0))) { PyErr_Format(PyExc_RuntimeError, "invalid program: buffer out of range (%i) at %i", arg, argloc); return -1; } if (sig == 'n') { if (op == OP_FUNC_FFN) { if (arg < 0 || arg >= FUNC_FF_LAST) { PyErr_Format(PyExc_RuntimeError, "invalid program: funccode out of range (%i) at %i", arg, argloc); return -1; } } else if (op == OP_FUNC_FFFN) { if (arg < 0 || arg >= FUNC_FFF_LAST) { PyErr_Format(PyExc_RuntimeError, "invalid program: funccode out of range (%i) at %i", arg, argloc); return -1; } } else if (op == OP_FUNC_DDN) { if (arg < 0 || arg >= FUNC_DD_LAST) { PyErr_Format(PyExc_RuntimeError, "invalid program: funccode out of range (%i) at %i", arg, argloc); return -1; } } else if (op == OP_FUNC_DDDN) { if (arg < 0 || arg >= FUNC_DDD_LAST) { PyErr_Format(PyExc_RuntimeError, "invalid program: funccode out of range (%i) at %i", arg, argloc); return -1; } } else if (op == OP_FUNC_CCN) { if (arg < 0 || arg >= FUNC_CC_LAST) { PyErr_Format(PyExc_RuntimeError, "invalid program: funccode out of range (%i) at %i", arg, argloc); return -1; } } else if (op == OP_FUNC_CCCN) { if (arg < 0 || arg >= FUNC_CCC_LAST) { PyErr_Format(PyExc_RuntimeError, "invalid program: funccode out of range (%i) at %i", arg, argloc); return -1; } } else if (op >= OP_REDUCTION) { ; } else { PyErr_Format(PyExc_RuntimeError, "invalid program: internal checker error processing %i", argloc); return -1; } /* The next is to avoid problems with the ('i','l') duality, specially in 64-bit platforms */ } else if (((sig == 'l') && (fullsig[arg] == 'i')) || ((sig == 'i') && (fullsig[arg] == 'l'))) { ; } else if (sig != fullsig[arg]) { PyErr_Format(PyExc_RuntimeError, "invalid : opcode signature doesn't match buffer (%c vs %c) at %i", sig, fullsig[arg], argloc); return -1; } } } return 0; } struct index_data { int count; int size; int findex; npy_intp *shape; npy_intp *strides; int *index; char *buffer; }; // BOUNDS_CHECK is used in interp_body.cpp #define DO_BOUNDS_CHECK 1 #if DO_BOUNDS_CHECK #define BOUNDS_CHECK(arg) if ((arg) >= params.r_end) { \ *pc_error = pc; \ return -2; \ } #else #define BOUNDS_CHECK(arg) #endif int stringcmp(const char *s1, const char *s2, npy_intp maxlen1, npy_intp maxlen2) { npy_intp maxlen, nextpos; /* Point to this when the end of a string is found, to simulate infinte trailing NULL characters. */ const char null = 0; // First check if some of the operands is the empty string and if so, // just check that the first char of the other is the NULL one. // Fixes #121 if (maxlen2 == 0) return *s1 != null; if (maxlen1 == 0) return *s2 != null; maxlen = (maxlen1 > maxlen2) ? maxlen1 : maxlen2; for (nextpos = 1; nextpos <= maxlen; nextpos++) { if (*s1 < *s2) return -1; if (*s1 > *s2) return +1; s1 = (nextpos >= maxlen1) ? &null : s1+1; s2 = (nextpos >= maxlen2) ? &null : s2+1; } return 0; } /* contains(str1, str2) function for string columns. Based on Newlib/strstr.c. */ int stringcontains(const char *haystack_start, const char *needle_start, npy_intp max_haystack_len, npy_intp max_needle_len) { // needle_len - Length of needle. // haystack_len - Known minimum length of haystack. size_t needle_len = (size_t)max_needle_len; size_t haystack_len = (size_t)max_haystack_len; const char *haystack = haystack_start; const char *needle = needle_start; bool ok = true; /* needle is prefix of haystack. */ char *res; size_t si = 0; size_t min_len = min(needle_len, haystack_len); while (*haystack && *needle && si < min_len) { ok &= *haystack++ == *needle++; si++; } /* check needle is prefix of haystack and calc needle length */ if (si == needle_len || *needle == 0) { if (ok) return 1; needle_len = si; } else { /* haystack less needle */ return 0; } /* calc haystack length */ while (*haystack && si < haystack_len) { haystack++; si++; } haystack_len = si; if (needle_len < LONG_NEEDLE_THRESHOLD) { res = two_way_short_needle((const unsigned char *)haystack_start, haystack_len, (const unsigned char *)needle_start, needle_len); } else { res = two_way_long_needle((const unsigned char *)haystack_start, haystack_len, (const unsigned char *)needle_start, needle_len); } return res != NULL ? 1 : 0; } /* Get space for VM temporary registers */ int get_temps_space(const vm_params& params, char **mem, size_t block_size) { int r, k = 1 + params.n_inputs + params.n_constants; for (r = k; r < k + params.n_temps; r++) { mem[r] = (char *)malloc(block_size * params.memsizes[r]); if (mem[r] == NULL) { return -1; } } return 0; } /* Free space for VM temporary registers */ void free_temps_space(const vm_params& params, char **mem) { int r, k = 1 + params.n_inputs + params.n_constants; for (r = k; r < k + params.n_temps; r++) { free(mem[r]); } } /* Serial/parallel task iterator version of the VM engine */ int vm_engine_iter_task(NpyIter *iter, npy_intp *memsteps, const vm_params& params, int *pc_error, char **errmsg) { char **mem = params.mem; NpyIter_IterNextFunc *iternext; npy_intp block_size, *size_ptr; char **iter_dataptr; npy_intp *iter_strides; iternext = NpyIter_GetIterNext(iter, errmsg); if (iternext == NULL) { return -1; } size_ptr = NpyIter_GetInnerLoopSizePtr(iter); iter_dataptr = NpyIter_GetDataPtrArray(iter); iter_strides = NpyIter_GetInnerStrideArray(iter); /* * First do all the blocks with a compile-time fixed size. * This makes a big difference (30-50% on some tests). */ block_size = *size_ptr; while (block_size == BLOCK_SIZE1) { #define REDUCTION_INNER_LOOP #define BLOCK_SIZE BLOCK_SIZE1 #include "interp_body.cpp" #undef BLOCK_SIZE #undef REDUCTION_INNER_LOOP iternext(iter); block_size = *size_ptr; } /* Then finish off the rest */ if (block_size > 0) do { #define REDUCTION_INNER_LOOP #define BLOCK_SIZE block_size #include "interp_body.cpp" #undef BLOCK_SIZE #undef REDUCTION_INNER_LOOP } while (iternext(iter)); return 0; } static int vm_engine_iter_outer_reduce_task(NpyIter *iter, npy_intp *memsteps, const vm_params& params, int *pc_error, char **errmsg) { char **mem = params.mem; NpyIter_IterNextFunc *iternext; npy_intp block_size, *size_ptr; char **iter_dataptr; npy_intp *iter_strides; iternext = NpyIter_GetIterNext(iter, errmsg); if (iternext == NULL) { return -1; } size_ptr = NpyIter_GetInnerLoopSizePtr(iter); iter_dataptr = NpyIter_GetDataPtrArray(iter); iter_strides = NpyIter_GetInnerStrideArray(iter); /* * First do all the blocks with a compile-time fixed size. * This makes a big difference (30-50% on some tests). */ block_size = *size_ptr; while (block_size == BLOCK_SIZE1) { #define BLOCK_SIZE BLOCK_SIZE1 #define NO_OUTPUT_BUFFERING // Because it's a reduction #include "interp_body.cpp" #undef NO_OUTPUT_BUFFERING #undef BLOCK_SIZE iternext(iter); block_size = *size_ptr; } /* Then finish off the rest */ if (block_size > 0) do { #define BLOCK_SIZE block_size #define NO_OUTPUT_BUFFERING // Because it's a reduction #include "interp_body.cpp" #undef NO_OUTPUT_BUFFERING #undef BLOCK_SIZE } while (iternext(iter)); return 0; } /* Parallel iterator version of VM engine */ static int vm_engine_iter_parallel(NpyIter *iter, const vm_params& params, bool need_output_buffering, int *pc_error, char **errmsg) { int i, ret = -1; npy_intp numblocks, taskfactor; if (errmsg == NULL) { return -1; } /* Ensure only one parallel job is running at a time (otherwise the global th_params get corrupted). */ Py_BEGIN_ALLOW_THREADS; pthread_mutex_lock(&gs.parallel_mutex); Py_END_ALLOW_THREADS; /* Populate parameters for worker threads */ NpyIter_GetIterIndexRange(iter, &th_params.start, &th_params.vlen); /* * Try to make it so each thread gets 16 tasks. This is a compromise * between 1 task per thread and one block per task. */ taskfactor = 16*BLOCK_SIZE1*gs.nthreads; numblocks = (th_params.vlen - th_params.start + taskfactor - 1) / taskfactor; th_params.block_size = numblocks * BLOCK_SIZE1; th_params.params = params; th_params.need_output_buffering = need_output_buffering; th_params.ret_code = 0; th_params.pc_error = pc_error; th_params.errmsg = errmsg; th_params.iter[0] = iter; /* Make one copy for each additional thread */ for (i = 1; i < gs.nthreads; ++i) { th_params.iter[i] = NpyIter_Copy(iter); if (th_params.iter[i] == NULL) { --i; for (; i > 0; --i) { NpyIter_Deallocate(th_params.iter[i]); } goto end; } } th_params.memsteps[0] = params.memsteps; /* Make one copy of memsteps for each additional thread */ for (i = 1; i < gs.nthreads; ++i) { th_params.memsteps[i] = PyMem_New(npy_intp, 1 + params.n_inputs + params.n_constants + params.n_temps); if (th_params.memsteps[i] == NULL) { --i; for (; i > 0; --i) { PyMem_Del(th_params.memsteps[i]); } for (i = 0; i < gs.nthreads; ++i) { NpyIter_Deallocate(th_params.iter[i]); } goto end; } memcpy(th_params.memsteps[i], th_params.memsteps[0], sizeof(npy_intp) * (1 + params.n_inputs + params.n_constants + params.n_temps)); } Py_BEGIN_ALLOW_THREADS; /* Synchronization point for all threads (wait for initialization) */ pthread_mutex_lock(&gs.count_threads_mutex); if (gs.count_threads < gs.nthreads) { gs.count_threads++; /* Beware of spurious wakeups. See issue pydata/numexpr#306. */ do { pthread_cond_wait(&gs.count_threads_cv, &gs.count_threads_mutex); } while (!gs.barrier_passed); } else { gs.barrier_passed = 1; pthread_cond_broadcast(&gs.count_threads_cv); } pthread_mutex_unlock(&gs.count_threads_mutex); /* Synchronization point for all threads (wait for finalization) */ pthread_mutex_lock(&gs.count_threads_mutex); if (gs.count_threads > 0) { gs.count_threads--; do { pthread_cond_wait(&gs.count_threads_cv, &gs.count_threads_mutex); } while (gs.barrier_passed); } else { gs.barrier_passed = 0; pthread_cond_broadcast(&gs.count_threads_cv); } pthread_mutex_unlock(&gs.count_threads_mutex); Py_END_ALLOW_THREADS; /* Deallocate all the iterator and memsteps copies */ for (i = 1; i < gs.nthreads; ++i) { NpyIter_Deallocate(th_params.iter[i]); PyMem_Del(th_params.memsteps[i]); } ret = th_params.ret_code; end: pthread_mutex_unlock(&gs.parallel_mutex); return ret; } static int run_interpreter(NumExprObject *self, NpyIter *iter, NpyIter *reduce_iter, bool reduction_outer_loop, bool need_output_buffering, int *pc_error) { int r; Py_ssize_t plen; vm_params params; char *errmsg = NULL; *pc_error = -1; if (PyBytes_AsStringAndSize(self->program, (char **)&(params.program), &plen) < 0) { return -1; } params.prog_len = (int)plen; params.output = NULL; params.inputs = NULL; params.index_data = NULL; params.n_inputs = self->n_inputs; params.n_constants = self->n_constants; params.n_temps = self->n_temps; params.mem = self->mem; params.memsteps = self->memsteps; params.memsizes = self->memsizes; params.r_end = (int)PyBytes_Size(self->fullsig); params.out_buffer = NULL; if ((gs.nthreads == 1) || gs.force_serial) { // Can do it as one "task" if (reduce_iter == NULL) { // Allocate memory for output buffering if needed vector<char> out_buffer(need_output_buffering ? (self->memsizes[0] * BLOCK_SIZE1) : 0); params.out_buffer = need_output_buffering ? &out_buffer[0] : NULL; // Reset the iterator to allocate its buffers if(NpyIter_Reset(iter, NULL) != NPY_SUCCEED) { return -1; } get_temps_space(params, params.mem, BLOCK_SIZE1); Py_BEGIN_ALLOW_THREADS; r = vm_engine_iter_task(iter, params.memsteps, params, pc_error, &errmsg); Py_END_ALLOW_THREADS; free_temps_space(params, params.mem); } else { if (reduction_outer_loop) { char **dataptr; NpyIter_IterNextFunc *iternext; dataptr = NpyIter_GetDataPtrArray(reduce_iter); iternext = NpyIter_GetIterNext(reduce_iter, NULL); if (iternext == NULL) { return -1; } get_temps_space(params, params.mem, BLOCK_SIZE1); Py_BEGIN_ALLOW_THREADS; do { r = NpyIter_ResetBasePointers(iter, dataptr, &errmsg); if (r >= 0) { r = vm_engine_iter_outer_reduce_task(iter, params.memsteps, params, pc_error, &errmsg); } if (r < 0) { break; } } while (iternext(reduce_iter)); Py_END_ALLOW_THREADS; free_temps_space(params, params.mem); } else { char **dataptr; NpyIter_IterNextFunc *iternext; dataptr = NpyIter_GetDataPtrArray(iter); iternext = NpyIter_GetIterNext(iter, NULL); if (iternext == NULL) { return -1; } get_temps_space(params, params.mem, BLOCK_SIZE1); Py_BEGIN_ALLOW_THREADS; do { r = NpyIter_ResetBasePointers(reduce_iter, dataptr, &errmsg); if (r >= 0) { r = vm_engine_iter_task(reduce_iter, params.memsteps, params, pc_error, &errmsg); } if (r < 0) { break; } } while (iternext(iter)); Py_END_ALLOW_THREADS; free_temps_space(params, params.mem); } } } else { if (reduce_iter == NULL) { r = vm_engine_iter_parallel(iter, params, need_output_buffering, pc_error, &errmsg); } else { errmsg = (char *) "Parallel engine doesn't support reduction yet"; r = -1; } } if (r < 0 && errmsg != NULL) { PyErr_SetString(PyExc_RuntimeError, errmsg); } return 0; } static int run_interpreter_const(NumExprObject *self, char *output, int *pc_error) { vm_params params; Py_ssize_t plen; char **mem; npy_intp *memsteps; *pc_error = -1; if (PyBytes_AsStringAndSize(self->program, (char **)&(params.program), &plen) < 0) { return -1; } if (self->n_inputs != 0) { return -1; } params.prog_len = (int)plen; params.output = output; params.inputs = NULL; params.index_data = NULL; params.n_inputs = self->n_inputs; params.n_constants = self->n_constants; params.n_temps = self->n_temps; params.mem = self->mem; memsteps = self->memsteps; params.memsizes = self->memsizes; params.r_end = (int)PyBytes_Size(self->fullsig); mem = params.mem; get_temps_space(params, mem, 1); #define SINGLE_ITEM_CONST_LOOP #define BLOCK_SIZE 1 #define NO_OUTPUT_BUFFERING // Because it's constant #include "interp_body.cpp" #undef NO_OUTPUT_BUFFERING #undef BLOCK_SIZE #undef SINGLE_ITEM_CONST_LOOP free_temps_space(params, mem); return 0; } PyObject * NumExpr_run(NumExprObject *self, PyObject *args, PyObject *kwds) { PyArrayObject *operands[NE_MAXARGS]; PyArray_Descr *dtypes[NE_MAXARGS], **dtypes_tmp; PyObject *tmp, *ret; npy_uint32 op_flags[NE_MAXARGS]; NPY_CASTING casting = NPY_SAFE_CASTING; NPY_ORDER order = NPY_KEEPORDER; unsigned int i, n_inputs; int r, pc_error = 0; int reduction_axis = -1; npy_intp reduction_size = -1; // For #277 change this 1 -> -1 to be in-line with NumPy 1.8, #ifdef USE_VML int ex_uses_vml = 0; #endif int is_reduction = 0; bool reduction_outer_loop = false, need_output_buffering = false, full_reduction = false; // To specify axes when doing a reduction int op_axes_values[NE_MAXARGS][NPY_MAXDIMS], op_axes_reduction_values[NE_MAXARGS]; int *op_axes_ptrs[NPY_MAXDIMS]; int oa_ndim = 0; int **op_axes = NULL; NpyIter *iter = NULL, *reduce_iter = NULL; // Check whether we need to restart threads if (!gs.init_threads_done || gs.pid != getpid()) { numexpr_set_nthreads(gs.nthreads); } // Don't force serial mode by default gs.force_serial = 0; // Check whether there's a reduction as the final step is_reduction = last_opcode(self->program) > OP_REDUCTION; n_inputs = (int)PyTuple_Size(args); if (PyBytes_Size(self->signature) != n_inputs) { return PyErr_Format(PyExc_ValueError, "number of inputs doesn't match program"); } else if (n_inputs+1 > NPY_MAXARGS) { return PyErr_Format(PyExc_ValueError, "too many inputs"); } memset(operands, 0, sizeof(operands)); memset(dtypes, 0, sizeof(dtypes)); if (kwds && PyDict_Size(kwds) > 0) { tmp = PyDict_GetItemString(kwds, "casting"); // borrowed ref if (tmp != NULL && !PyArray_CastingConverter(tmp, &casting)) { return NULL; } tmp = PyDict_GetItemString(kwds, "order"); // borrowed ref if (tmp != NULL && !PyArray_OrderConverter(tmp, &order)) { return NULL; } tmp = PyDict_GetItemString(kwds, "ex_uses_vml"); // borrowed ref if (tmp == NULL) { return PyErr_Format(PyExc_ValueError, "ex_uses_vml parameter is required"); } #ifdef USE_VML if (tmp == Py_True) { ex_uses_vml = 1; } #endif // borrowed ref operands[0] = (PyArrayObject *)PyDict_GetItemString(kwds, "out"); if (operands[0] != NULL) { if ((PyObject *)operands[0] == Py_None) { operands[0] = NULL; } else if (!PyArray_Check(operands[0])) { return PyErr_Format(PyExc_ValueError, "out keyword parameter is not an array"); } else { Py_INCREF(operands[0]); } } } for (i = 0; i < n_inputs; i++) { PyObject *o = PyTuple_GET_ITEM(args, i); // borrowed ref PyObject *a; char c = PyBytes_AS_STRING(self->signature)[i]; int typecode = typecode_from_char(c); // Convert it if it's not an array if (!PyArray_Check(o)) { if (typecode == -1) goto fail; a = PyArray_FROM_OTF(o, typecode, NPY_ARRAY_NOTSWAPPED); } else { Py_INCREF(o); a = o; } operands[i+1] = (PyArrayObject *)a; dtypes[i+1] = PyArray_DescrFromType(typecode); if (operands[0] != NULL) { // Check for the case where "out" is one of the inputs // TODO: Probably should deal with the general overlap case, // but NumPy ufuncs don't do that yet either. if (PyArray_DATA(operands[0]) == PyArray_DATA(operands[i+1])) { need_output_buffering = true; } } if (operands[i+1] == NULL || dtypes[i+1] == NULL) { goto fail; } op_flags[i+1] = NPY_ITER_READONLY| #ifdef USE_VML (ex_uses_vml ? (NPY_ITER_CONTIG|NPY_ITER_ALIGNED) : 0)| #endif #ifndef USE_UNALIGNED_ACCESS NPY_ITER_ALIGNED| #endif NPY_ITER_NBO ; } if (is_reduction) { // A reduction can not result in a string, // so we don't need to worry about item sizes here. char retsig = get_return_sig(self->program); reduction_axis = get_reduction_axis(self->program); // Need to set up op_axes for the non-reduction part if (reduction_axis != 255) { // Get the number of broadcast dimensions for (i = 0; i < n_inputs; ++i) { int ndim = PyArray_NDIM(operands[i+1]); if (ndim > oa_ndim) { oa_ndim = ndim; } } if (reduction_axis < 0 || reduction_axis >= oa_ndim) { PyErr_Format(PyExc_ValueError, "reduction axis is out of bounds"); goto fail; } // Fill in the op_axes op_axes_ptrs[0] = NULL; op_axes_reduction_values[0] = -1; for (i = 0; i < n_inputs; ++i) { int j = 0, idim, ndim = PyArray_NDIM(operands[i+1]); for (idim = 0; idim < oa_ndim-ndim; ++idim) { if (idim != reduction_axis) { op_axes_values[i+1][j++] = -1; } else { op_axes_reduction_values[i+1] = -1; } } for (idim = oa_ndim-ndim; idim < oa_ndim; ++idim) { if (idim != reduction_axis) { op_axes_values[i+1][j++] = idim-(oa_ndim-ndim); } else { npy_intp size = PyArray_DIM(operands[i+1], idim-(oa_ndim-ndim)); if (size > reduction_size) { reduction_size = size; } op_axes_reduction_values[i+1] = idim-(oa_ndim-ndim); } } op_axes_ptrs[i+1] = op_axes_values[i+1]; } // op_axes has one less than the broadcast dimensions --oa_ndim; if (oa_ndim > 0) { op_axes = op_axes_ptrs; } else { reduction_size = 1; } } // A full reduction can be done without nested iteration if (oa_ndim == 0) { full_reduction = true; if (operands[0] == NULL) { npy_intp dim = 1; operands[0] = (PyArrayObject *)PyArray_SimpleNew(0, &dim, typecode_from_char(retsig)); if (!operands[0]) goto fail; } else if (PyArray_SIZE(operands[0]) != 1) { PyErr_Format(PyExc_ValueError, "out argument must have size 1 for a full reduction"); goto fail; } } dtypes[0] = PyArray_DescrFromType(typecode_from_char(retsig)); op_flags[0] = NPY_ITER_READWRITE| NPY_ITER_ALLOCATE| // Copy, because it can't buffer the reduction NPY_ITER_UPDATEIFCOPY| NPY_ITER_NBO| #ifndef USE_UNALIGNED_ACCESS NPY_ITER_ALIGNED| #endif (oa_ndim == 0 ? 0 : NPY_ITER_NO_BROADCAST); } else { char retsig = get_return_sig(self->program); if (retsig != 's') { dtypes[0] = PyArray_DescrFromType(typecode_from_char(retsig)); } else { /* Since the *only* supported operation returning a string * is a copy, the size of returned strings * can be directly gotten from the first (and only) * input/constant/temporary. */ if (n_inputs > 0) { // input, like in 'a' where a -> 'foo' dtypes[0] = PyArray_DESCR(operands[1]); Py_INCREF(dtypes[0]); } else { // constant, like in '"foo"' dtypes[0] = PyArray_DescrNewFromType(NPY_STRING); PyDataType_SET_ELSIZE(dtypes[0], (npy_intp)self->memsizes[1]); } // no string temporaries, so no third case } if (dtypes[0] == NULL) { goto fail; } op_flags[0] = NPY_ITER_WRITEONLY| NPY_ITER_ALLOCATE| NPY_ITER_CONTIG| NPY_ITER_NBO| #ifndef USE_UNALIGNED_ACCESS NPY_ITER_ALIGNED| #endif NPY_ITER_NO_BROADCAST; } // Check for empty arrays in expression if (n_inputs > 0) { char retsig = get_return_sig(self->program); // Check length for all inputs int zeroi, zerolen = 0; for (i=0; i < n_inputs; i++) { if (PyArray_SIZE(operands[i+1]) == 0) { zerolen = 1; zeroi = i+1; break; } } if (zerolen != 0) { // Allocate the output int ndim = PyArray_NDIM(operands[zeroi]); npy_intp *dims = PyArray_DIMS(operands[zeroi]); operands[0] = (PyArrayObject *)PyArray_SimpleNew(ndim, dims, typecode_from_char(retsig)); if (operands[0] == NULL) { goto fail; } ret = (PyObject *)operands[0]; Py_INCREF(ret); goto cleanup_and_exit; } } /* A case with a single constant output */ PyArrayObject *singleton; bool writeback; // NOTE: cannot assign on declaration due to `goto` statements singleton = NULL; writeback = false; if (n_inputs == 0) { char retsig = get_return_sig(self->program); /* Allocate the output */ if (operands[0] == NULL) { npy_intp dim = 1; operands[0] = (PyArrayObject *)PyArray_SimpleNew(0, &dim, typecode_from_char(retsig)); if (operands[0] == NULL) { goto fail; } } else { // Use the provided output array if (PyArray_SIZE(operands[0]) != 1) { PyErr_SetString(PyExc_ValueError, "output for a constant expression must have size 1"); goto fail; } else if (!PyArray_ISWRITEABLE(operands[0])) { PyErr_SetString(PyExc_ValueError, "output is not writeable"); goto fail; } Py_INCREF(dtypes[0]); // NumPy folks suggested using WRITEBACKIFCOPY to resolve issue #397 singleton = (PyArrayObject *)PyArray_FromArray(operands[0], dtypes[0], NPY_ARRAY_ALIGNED|NPY_ARRAY_WRITEBACKIFCOPY); if (singleton == NULL) { goto fail; } writeback = true; Py_DECREF(operands[0]); operands[0] = singleton; } r = run_interpreter_const(self, PyArray_BYTES(operands[0]), &pc_error); if (writeback) { // Write-back our copy to the passed in output array if we had to make a copy // (which only happens if the input was not aligned) int retval = PyArray_ResolveWritebackIfCopy(singleton); if (retval < 0) { // 1 means it copied the value, 0 means no copy, only -1 is an error. PyErr_Format(PyExc_ValueError, "Writeback to singleton failed with error code: %d", retval); goto fail; } } ret = (PyObject *)operands[0]; Py_INCREF(ret); goto cleanup_and_exit; } /* Allocate the iterator or nested iterators */ if (reduction_size < 0 || full_reduction) { /* When there's no reduction, reduction_size is 1 as well */ // RAM: in issue #277 this was also the case for reductions on arrays // with axis=0 having singleton dimension, i.e. such ops were interpreted // as full_reductions when they weren't in Numpy. As such, the default // reduction_size is now -1 and we add the flag for full_reduction, // e.g. ne.evaluate("sum(a)")" iter = NpyIter_AdvancedNew(n_inputs+1, operands, NPY_ITER_BUFFERED| NPY_ITER_REDUCE_OK| NPY_ITER_RANGED| NPY_ITER_DELAY_BUFALLOC| NPY_ITER_EXTERNAL_LOOP, order, casting, op_flags, dtypes, -1, NULL, NULL, BLOCK_SIZE1); if (iter == NULL) { goto fail; } } else { npy_uint32 op_flags_outer[NPY_MAXDIMS]; /* The outer loop is unbuffered */ op_flags_outer[0] = NPY_ITER_READWRITE| NPY_ITER_ALLOCATE| NPY_ITER_NO_BROADCAST; for (i = 0; i < n_inputs; ++i) { op_flags_outer[i+1] = NPY_ITER_READONLY; } /* Arbitrary threshold for which is the inner loop...benchmark? */ if (reduction_size < 64) { reduction_outer_loop = true; iter = NpyIter_AdvancedNew(n_inputs+1, operands, NPY_ITER_BUFFERED| NPY_ITER_RANGED| NPY_ITER_DELAY_BUFALLOC| NPY_ITER_EXTERNAL_LOOP, order, casting, op_flags, dtypes, oa_ndim, op_axes, NULL, BLOCK_SIZE1); if (iter == NULL) { goto fail; } /* If the output was allocated, get it for the second iterator */ if (operands[0] == NULL) { operands[0] = NpyIter_GetOperandArray(iter)[0]; Py_INCREF(operands[0]); } op_axes[0] = &op_axes_reduction_values[0]; for (i = 0; i < n_inputs; ++i) { op_axes[i+1] = &op_axes_reduction_values[i+1]; } op_flags_outer[0] &= ~NPY_ITER_NO_BROADCAST; reduce_iter = NpyIter_AdvancedNew(n_inputs+1, operands, NPY_ITER_REDUCE_OK, order, casting, op_flags_outer, NULL, 1, op_axes, NULL, 0); if (reduce_iter == NULL) { goto fail; } } else { PyArray_Descr *dtypes_outer[NPY_MAXDIMS]; /* If the output is being allocated, need to specify its dtype */ dtypes_outer[0] = dtypes[0]; for (i = 0; i < n_inputs; ++i) { dtypes_outer[i+1] = NULL; } iter = NpyIter_AdvancedNew(n_inputs+1, operands, NPY_ITER_RANGED, order, casting, op_flags_outer, dtypes_outer, oa_ndim, op_axes, NULL, 0); if (iter == NULL) { goto fail; } /* If the output was allocated, get it for the second iterator */ if (operands[0] == NULL) { operands[0] = NpyIter_GetOperandArray(iter)[0]; Py_INCREF(operands[0]); } op_axes[0] = &op_axes_reduction_values[0]; for (i = 0; i < n_inputs; ++i) { op_axes[i+1] = &op_axes_reduction_values[i+1]; } op_flags[0] &= ~NPY_ITER_NO_BROADCAST; reduce_iter = NpyIter_AdvancedNew(n_inputs+1, operands, NPY_ITER_BUFFERED| NPY_ITER_REDUCE_OK| NPY_ITER_DELAY_BUFALLOC| NPY_ITER_EXTERNAL_LOOP, order, casting, op_flags, dtypes, 1, op_axes, NULL, BLOCK_SIZE1); if (reduce_iter == NULL) { goto fail; } } } /* Initialize the output to the reduction unit */ if (is_reduction) { PyArrayObject *a = NpyIter_GetOperandArray(iter)[0]; PyObject *fill; int op = last_opcode(self->program); if (op < OP_PROD) { /* sum identity is 0 */ fill = PyLong_FromLong(0); } else if (op >= OP_PROD && op < OP_MIN) { /* product identity is 1 */ fill = PyLong_FromLong(1); } else if (PyArray_DESCR(a)->kind == 'f') { /* floating point min/max identity is NaN */ fill = PyFloat_FromDouble(NE_NAN); } else if (op >= OP_MIN && op < OP_MAX) { /* integer min identity */ fill = PyLong_FromLong(LONG_MAX); } else { /* integer max identity */ fill = PyLong_FromLong(LONG_MIN); } PyArray_FillWithScalar(a, fill); Py_DECREF(fill); } /* Get the sizes of all the operands */ dtypes_tmp = NpyIter_GetDescrArray(iter); for (i = 0; i < n_inputs+1; ++i) { self->memsizes[i] = PyDataType_ELSIZE(dtypes_tmp[i]); } /* For small calculations, just use 1 thread */ if (NpyIter_GetIterSize(iter) < 2*BLOCK_SIZE1) { gs.force_serial = 1; } /* Reductions do not support parallel execution yet */ if (is_reduction) { gs.force_serial = 1; } r = run_interpreter(self, iter, reduce_iter, reduction_outer_loop, need_output_buffering, &pc_error); if (r < 0) { if (r == -1) { if (!PyErr_Occurred()) { PyErr_SetString(PyExc_RuntimeError, "an error occurred while running the program"); } } else if (r == -2) { PyErr_Format(PyExc_RuntimeError, "bad argument at pc=%d", pc_error); } else if (r == -3) { PyErr_Format(PyExc_RuntimeError, "bad opcode at pc=%d", pc_error); } else { PyErr_SetString(PyExc_RuntimeError, "unknown error occurred while running the program"); } goto fail; } /* Get the output from the iterator */ ret = (PyObject *)NpyIter_GetOperandArray(iter)[0]; Py_INCREF(ret); NpyIter_Deallocate(iter); if (reduce_iter != NULL) { NpyIter_Deallocate(reduce_iter); } cleanup_and_exit: for (i = 0; i < n_inputs+1; i++) { Py_XDECREF(operands[i]); Py_XDECREF(dtypes[i]); } return ret; fail: for (i = 0; i < n_inputs+1; i++) { Py_XDECREF(operands[i]); Py_XDECREF(dtypes[i]); } if (iter != NULL) { NpyIter_Deallocate(iter); } if (reduce_iter != NULL) { NpyIter_Deallocate(reduce_iter); } return NULL; } /* Local Variables: c-basic-offset: 4 End: */