// Directory    : /backups/router/usr/local/lib/python3.11/site-packages/numexpr/
// Current file : module.cpp
// Numexpr - Fast numerical array expression evaluator for NumPy. // // License: MIT // Author: See AUTHORS.txt // // See LICENSE.txt for details about copyright and rights to use. // // module.cpp contains the CPython-specific module exposure. #define DO_NUMPY_IMPORT_ARRAY #include "module.hpp" #include <structmember.h> #include <vector> #include <signal.h> #include "interpreter.hpp" #include "numexpr_object.hpp" using namespace std; // Global state. The file interpreter.hpp also has some global state // in its 'th_params' variable global_state gs; long global_max_threads=DEFAULT_MAX_THREADS; /* Do the worker job for a certain thread */ void *th_worker(void *tidptr) { int tid = *(int *)tidptr; /* Parameters for threads */ npy_intp start; npy_intp vlen; npy_intp block_size; NpyIter *iter; vm_params params; int *pc_error; int ret; int n_inputs; int n_constants; int n_temps; size_t memsize; char **mem; npy_intp *memsteps; npy_intp istart, iend; char **errmsg; // For output buffering if needed vector<char> out_buffer; while (1) { /* Sentinels have to be initialised yet */ gs.init_sentinels_done = 0; /* Meeting point for all threads (wait for initialization) */ pthread_mutex_lock(&gs.count_threads_mutex); if (gs.count_threads < gs.nthreads) { gs.count_threads++; /* Beware of spurious wakeups. See issue pydata/numexpr#306. 
*/ do { pthread_cond_wait(&gs.count_threads_cv, &gs.count_threads_mutex); } while (!gs.barrier_passed); } else { gs.barrier_passed = 1; pthread_cond_broadcast(&gs.count_threads_cv); } pthread_mutex_unlock(&gs.count_threads_mutex); /* Check if thread has been asked to return */ if (gs.end_threads) { return(0); } /* Get parameters for this thread before entering the main loop */ start = th_params.start; vlen = th_params.vlen; block_size = th_params.block_size; params = th_params.params; pc_error = th_params.pc_error; // If output buffering is needed, allocate it if (th_params.need_output_buffering) { out_buffer.resize(params.memsizes[0] * BLOCK_SIZE1); params.out_buffer = &out_buffer[0]; } else { params.out_buffer = NULL; } /* Populate private data for each thread */ n_inputs = params.n_inputs; n_constants = params.n_constants; n_temps = params.n_temps; memsize = (1+n_inputs+n_constants+n_temps) * sizeof(char *); /* XXX malloc seems thread safe for POSIX, but for Win? */ mem = (char **)malloc(memsize); memcpy(mem, params.mem, memsize); errmsg = th_params.errmsg; params.mem = mem; /* Loop over blocks */ pthread_mutex_lock(&gs.count_mutex); if (!gs.init_sentinels_done) { /* Set sentinels and other global variables */ gs.gindex = start; istart = gs.gindex; iend = istart + block_size; if (iend > vlen) { iend = vlen; } gs.init_sentinels_done = 1; /* sentinels have been initialised */ gs.giveup = 0; /* no giveup initially */ } else { gs.gindex += block_size; istart = gs.gindex; iend = istart + block_size; if (iend > vlen) { iend = vlen; } } /* Grab one of the iterators */ iter = th_params.iter[tid]; if (iter == NULL) { th_params.ret_code = -1; gs.giveup = 1; } memsteps = th_params.memsteps[tid]; /* Get temporary space for each thread */ ret = get_temps_space(params, mem, BLOCK_SIZE1); if (ret < 0) { /* Propagate error to main thread */ th_params.ret_code = ret; gs.giveup = 1; } pthread_mutex_unlock(&gs.count_mutex); while (istart < vlen && !gs.giveup) { /* Reset the 
iterator to the range for this task */ ret = NpyIter_ResetToIterIndexRange(iter, istart, iend, errmsg); /* Execute the task */ if (ret >= 0) { ret = vm_engine_iter_task(iter, memsteps, params, pc_error, errmsg); } if (ret < 0) { pthread_mutex_lock(&gs.count_mutex); gs.giveup = 1; /* Propagate error to main thread */ th_params.ret_code = ret; pthread_mutex_unlock(&gs.count_mutex); break; } pthread_mutex_lock(&gs.count_mutex); gs.gindex += block_size; istart = gs.gindex; iend = istart + block_size; if (iend > vlen) { iend = vlen; } pthread_mutex_unlock(&gs.count_mutex); } /* Meeting point for all threads (wait for finalization) */ pthread_mutex_lock(&gs.count_threads_mutex); if (gs.count_threads > 0) { gs.count_threads--; do { pthread_cond_wait(&gs.count_threads_cv, &gs.count_threads_mutex); } while (gs.barrier_passed); } else { gs.barrier_passed = 0; pthread_cond_broadcast(&gs.count_threads_cv); } pthread_mutex_unlock(&gs.count_threads_mutex); /* Release resources */ free_temps_space(params, mem); free(mem); } /* closes while(1) */ /* This should never be reached, but anyway */ return(0); } /* Initialize threads */ int init_threads(void) { int tid, rc; if ( !(gs.nthreads > 1 && (!gs.init_threads_done || gs.pid != getpid())) ) { /* Thread pool must always be initialized once and once only. 
*/ return(0); } /* Initialize mutex and condition variable objects */ pthread_mutex_init(&gs.count_mutex, NULL); pthread_mutex_init(&gs.parallel_mutex, NULL); /* Barrier initialization */ pthread_mutex_init(&gs.count_threads_mutex, NULL); pthread_cond_init(&gs.count_threads_cv, NULL); gs.count_threads = 0; /* Reset threads counter */ gs.barrier_passed = 0; /* * Our worker threads should not deal with signals from the rest of the * application - mask everything temporarily in this thread, so our workers * can inherit that mask */ sigset_t sigset_block_all, sigset_restore; rc = sigfillset(&sigset_block_all); if (rc != 0) { fprintf(stderr, "ERROR; failed to block signals: sigfillset: %s", strerror(rc)); exit(-1); } rc = pthread_sigmask( SIG_BLOCK, &sigset_block_all, &sigset_restore); if (rc != 0) { fprintf(stderr, "ERROR; failed to block signals: pthread_sigmask: %s", strerror(rc)); exit(-1); } /* Now create the threads */ for (tid = 0; tid < gs.nthreads; tid++) { gs.tids[tid] = tid; rc = pthread_create(&gs.threads[tid], NULL, th_worker, (void *)&gs.tids[tid]); if (rc) { fprintf(stderr, "ERROR; return code from pthread_create() is %d\n", rc); fprintf(stderr, "\tError detail: %s\n", strerror(rc)); exit(-1); } } /* * Restore the signal mask so the main thread can process signals as * expected */ rc = pthread_sigmask( SIG_SETMASK, &sigset_restore, NULL); if (rc != 0) { fprintf(stderr, "ERROR: failed to restore signal mask: pthread_sigmask: %s", strerror(rc)); exit(-1); } gs.init_threads_done = 1; /* Initialization done! */ gs.pid = (int)getpid(); /* save the PID for this process */ return(0); } /* Set the number of threads in numexpr's VM */ int numexpr_set_nthreads(int nthreads_new) { int nthreads_old = gs.nthreads; int t, rc; void *status; // if (nthreads_new > MAX_THREADS) { // fprintf(stderr, // "Error. nthreads cannot be larger than MAX_THREADS (%d)", // MAX_THREADS); // return -1; // } if (nthreads_new > global_max_threads) { fprintf(stderr, "Error. 
nthreads cannot be larger than environment variable \"NUMEXPR_MAX_THREADS\" (%ld)", global_max_threads); return -1; } else if (nthreads_new <= 0) { fprintf(stderr, "Error. nthreads must be a positive integer"); return -1; } /* Only join threads if they are not initialized or if our PID is different from that in pid var (probably means that we are a subprocess, and thus threads are non-existent). */ if (gs.nthreads > 1 && gs.init_threads_done && gs.pid == getpid()) { /* Tell all existing threads to finish */ gs.end_threads = 1; pthread_mutex_lock(&gs.count_threads_mutex); if (gs.count_threads < gs.nthreads) { gs.count_threads++; do { pthread_cond_wait(&gs.count_threads_cv, &gs.count_threads_mutex); } while (!gs.barrier_passed); } else { gs.barrier_passed = 1; pthread_cond_broadcast(&gs.count_threads_cv); } pthread_mutex_unlock(&gs.count_threads_mutex); /* Join exiting threads */ for (t=0; t<gs.nthreads; t++) { rc = pthread_join(gs.threads[t], &status); if (rc) { fprintf(stderr, "ERROR; return code from pthread_join() is %d\n", rc); fprintf(stderr, "\tError detail: %s\n", strerror(rc)); exit(-1); } } gs.init_threads_done = 0; gs.end_threads = 0; } /* Launch a new pool of threads (if necessary) */ gs.nthreads = nthreads_new; init_threads(); return nthreads_old; } #ifdef USE_VML static PyObject * _get_vml_version(PyObject *self, PyObject *args) { int len=198; char buf[198]; mkl_get_version_string(buf, len); return Py_BuildValue("s", buf); } static PyObject * _set_vml_accuracy_mode(PyObject *self, PyObject *args) { int mode_in, mode_old; if (!PyArg_ParseTuple(args, "i", &mode_in)) return NULL; mode_old = vmlGetMode() & VML_ACCURACY_MASK; vmlSetMode((mode_in & VML_ACCURACY_MASK) | VML_ERRMODE_IGNORE ); return Py_BuildValue("i", mode_old); } static PyObject * _set_vml_num_threads(PyObject *self, PyObject *args) { int max_num_threads; if (!PyArg_ParseTuple(args, "i", &max_num_threads)) return NULL; mkl_domain_set_num_threads(max_num_threads, MKL_DOMAIN_VML); 
Py_RETURN_NONE; } static PyObject * _get_vml_num_threads(PyObject *self, PyObject *args) { int max_num_threads = mkl_domain_get_max_threads (MKL_DOMAIN_VML); return Py_BuildValue("i", max_num_threads); } #endif static PyObject* Py_set_num_threads(PyObject *self, PyObject *args) { int num_threads, nthreads_old; if (!PyArg_ParseTuple(args, "i", &num_threads)) return NULL; nthreads_old = numexpr_set_nthreads(num_threads); return Py_BuildValue("i", nthreads_old); } static PyObject* Py_get_num_threads(PyObject *self, PyObject *args) { int n_thread; n_thread = gs.nthreads; return Py_BuildValue("i", n_thread); } static PyMethodDef module_methods[] = { #ifdef USE_VML {"_get_vml_version", _get_vml_version, METH_VARARGS, "Get the VML/MKL library version."}, {"_set_vml_accuracy_mode", _set_vml_accuracy_mode, METH_VARARGS, "Set accuracy mode for VML functions."}, {"_set_vml_num_threads", _set_vml_num_threads, METH_VARARGS, "Suggests a maximum number of threads to be used in VML operations."}, {"_get_vml_num_threads", _get_vml_num_threads, METH_VARARGS, "Gets the maximum number of threads to be used in VML operations."}, #endif {"_set_num_threads", Py_set_num_threads, METH_VARARGS, "Suggests a maximum number of threads to be used in operations."}, {"_get_num_threads", Py_get_num_threads, METH_VARARGS, "Gets the maximum number of threads currently in use for operations."}, {NULL} }; static int add_symbol(PyObject *d, const char *sname, int name, const char* routine_name) { PyObject *o, *s; int r; if (!sname) { return 0; } o = PyLong_FromLong(name); s = PyBytes_FromString(sname); if (!o || !s) { PyErr_SetString(PyExc_RuntimeError, routine_name); r = -1; } else { r = PyDict_SetItem(d, s, o); } Py_XDECREF(o); Py_XDECREF(s); return r; } #ifdef __cplusplus extern "C" { #endif /* XXX: handle the "global_state" state via moduledef */ static struct PyModuleDef moduledef = { PyModuleDef_HEAD_INIT, "interpreter", NULL, -1, /* sizeof(struct global_state), */ module_methods, NULL, NULL, /* 
module_traverse, */ NULL, /* module_clear, */ NULL }; #define INITERROR return NULL PyObject * PyInit_interpreter(void) { PyObject *m, *d; char *max_thread_str = getenv("NUMEXPR_MAX_THREADS"); char *end; if (max_thread_str != NULL) { global_max_threads = strtol(max_thread_str, &end, 10); } th_params.memsteps = (npy_intp**)calloc(sizeof(npy_intp*), global_max_threads); th_params.iter = (NpyIter**)calloc(sizeof(NpyIter*), global_max_threads); th_params.reduce_iter = (NpyIter**)calloc(sizeof(NpyIter*), global_max_threads); gs.threads = (pthread_t*)calloc(sizeof(pthread_t), global_max_threads); gs.tids = (int*)calloc(sizeof(int), global_max_threads); // TODO: for Py3, deallocate: https://docs.python.org/3/c-api/module.html#c.PyModuleDef.m_free // For Python 2.7, people have to exit the process to reclaim the memory. if (PyType_Ready(&NumExprType) < 0) INITERROR; m = PyModule_Create(&moduledef); if (m == NULL) INITERROR; Py_INCREF(&NumExprType); PyModule_AddObject(m, "NumExpr", (PyObject *)&NumExprType); import_array(); d = PyDict_New(); if (!d) INITERROR; #define OPCODE(n, name, sname, ...) \ if (add_symbol(d, sname, name, "add_op") < 0) { INITERROR; } #include "opcodes.hpp" #undef OPCODE if (PyModule_AddObject(m, "opcodes", d) < 0) INITERROR; d = PyDict_New(); if (!d) INITERROR; #define add_func(name, sname) \ if (add_symbol(d, sname, name, "add_func") < 0) { INITERROR; } #define FUNC_FF(name, sname, ...) add_func(name, sname); #define FUNC_FFF(name, sname, ...) add_func(name, sname); #define FUNC_DD(name, sname, ...) add_func(name, sname); #define FUNC_DDD(name, sname, ...) add_func(name, sname); #define FUNC_CC(name, sname, ...) add_func(name, sname); #define FUNC_CCC(name, sname, ...) 
add_func(name, sname); #include "functions.hpp" #undef FUNC_CCC #undef FUNC_CC #undef FUNC_DDD #undef FUNC_DD #undef FUNC_DD #undef FUNC_FFF #undef FUNC_FF #undef add_func if (PyModule_AddObject(m, "funccodes", d) < 0) INITERROR; if (PyModule_AddObject(m, "allaxes", PyLong_FromLong(255)) < 0) INITERROR; if (PyModule_AddObject(m, "maxdims", PyLong_FromLong(NPY_MAXDIMS)) < 0) INITERROR; if(PyModule_AddIntConstant(m, "MAX_THREADS", global_max_threads) < 0) INITERROR; // Let's export the block sizes to Python side for benchmarking comparisons if(PyModule_AddIntConstant(m, "__BLOCK_SIZE1__", BLOCK_SIZE1) < 0) INITERROR; // Export if we are using VML or not #ifdef USE_VML if(PyModule_AddObject(m, "use_vml", Py_True) < 0) INITERROR; #else if(PyModule_AddObject(m, "use_vml", Py_False) < 0) INITERROR; #endif return m; } #ifdef __cplusplus } // extern "C" #endif