#ifndef PYTHONIC_UTILS_BROADCAST_COPY_HPP
#define PYTHONIC_UTILS_BROADCAST_COPY_HPP

#include "pythonic/include/utils/broadcast_copy.hpp"

#include "pythonic/types/tuple.hpp"

#ifdef _OPENMP
#include <omp.h>

// as a macro so that an enlightened user can modify this variable :-)
#ifndef PYTHRAN_OPENMP_MIN_ITERATION_COUNT
#define PYTHRAN_OPENMP_MIN_ITERATION_COUNT 1000
#endif
#endif

PYTHONIC_NS_BEGIN

namespace utils
{

  /* helper for specialization of the broadcasting, vectorizing copy operator
   * due to expression templates, this may also trigger a lot of
   * computations!
   *
   * ``vector_form'' is set to true if the operation can be done using xsimd
   *
   * the call operator has four template parameters:
   *
   * template <class E, class F, size_t N, size_t D>
   * void operator()(E &&self, F const &other, utils::int_<N>, utils::int_<D>)
   *
   * ``E'' is the type of the object to which the data are copied
   *
   * ``F'' is the type of the object from which the data are copied
   *
   * ``N'' is the depth of the loop nest. When it reaches ``1'', we have a raw
   * loop that may be vectorizable
   *
   * ``D'' is the delta between the number of dimensions of E and F. When set
   * to a value greater than ``0'', some broadcasting is needed
   */
  template <typename vector_form, size_t N, size_t D>
  struct _broadcast_copy;

  struct fast_novectorize {
  };

  template <>
  struct _broadcast_copy<fast_novectorize, 0, 0> {
    template <class E, class F, class SelfIndices, class OtherIndices,
              size_t... Is>
    void helper(E &&self, F const &other, SelfIndices &&self_indices,
                OtherIndices &&other_indices, utils::index_sequence<Is...>)
    {
      std::forward<E>(self).store(
          (typename std::decay<E>::type::dtype)other.load(
              (long)std::get<Is>(other_indices)...),
          (long)std::get<Is>(self_indices)...);
    }

    template <class E, class F, class SelfIndices, class OtherIndices>
    void operator()(E &&self, F const &other, SelfIndices &&self_indices,
                    OtherIndices &&other_indices)
    {
      helper(std::forward<E>(self), other, self_indices, other_indices,
             utils::make_index_sequence<std::tuple_size<
                 typename std::decay<SelfIndices>::type>::value>());
    }
  };

  template <size_t N>
  struct _broadcast_copy<fast_novectorize, N, 0> {
    template <class E, class F, class SelfIndices, class OtherIndices>
    void operator()(E &&self, F const &other, SelfIndices &&self_indices,
                    OtherIndices &&other_indices)
    {
      long const other_size =
          other.template shape<std::decay<E>::type::value - N>();
      long const self_size =
          self.template shape<std::decay<E>::type::value - N>();
      if (self_size == other_size)
        for (long i = 0; i < self_size; ++i)
          _broadcast_copy<fast_novectorize, N - 1, 0>{}(
              std::forward<E>(self), other,
              std::tuple_cat(self_indices, std::make_tuple(i)),
              std::tuple_cat(other_indices, std::make_tuple(i)));
      else
        for (long i = 0; i < self_size; ++i)
          _broadcast_copy<fast_novectorize, N - 1, 0>{}(
              std::forward<E>(self), other,
              std::tuple_cat(self_indices, std::make_tuple(i)),
              std::tuple_cat(other_indices, std::make_tuple(0)));
    }
  };

  template <size_t N, size_t D>
  struct _broadcast_copy<fast_novectorize, N, D> {
    template <class E, class F, class SelfIndices, class OtherIndices>
    void operator()(E &&self, F const &other, SelfIndices &&self_indices,
                    OtherIndices &&other_indices)
    {
      using broadcaster = typename std::conditional<
          types::is_dtype<F>::value,
          types::broadcast<F, typename std::decay<E>::type::dtype>,
          types::broadcasted<F>>::type;
      _broadcast_copy<fast_novectorize, N, D - 1>{}(
          std::forward<E>(self), broadcaster(other),
          std::forward<SelfIndices>(self_indices),
          std::forward<OtherIndices>(other_indices));
    }
  };
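  // Sketch of how the fast_novectorize recursion above unfolds, on a
  // hypothetical 2-D destination of shape (n, m) receiving a 1-D source of
  // shape (m,): the <N, D> specialization wraps the source in a broadcaster,
  // the <N, 0> specialization walks each dimension and appends either the
  // running index (when the extents match) or 0 (when the source is
  // broadcast along that axis), and the <0, 0> specialization finally
  // performs, for every destination element,
  //
  //   self.store(other.load(0, j), i, j);
  //
  // which is roughly the NumPy statement ``self[:] = other``.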
  template <size_t N, class vectorizer>
  struct _broadcast_copy<vectorizer, N, 0> {
    template <class E, class F, class... Indices>
    void operator()(E &&self, F const &other, Indices... indices)
    {
      long self_size = std::distance(self.begin(), self.end()),
           other_size = std::distance(other.begin(), other.end());
#ifdef _OPENMP
      if (other_size >= PYTHRAN_OPENMP_MIN_ITERATION_COUNT) {
        auto siter = self.begin();
        auto oiter = other.begin();
#pragma omp parallel for
        for (long i = 0; i < other_size; ++i)
          *(siter + i) = *(oiter + i);
      } else
#endif
        std::copy(other.begin(), other.end(), self.begin());

      // then repeat the copied pattern to fill the rest of self, if any
#ifdef _OPENMP
      if (self_size >= PYTHRAN_OPENMP_MIN_ITERATION_COUNT * other_size)
#pragma omp parallel for
        for (long i = other_size; i < self_size; i += other_size)
          std::copy_n(self.begin(), other_size, self.begin() + i);
      else
#endif
        for (long i = other_size; i < self_size; i += other_size)
          std::copy_n(self.begin(), other_size, self.begin() + i);
    }
  };

  // ``D'' is not ``0'' so we should broadcast
  template <class vectorizer, size_t N, size_t D>
  struct _broadcast_copy {
    template <class E, class F>
    void operator()(E &&self, F const &other)
    {
      if (types::is_dtype<F>::value) {
        std::fill(self.begin(), self.end(), other);
      } else {
        auto sfirst = self.begin();
        *sfirst = other;
#ifdef _OPENMP
        auto siter = sfirst;
        long n = self.template shape<0>();
        if (n >= PYTHRAN_OPENMP_MIN_ITERATION_COUNT)
#pragma omp parallel for
          for (long i = 1; i < n; ++i)
            *(siter + i) = *sfirst;
        else
#endif
          std::fill(self.begin() + 1, self.end(), *sfirst);
      }
    }

    template <class E, class F, class ES, class FS>
    void operator()(E &&self, F const &other, ES, FS)
    {
      if (types::is_dtype<F>::value) {
        std::fill(self.begin(), self.end(), other);
      } else {
        auto sfirst = self.begin();
        *sfirst = other;
#ifdef _OPENMP
        auto siter = sfirst;
        long n = self.template shape<0>();
        if (n >= PYTHRAN_OPENMP_MIN_ITERATION_COUNT)
#pragma omp parallel for
          for (long i = 1; i < n; ++i)
            *(siter + i) = *sfirst;
        else
#endif
          std::fill(self.begin() + 1, self.end(), *sfirst);
      }
    }
  };

#ifdef USE_XSIMD
  // specialize for SIMD only if available
  // otherwise use the std::copy fallback
  template <class vectorizer, class E, class F>
  void vbroadcast_copy(E &&self, F const &other)
  {
    using T = typename F::dtype;
    using vT = xsimd::simd_type<T>;

    static const std::size_t vN = vT::size;

    long self_size = std::distance(self.begin(), self.end()),
         other_size = std::distance(other.begin(), other.end());

    auto oiter = vectorizer::vbegin(other);
    const long bound =
        std::distance(vectorizer::vbegin(other), vectorizer::vend(other));
#ifdef _OPENMP
    if (bound >= PYTHRAN_OPENMP_MIN_ITERATION_COUNT) {
      auto iter = vectorizer::vbegin(self);
#pragma omp parallel for
      for (long i = 0; i < bound; ++i) {
        (iter + i).store(*(oiter + i));
      }
    } else
#endif
      for (auto iter = vectorizer::vbegin(self), end = vectorizer::vend(self);
           iter != end; ++iter, ++oiter) {
        iter.store(*oiter);
      }
    // tail
    {
      auto siter = self.begin();
      auto oiter = other.begin();
      for (long i = bound * vN; i < other_size; ++i)
        *(siter + i) = *(oiter + i);
    }

#ifdef _OPENMP
    if (self_size >= PYTHRAN_OPENMP_MIN_ITERATION_COUNT * other_size)
#pragma omp parallel for
      for (long i = other_size; i < self_size; i += other_size)
        std::copy_n(self.begin(), other_size, self.begin() + i);
    else
#endif
      for (long i = other_size; i < self_size; i += other_size)
        std::copy_n(self.begin(), other_size, self.begin() + i);
  }
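  // vbroadcast_copy above follows the usual SIMD copy idiom: a loop over
  // whole vector batches followed by a scalar tail. A minimal stand-alone
  // sketch of that pattern on hypothetical raw pointers dst/src of length n,
  // using the same xsimd API as above, would read:
  //
  //   using vT = xsimd::simd_type<float>;
  //   constexpr long vN = vT::size;
  //   long i = 0;
  //   for (; i + vN <= n; i += vN) // whole batches
  //     xsimd::store_unaligned(dst + i, xsimd::load_unaligned(src + i));
  //   for (; i < n; ++i) // scalar tail
  //     dst[i] = src[i];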
  template <>
  struct _broadcast_copy<types::vectorizer, 1, 0> {
    template <class E, class F>
    void operator()(E &&self, F const &other)
    {
      return vbroadcast_copy<types::vectorizer>(std::forward<E>(self), other);
    }
  };

  template <>
  struct _broadcast_copy<types::vectorizer_nobroadcast, 1, 0> {
    template <class E, class F>
    void operator()(E &&self, F const &other)
    {
      return vbroadcast_copy<types::vectorizer_nobroadcast>(
          std::forward<E>(self), other);
    }
  };
#endif

  template <class E, class F, size_t N, size_t D, bool vector_form>
  struct broadcast_copy_dispatcher;

  template <class E, class F, size_t N, size_t D>
  struct broadcast_copy_dispatcher<E, F, N, D, false> {
    void operator()(E &self, F const &other)
    {
      if (utils::no_broadcast_ex(other))
        _broadcast_copy<fast_novectorize, N, D>{}(
            self, other, std::make_tuple(), std::make_tuple());
      else
        _broadcast_copy<types::novectorize, N, D>{}(self, other);
    }
  };

  template <class E, class F, size_t N, size_t D>
  struct broadcast_copy_dispatcher<E, F, N, D, true> {
    void operator()(E &self, F const &other)
    {
      if (utils::no_broadcast_ex(other))
        _broadcast_copy<fast_novectorize, N, D>{}(
            self, other, std::make_tuple(), std::make_tuple());
      else
        _broadcast_copy<types::vectorizer, N, D>{}(self, other);
    }
  };

  template <class E, class F, size_t N, size_t D, bool vector_form>
  E &broadcast_copy(E &self, F const &other)
  {
    if (self.size())
      broadcast_copy_dispatcher<E, F, N, D, vector_form>{}(self, other);
    return self;
  }
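  // Hypothetical call site for the entry point above, assuming a 2-D ndarray
  // ``a`` of shape (n, m) and a 1-D expression ``b`` of shape (m,):
  //
  //   utils::broadcast_copy<decltype(a), decltype(b), /* N */ 2, /* D */ 1,
  //                         /* vector_form */ false>(a, b);
  //
  // no_broadcast_ex(b) selects the index-based fast_novectorize path
  // (presumably when ``b`` contains no broadcast node), and the iterator
  // based _broadcast_copy otherwise.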
  /* update */

  // ``D'' is not ``0'' so we should broadcast
  template <class Op, typename vector_form, size_t N, size_t D>
  struct _broadcast_update {
    template <class E, class F>
    void operator()(E &&self, F const &other)
    {
      long n = self.template shape<0>();
      auto siter = self.begin();
#ifdef _OPENMP
      if (n >= PYTHRAN_OPENMP_MIN_ITERATION_COUNT)
#pragma omp parallel for
        for (long i = 0; i < n; ++i)
          Op{}(*(siter + i), other);
      else
#endif
        for (long i = 0; i < n; ++i)
          Op{}(*(siter + i), other);
    }
  };

  template <class Op, size_t N, class vector_form>
  struct _broadcast_update<Op, vector_form, N, 0> {
    template <class E, class F>
    void operator()(E &&self, F const &other)
    {
      long other_size = std::distance(other.begin(), other.end());
      auto siter = self.begin();
      auto oiter = other.begin();
#ifdef _OPENMP
      if (other_size >= PYTHRAN_OPENMP_MIN_ITERATION_COUNT)
#pragma omp parallel for
        for (long i = 0; i < other_size; ++i)
          Op{}(*(siter + i), *(oiter + i));
      else
#endif
        if (other_size == 1) {
          auto value = *oiter;
          for (auto send = self.end(); siter != send; ++siter)
            Op{}(*siter, value);
        } else
          for (auto send = self.end(); siter != send;) {
            auto ooiter = oiter;
            for (long i = 0; i < other_size; ++i, ++siter, ++ooiter)
              Op{}(*siter, *ooiter);
          }
    }

    template <class E, class F0, class F1>
    void operator()(E &&self, types::broadcast<F0, F1> const &other)
    {
      auto value = *other.begin();
      for (auto siter = self.begin(), send = self.end(); siter != send;
           ++siter)
        Op{}(*siter, value);
    }

    template <class E, class F>
    void operator()(E &&self, types::broadcasted<F> const &other)
    {
      auto value = *other.end();
      for (auto siter = self.begin(), send = self.end(); siter != send;
           ++siter)
        Op{}(*siter, value);
    }
  };

  template <class Op>
  struct _broadcast_update<Op, fast_novectorize, 0, 0> {
    template <class E, class F, class SelfIndices, class OtherIndices,
              size_t... Is>
    void helper(E &&self, F const &other, SelfIndices &&self_indices,
                OtherIndices &&other_indices, utils::index_sequence<Is...>)
    {
      self.template update<Op>(
          other.load((long)std::get<Is>(other_indices)...),
          (long)std::get<Is>(self_indices)...);
    }

    template <class E, class F, class SelfIndices, class OtherIndices>
    void operator()(E &&self, F const &other, SelfIndices &&self_indices,
                    OtherIndices &&other_indices)
    {
      helper(std::forward<E>(self), other, self_indices, other_indices,
             utils::make_index_sequence<std::tuple_size<
                 typename std::decay<SelfIndices>::type>::value>());
    }
  };

  template <class Op, size_t N>
  struct _broadcast_update<Op, fast_novectorize, N, 0> {
    template <class E, class F, class SelfIndices, class OtherIndices>
    void operator()(E &&self, F const &other, SelfIndices &&self_indices,
                    OtherIndices &&other_indices)
    {
      auto const other_size =
          other.template shape<std::decay<E>::type::value - N>();
      auto const self_size =
          self.template shape<std::decay<E>::type::value - N>();
      if (self_size == other_size)
        for (long i = 0; i < self_size; ++i)
          _broadcast_update<Op, fast_novectorize, N - 1, 0>{}(
              std::forward<E>(self), other,
              std::tuple_cat(self_indices, std::make_tuple(i)),
              std::tuple_cat(other_indices, std::make_tuple(i)));
      else
        for (long i = 0; i < self_size; ++i)
          _broadcast_update<Op, fast_novectorize, N - 1, 0>{}(
              std::forward<E>(self), other,
              std::tuple_cat(self_indices, std::make_tuple(i)),
              std::tuple_cat(other_indices, std::make_tuple(0)));
    }
  };

  template <class Op, size_t N, size_t D>
  struct _broadcast_update<Op, fast_novectorize, N, D> {
    template <class E, class F, class SelfIndices, class OtherIndices>
    void operator()(E &&self, F const &other, SelfIndices &&self_indices,
                    OtherIndices &&other_indices)
    {
      using broadcaster = typename std::conditional<
          types::is_dtype<F>::value,
          types::broadcast<F, typename std::decay<E>::type::dtype>,
          types::broadcasted<F>>::type;
      _broadcast_update<Op, fast_novectorize, N, D - 1>{}(
          std::forward<E>(self), broadcaster(other),
          std::forward<SelfIndices>(self_indices),
          std::forward<OtherIndices>(other_indices));
    }
  };

#ifdef USE_XSIMD
  // specialize for SIMD only if available
  // otherwise use the std::copy fallback
  template <class Op, class vectorizer, class E, class F>
  void vbroadcast_update(E &&self, F const &other)
  {
    using T = typename F::dtype;
    using vT = typename xsimd::simd_type<T>;

    long other_size = std::distance(other.begin(), other.end());
    static const std::size_t vN = vT::size;

    auto oiter = vectorizer::vbegin(other);
    auto iter = vectorizer::vbegin(self);
    const long bound =
        std::distance(vectorizer::vbegin(other), vectorizer::vend(other));
#ifdef _OPENMP
    if (bound >= PYTHRAN_OPENMP_MIN_ITERATION_COUNT)
#pragma omp parallel for
      for (long i = 0; i < bound; i++) {
        (iter + i).store(Op{}(*(iter + i), *(oiter + i)));
      }
    else
#endif
      for (auto end = vectorizer::vend(self); iter != end; ++iter, ++oiter) {
        iter.store(Op{}(*iter, *oiter));
      }
    // tail
    {
      auto siter = self.begin();
      auto oiter = other.begin();
      for (long i = bound * vN; i < other_size; ++i)
        Op{}(*(siter + i), *(oiter + i));
    }
  }

  template <class Op, class vectorizer, class E, class F0, class F1>
  void vbroadcast_update(E &&self, types::broadcast<F0, F1> const &other)
  {
    auto value = *other.begin();
    for (auto siter = self.begin(), send = self.end(); siter != send; ++siter)
      Op{}(*siter, value);
  }

  template <class Op, class vectorizer, class E, class F>
  void vbroadcast_update(E &&self, types::broadcasted<F> const &other)
  {
    auto value = *other.end();
    for (auto siter = self.begin(), send = self.end(); siter != send; ++siter)
      Op{}(*siter, value);
  }
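  // Hypothetical ``Op'' functor for the update path: an in-place add such as
  //
  //   struct iadd {
  //     template <class T0, class T1>
  //     auto operator()(T0 &&lhs, T1 &&rhs) const -> decltype(lhs += rhs)
  //     {
  //       return lhs += rhs;
  //     }
  //   };
  //
  // so that broadcast_update<iadd, ...>(a, b) mirrors the NumPy statement
  // ``a += b`` with the same broadcasting rules as broadcast_copy; returning
  // the result keeps it usable by the SIMD path above, which stores Op's
  // return value.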
  template <class Op>
  struct _broadcast_update<Op, types::vectorizer, 1, 0> {
    template <class... Args>
    void operator()(Args &&... args)
    {
      vbroadcast_update<Op, types::vectorizer>(std::forward<Args>(args)...);
    }
  };

  template <class Op>
  struct _broadcast_update<Op, types::vectorizer_nobroadcast, 1, 0> {
    template <class... Args>
    void operator()(Args &&... args)
    {
      vbroadcast_update<Op, types::vectorizer_nobroadcast>(
          std::forward<Args>(args)...);
    }
  };
#endif

  template <class Op, bool vector_form, class E, class F, size_t N, size_t D>
  struct broadcast_update_dispatcher;

  template <class Op, class E, class F, size_t N, size_t D>
  struct broadcast_update_dispatcher<Op, false, E, F, N, D> {
    void operator()(E &self, F const &other)
    {
      if (utils::no_broadcast_ex(other))
        _broadcast_update<Op, fast_novectorize, N, D>{}(
            self, other, std::make_tuple(), std::make_tuple());
      else
        _broadcast_update<Op, types::novectorize, N, D>{}(self, other);
    }
  };

  template <class Op, class E, class F, size_t N, size_t D>
  struct broadcast_update_dispatcher<Op, true, E, F, N, D> {
    void operator()(E &self, F const &other)
    {
      if (utils::no_broadcast_ex(other))
        _broadcast_update<Op, fast_novectorize, N, D>{}(
            self, other, std::make_tuple(), std::make_tuple());
      else
        _broadcast_update<Op, types::vectorizer, N, D>{}(self, other);
    }
  };

  template <class Op, class E, class F, size_t N, size_t D, bool vector_form>
  E &broadcast_update(E &self, F const &other)
  {
    if (self.size())
      broadcast_update_dispatcher<Op, vector_form, E, F, N, D>{}(self, other);
    return self;
  }
} // namespace utils
PYTHONIC_NS_END

#endif